From: Marcelo Tosatti on
On Fri, Jun 25, 2010 at 08:07:06PM +0800, Xiao Guangrong wrote:
> Support prefetching ptes when intercepting a guest #PF, to avoid #PFs on
> later accesses
>
> If we hit any failure in the prefetch path, we exit it and do not try the
> remaining ptes, to avoid turning it into a heavy path
>
> Note: this speculative path marks pages dirty even though they are not
> really accessed; the same issue exists in other speculative paths such as
> invlpg and pte write. Fortunately, it only affects host memory management.
> After Avi's patchset "[PATCH v2 1/4] KVM: MMU: Introduce drop_spte()" is
> merged, this will be easy to fix. We will do it in the future.
>
> Signed-off-by: Xiao Guangrong <xiaoguangrong(a)cn.fujitsu.com>
> ---
> arch/x86/kvm/mmu.c | 69 +++++++++++++++++++++++++++++++++++++++++
> arch/x86/kvm/paging_tmpl.h | 74 ++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 143 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 6c06666..b2ad723 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -89,6 +89,8 @@ module_param(oos_shadow, bool, 0644);
> }
> #endif
>
> +#define PTE_PREFETCH_NUM 16
> +
> #define PT_FIRST_AVAIL_BITS_SHIFT 9
> #define PT64_SECOND_AVAIL_BITS_SHIFT 52
>
> @@ -1998,6 +2000,72 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
> {
> }
>
> +static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
> + struct kvm_mmu_page *sp,
> + u64 *start, u64 *end)
> +{
> + gfn_t gfn;
> + struct page *pages[PTE_PREFETCH_NUM];
> +
> + if (pte_prefetch_topup_memory_cache(vcpu, end - start))
> + return -1;
> +
> + gfn = sp->gfn + start - sp->spt;
> + while (start < end) {
> + unsigned long addr;
> + int entry, j, ret;
> +
> + addr = gfn_to_hva_many(vcpu->kvm, gfn, &entry);
> + if (kvm_is_error_hva(addr))
> + return -1;
> +
> + entry = min(entry, (int)(end - start));
> + ret = __get_user_pages_fast(addr, entry, 1, pages);
> + if (ret <= 0)
> + return -1;
> +
> + for (j = 0; j < ret; j++, gfn++, start++)
> + mmu_set_spte(vcpu, start, ACC_ALL,
> + sp->role.access, 0, 0, 1, NULL,
> + sp->role.level, gfn,
> + page_to_pfn(pages[j]), true, false);
> +
> + if (ret < entry)
> + return -1;
> + }
> + return 0;
> +}
> +
> +static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
> +{
> + struct kvm_mmu_page *sp;
> + u64 *start = NULL;
> + int index, i, max;
> +
> + sp = page_header(__pa(sptep));
> + WARN_ON(!sp->role.direct);
> +
> + if (sp->role.level > PT_PAGE_TABLE_LEVEL)
> + return;
> +
> + index = sptep - sp->spt;
> + i = index & ~(PTE_PREFETCH_NUM - 1);
> + max = index | (PTE_PREFETCH_NUM - 1);
> +
> + for (; i < max; i++) {
> + u64 *spte = sp->spt + i;
> +
> + if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
> + if (!start)
> + continue;
> + if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
> + break;
> + start = NULL;
> + } else if (!start)
> + start = spte;
> + }
> +}
> +
> static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
> int level, gfn_t gfn, pfn_t pfn)
> {
> @@ -2012,6 +2080,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
> 0, write, 1, &pt_write,
> level, gfn, pfn, false, true);
> ++vcpu->stat.pf_fixed;
> + direct_pte_prefetch(vcpu, iterator.sptep);
> break;
> }
>
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index fdba751..134f031 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -291,6 +291,79 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> gpte_to_gfn(gpte), pfn, true, true);
> }
>
> +static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep)
> +{
> + struct kvm_mmu_page *sp;
> + pt_element_t gptep[PTE_PREFETCH_NUM];
> + gpa_t first_pte_gpa;
> + int offset = 0, index, i, j, max;
> +
> + sp = page_header(__pa(sptep));
> + index = sptep - sp->spt;
> +
> + if (sp->role.level > PT_PAGE_TABLE_LEVEL)
> + return;
> +
> + if (sp->role.direct)
> + return direct_pte_prefetch(vcpu, sptep);

Can never happen.

> +
> + index = sptep - sp->spt;
> + i = index & ~(PTE_PREFETCH_NUM - 1);
> + max = index | (PTE_PREFETCH_NUM - 1);
> +
> + if (PTTYPE == 32)
> + offset = sp->role.quadrant << PT64_LEVEL_BITS;
> +
> + first_pte_gpa = gfn_to_gpa(sp->gfn) +
> + (offset + i) * sizeof(pt_element_t);
> +
> + if (kvm_read_guest_atomic(vcpu->kvm, first_pte_gpa, gptep,
> + sizeof(gptep)) < 0)
> + return;
> +
> + for (j = 0; i < max; i++, j++) {
> + pt_element_t gpte;
> + unsigned pte_access;
> + u64 *spte = sp->spt + i;
> + gfn_t gfn;
> + pfn_t pfn;
> +
> + if (spte == sptep)
> + continue;
> +
> + if (*spte != shadow_trap_nonpresent_pte)
> + continue;
> +
> + gpte = gptep[j];
> +
> + if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL))
> + break;
> +
> + if (!(gpte & PT_ACCESSED_MASK))
> + continue;
> +
> + if (!is_present_gpte(gpte)) {
> + if (!sp->unsync)
> + __set_spte(spte, shadow_notrap_nonpresent_pte);
> + continue;
> + }
> +
> + gfn = gpte_to_gfn(gpte);
> +
> + pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
> + if (is_error_pfn(pfn) ||
> + pte_prefetch_topup_memory_cache(vcpu, 1)) {
> + kvm_release_pfn_clean(pfn);
> + break;
> + }
> +
> + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
> + mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
> + is_dirty_gpte(gpte), NULL, sp->role.level, gfn,
> + pfn, true, false);
> + }
> +}
> +
> /*
> * Fetch a shadow pte for a specific level in the paging hierarchy.
> */
> @@ -322,6 +395,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
> user_fault, write_fault,
> dirty, ptwrite, level,
> gw->gfn, pfn, false, true);
> + FNAME(pte_prefetch)(vcpu, sptep);
> break;
> }


I'm afraid this can introduce regressions since it increases mmu_lock
contention. Can you get some numbers with a 4-vcpu or 8-vcpu guest and
many-thread benchmarks, such as kernbench and apachebench? (on
non-EPT).

Also prefetch should be disabled for EPT, due to lack of accessed bit.
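
Something along these lines should do (untested sketch; it assumes
shadow_accessed_mask is zero whenever EPT is in use, which is how the rest
of the mmu code detects the missing accessed bit):

static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
{
	struct kvm_mmu_page *sp;

	/*
	 * Without an accessed bit (EPT) a prefetched spte cannot be told
	 * apart from one the guest actually touched, so do not prefetch.
	 */
	if (!shadow_accessed_mask)
		return;

	sp = page_header(__pa(sptep));
	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
		return;

	/* ... prefetch the surrounding ptes as in the patch above ... */
}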
From: Xiao Guangrong on


Marcelo Tosatti wrote:

>> +
>> + if (sp->role.level > PT_PAGE_TABLE_LEVEL)
>> + return;
>> +
>> + if (sp->role.direct)
>> + return direct_pte_prefetch(vcpu, sptep);
>
> Can never happen.
>

Marcelo,

Thanks for your comment. Do you mean that we can never see sp->role.direct here?
Could you please tell me why? During my tests, it can be triggered.


>> @@ -322,6 +395,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
>> user_fault, write_fault,
>> dirty, ptwrite, level,
>> gw->gfn, pfn, false, true);
>> + FNAME(pte_prefetch)(vcpu, sptep);
>> break;
>> }
>
>
> I'm afraid this can introduce regressions since it increases mmu_lock
> contention. Can you get some numbers with a 4-vcpu or 8-vcpu guest and
> many-thread benchmarks, such as kernbench and apachebench? (on
> non-EPT).
>

The pte prefetch is a fast path that takes very little time: in the worst
case we only need to read 128 bytes of guest ptes (PTE_PREFETCH_NUM = 16
entries of 8 bytes each). If the prefetch succeeds, the #PF that a later
access would cause is avoided, so we skip exiting from the guest, walking
the guest ptes, walking the shadow pages, flushing the local tlb... a lot
of work can be saved.

Before I first posted this patchset, I ran a performance test using
unixbench; it improved ~3.6% in the EPT-disabled case.
(it's in the first version's changelog)

Today I ran the kernbench test with 4 vcpus and 1G of memory, and the
result shows a ~1.6% improvement :-)

> Also prefetch should be disabled for EPT, due to lack of accessed bit.
>

But we call mmu_set_spte() with speculative == false, so it does not touch the accessed bit.
From: Marcelo Tosatti on
On Tue, Jun 29, 2010 at 04:07:40PM +0800, Xiao Guangrong wrote:
>
>
> Marcelo Tosatti wrote:
>
> >> +
> >> + if (sp->role.level > PT_PAGE_TABLE_LEVEL)
> >> + return;
> >> +
> >> + if (sp->role.direct)
> >> + return direct_pte_prefetch(vcpu, sptep);
> >
> > Can never happen.
> >
>
> Marcelo,
>
> Thanks for your comment. Do you mean that we can never see sp->role.direct here?
> Could you please tell me why? During my tests, it can be triggered.

Ah, for 1->1 emulation, right.

> >> @@ -322,6 +395,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
> >> user_fault, write_fault,
> >> dirty, ptwrite, level,
> >> gw->gfn, pfn, false, true);
> >> + FNAME(pte_prefetch)(vcpu, sptep);
> >> break;
> >> }
> >
> >
> > I'm afraid this can introduce regressions since it increases mmu_lock
> > contention. Can you get some numbers with a 4-vcpu or 8-vcpu guest and
> > many-thread benchmarks, such as kernbench and apachebench? (on
> > non-EPT).
> >
>
> The pte prefetch is a fast path that takes very little time: in the worst
> case we only need to read 128 bytes of guest ptes (PTE_PREFETCH_NUM = 16
> entries of 8 bytes each). If the prefetch succeeds, the #PF that a later
> access would cause is avoided, so we skip exiting from the guest, walking
> the guest ptes, walking the shadow pages, flushing the local tlb... a lot
> of work can be saved.
>
> Before I first posted this patchset, I ran a performance test using
> unixbench; it improved ~3.6% in the EPT-disabled case.
> (it's in the first version's changelog)
>
> Today I ran the kernbench test with 4 vcpus and 1G of memory, and the
> result shows a ~1.6% improvement :-)

OK, nice.

> > Also prefetch should be disabled for EPT, due to lack of accessed bit.
> >
>
> But we call mmu_set_spte() with speculative == false, so it does not touch the accessed bit.

There is no accessed bit on EPT. So the aging code (kvm_age_rmapp)
considers any present translation as accessed. There is no way to
distinguish between actually accessed translations and prefetched (but
unused) ones.
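
Roughly, the per-spte "young" test has to degrade to this (simplified
sketch, not the actual kvm_age_rmapp() rmap walk):

	/*
	 * With a real accessed bit we can test it; under EPT
	 * shadow_accessed_mask is zero, so any present spte must be
	 * reported as young -- including ones that were only prefetched.
	 */
	if (!shadow_accessed_mask)
		young = is_shadow_present_pte(*spte);
	else
		young = !!(*spte & shadow_accessed_mask);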
From: Xiao Guangrong on


Marcelo Tosatti wrote:

>>> Also prefetch should be disabled for EPT, due to lack of accessed bit.
>>>
>> But we call mmu_set_spte() with speculative == false, so it does not touch the accessed bit.
>
> There is no accessed bit on EPT. So the aging code (kvm_age_rmapp)
> considers any present translation as accessed. There is no way to
> distinguish between actually accessed translations and prefetched (but
> unused) ones.
>

You are right, I'll disable the prefetch for EPT in the next version. Thanks for
pointing it out.
From: Marcelo Tosatti on
On Wed, Jun 30, 2010 at 04:08:05PM +0800, Xiao Guangrong wrote:
> Support prefetching ptes when intercepting a guest #PF, to avoid #PFs on
> later accesses
>
> If we hit any failure in the prefetch path, we exit it and do not try the
> remaining ptes, to avoid turning it into a heavy path
>
> Note: this speculative path marks pages dirty even though they are not
> really accessed; the same issue exists in other speculative paths such as
> invlpg and pte write. Fortunately, it only affects host memory management.
> After Avi's patchset "[PATCH v2 1/4] KVM: MMU: Introduce drop_spte()" is
> merged, this will be easy to fix. We will do it in the future.
>
> Signed-off-by: Xiao Guangrong <xiaoguangrong(a)cn.fujitsu.com>
> ---
> arch/x86/kvm/mmu.c | 83 ++++++++++++++++++++++++++++++++++++++++++++
> arch/x86/kvm/paging_tmpl.h | 76 ++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 159 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 6673484..fadfafe 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -2002,6 +2002,88 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
> {
> }
>
> +static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
> + struct kvm_mmu_page *sp,
> + u64 *start, u64 *end)
> +{
> + gfn_t gfn;
> + struct page *pages[PTE_PREFETCH_NUM];
> +
> + gfn = sp->gfn + start - sp->spt;
> + while (start < end) {
> + unsigned long addr;
> + int entry, j, ret;
> +
> + addr = gfn_to_hva_many(vcpu->kvm, gfn, &entry);
> + if (kvm_is_error_hva(addr))
> + return -1;
> +
> + entry = min(entry, (int)(end - start));
> + ret = __get_user_pages_fast(addr, entry, 1, pages);
> + if (ret <= 0)
> + return -1;

Why can't you use gfn_to_pfn_atomic() here, one page at a time? Is
the overhead significant enough that this is worthwhile?

You're bypassing the centralized interface.
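
i.e. something like this (untested sketch; it just reuses the mmu_set_spte()
arguments from the hunk above and the error handling pattern the shadow
path already uses):

	while (start < end) {
		pfn_t pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);

		if (is_error_pfn(pfn)) {
			kvm_release_pfn_clean(pfn);
			return -1;
		}

		mmu_set_spte(vcpu, start, ACC_ALL, sp->role.access, 0, 0, 1,
			     NULL, sp->role.level, gfn, pfn, true, false);
		gfn++;
		start++;
	}
	return 0;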

> +
> + for (j = 0; j < ret; j++, gfn++, start++)
> + mmu_set_spte(vcpu, start, ACC_ALL,
> + sp->role.access, 0, 0, 1, NULL,
> + sp->role.level, gfn,
> + page_to_pfn(pages[j]), true, false);
> +
> + if (ret < entry)
> + return -1;
> + }
> + return 0;
> +}
> +
> +static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
> + struct kvm_mmu_page *sp, u64 *sptep)
> +{
> + u64 *start = NULL;
> + int index, i, max;
> +
> + WARN_ON(!sp->role.direct);
> +
> + if (pte_prefetch_topup_memory_cache(vcpu))
> + return;
> +
> + index = sptep - sp->spt;
> + i = index & ~(PTE_PREFETCH_NUM - 1);
> + max = index | (PTE_PREFETCH_NUM - 1);
> +
> + for (; i < max; i++) {
> + u64 *spte = sp->spt + i;
> +
> + if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
> + if (!start)
> + continue;
> + if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
> + break;
> + start = NULL;
> + } else if (!start)
> + start = spte;
> + }
> +}
> +
> +static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
> +{
> + struct kvm_mmu_page *sp;
> +
> + /*
> + * Since there is no accessed bit on EPT, there is no way to
> + * distinguish between actually accessed translations and
> + * prefetched ones, so disable pte prefetch if EPT is
> + * enabled.
> + */
> + if (!shadow_accessed_mask)
> + return;
> +
> + sp = page_header(__pa(sptep));
> + if (sp->role.level > PT_PAGE_TABLE_LEVEL)
> + return;
> +
> + __direct_pte_prefetch(vcpu, sp, sptep);
> +}
> +
> static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
> int level, gfn_t gfn, pfn_t pfn)
> {
> @@ -2015,6 +2097,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
> mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
> 0, write, 1, &pt_write,
> level, gfn, pfn, false, true);
> + direct_pte_prefetch(vcpu, iterator.sptep);
> ++vcpu->stat.pf_fixed;
> break;
> }
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 3350c02..d8c3be8 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -291,6 +291,81 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> gpte_to_gfn(gpte), pfn, true, true);
> }
>
> +static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep)
> +{
> + struct kvm_mmu_page *sp;
> + pt_element_t gptep[PTE_PREFETCH_NUM];
> + gpa_t first_pte_gpa;
> + int offset = 0, index, i, j, max;
> +
> + sp = page_header(__pa(sptep));
> + index = sptep - sp->spt;
> +
> + if (sp->role.level > PT_PAGE_TABLE_LEVEL)
> + return;
> +
> + if (sp->role.direct)
> + return __direct_pte_prefetch(vcpu, sp, sptep);
> +
> + index = sptep - sp->spt;
> + i = index & ~(PTE_PREFETCH_NUM - 1);
> + max = index | (PTE_PREFETCH_NUM - 1);
> +
> + if (PTTYPE == 32)
> + offset = sp->role.quadrant << PT64_LEVEL_BITS;
> +
> + first_pte_gpa = gfn_to_gpa(sp->gfn) +
> + (offset + i) * sizeof(pt_element_t);
> +
> + if (kvm_read_guest_atomic(vcpu->kvm, first_pte_gpa, gptep,
> + sizeof(gptep)) < 0)
> + return;
> +
> + if (pte_prefetch_topup_memory_cache(vcpu))
> + return;
> +
> + for (j = 0; i < max; i++, j++) {
> + pt_element_t gpte;
> + unsigned pte_access;
> + u64 *spte = sp->spt + i;
> + gfn_t gfn;
> + pfn_t pfn;
> +
> + if (spte == sptep)
> + continue;
> +
> + if (*spte != shadow_trap_nonpresent_pte)
> + continue;
> +
> + gpte = gptep[j];
> +
> + if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL))
> + break;
> +
> + if (!(gpte & PT_ACCESSED_MASK))
> + continue;
> +
> + if (!is_present_gpte(gpte)) {
> + if (!sp->unsync)
> + __set_spte(spte, shadow_notrap_nonpresent_pte);
> + continue;
> + }
> +
> + gfn = gpte_to_gfn(gpte);
> +
> + pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
> + if (is_error_pfn(pfn)) {
> + kvm_release_pfn_clean(pfn);
> + break;
> + }
> +
> + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
> + mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
> + is_dirty_gpte(gpte), NULL, sp->role.level, gfn,
> + pfn, true, false);

reset_host_protection should be true, see commit 1403283acca (also for
direct case to be consistent).
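
i.e. (sketch only, just flipping the last argument of the call quoted above):

	mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
		     is_dirty_gpte(gpte), NULL, sp->role.level, gfn,
		     pfn, true, true /* reset_host_protection */);

The mmu_set_spte() call in direct_pte_prefetch_many() would need the same
change to stay consistent.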