From: Avi Kivity on
On 06/15/2010 05:47 AM, Xiao Guangrong wrote:
> Support prefetch ptes when intercept guest #PF, avoid to #PF by later
> access
>
> If we meet any failure in the prefetch path, we will exit it and
> not try other ptes to avoid become heavy path
>
>
>
> +#define PTE_PREFETCH_NUM 16
> +
> #define PT_FIRST_AVAIL_BITS_SHIFT 9
> #define PT64_SECOND_AVAIL_BITS_SHIFT 52
>
> @@ -2041,6 +2043,39 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
> {
> }
>
> +static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
> +{
> + struct kvm_mmu_page *sp;
> + int index, i;
> +
> + sp = page_header(__pa(sptep));
> + WARN_ON(!sp->role.direct);
> + index = sptep - sp->spt;
> +
> + for (i = index + 1; i< min(PT64_ENT_PER_PAGE,
> + index + PTE_PREFETCH_NUM); i++) {
> + gfn_t gfn;
> + pfn_t pfn;
> + u64 *spte = sp->spt + i;
> +
> + if (*spte != shadow_trap_nonpresent_pte)
> + continue;
> +
> + gfn = sp->gfn + (i<< ((sp->role.level - 1) * PT64_LEVEL_BITS));
>

Can calculate outside the loop and use +=.

Can this in fact work for level != PT_PAGE_TABLE_LEVEL? We might start
at PT_PAGE_DIRECTORY_LEVEL but get 4k pages while iterating.

> +
> + pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
> + if (is_error_pfn(pfn)) {
> + kvm_release_pfn_clean(pfn);
> + break;
> + }
> + if (pte_prefetch_topup_memory_cache(vcpu))
> + break;
> +
> + mmu_set_spte(vcpu, spte, ACC_ALL, ACC_ALL, 0, 0, 1, NULL,
> + sp->role.level, gfn, pfn, true, false);
> + }
> +}
>

Nice. Direct prefetch should usually succeed.

Can later augment to call get_users_pages_fast(..., PTE_PREFETCH_NUM,
....) to reduce gup overhead.

>
> +static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep)
> +{
> + struct kvm_mmu_page *sp;
> + pt_element_t *table = NULL;
> + int offset = 0, shift, index, i;
> +
> + sp = page_header(__pa(sptep));
> + index = sptep - sp->spt;
> +
> + if (PTTYPE == 32) {
> + shift = PAGE_SHIFT - (PT_LEVEL_BITS -
> + PT64_LEVEL_BITS) * sp->role.level;
> + offset = sp->role.quadrant<< shift;
> + }
> +
> + for (i = index + 1; i< min(PT64_ENT_PER_PAGE,
> + index + PTE_PREFETCH_NUM); i++) {
> + struct page *page;
> + pt_element_t gpte;
> + unsigned pte_access;
> + u64 *spte = sp->spt + i;
> + gfn_t gfn;
> + pfn_t pfn;
> + int dirty;
> +
> + if (*spte != shadow_trap_nonpresent_pte)
> + continue;
> +
> + pte_access = sp->role.access;
> + if (sp->role.direct) {
> + dirty = 1;
> + gfn = sp->gfn + (i<< ((sp->role.level - 1) *
> + PT64_LEVEL_BITS));
> + goto gfn_mapping;
> + }
>

Should just call direct_pte_prefetch.

> +
> + if (!table) {
> + page = gfn_to_page_atomic(vcpu->kvm, sp->gfn);
> + if (is_error_page(page)) {
> + kvm_release_page_clean(page);
> + break;
> + }
> + table = kmap_atomic(page, KM_USER0);
> + table = (pt_element_t *)((char *)table + offset);
> + }
>

Why not kvm_read_guest_atomic()? Can do it outside the loop.

> +
> + gpte = table[i];
> + if (!(gpte& PT_ACCESSED_MASK))
> + continue;
> +
> + if (!is_present_gpte(gpte)) {
> + if (!sp->unsync)
> + *spte = shadow_notrap_nonpresent_pte;
>

Need __set_spte().

> + continue;
> + }
> + dirty = is_dirty_gpte(gpte);
> + gfn = (gpte& PT64_BASE_ADDR_MASK)>> PAGE_SHIFT;
> + pte_access = pte_access& FNAME(gpte_access)(vcpu, gpte);
> +gfn_mapping:
> + pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
> + if (is_error_pfn(pfn)) {
> + kvm_release_pfn_clean(pfn);
> + break;
> + }
> +
> + if (pte_prefetch_topup_memory_cache(vcpu))
> + break;
> + mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
> + dirty, NULL, sp->role.level, gfn, pfn,
> + true, false);
> + }
> + if (table)
> + kunmap_atomic((char *)table - offset, KM_USER0);
> +}
>

I think lot of code can be shared with the pte prefetch in invlpg.

> +
> /*
> * Fetch a shadow pte for a specific level in the paging hierarchy.
> */
> @@ -322,6 +397,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
> is_dirty_gpte(gw->ptes[gw->level-1]),
> ptwrite, level,
> gw->gfn, pfn, false, true);
> + FNAME(pte_prefetch)(vcpu, sptep);
> break;
> }
>
>


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Marcelo Tosatti on
On Tue, Jun 15, 2010 at 10:47:04AM +0800, Xiao Guangrong wrote:
> Support prefetch ptes when intercept guest #PF, avoid to #PF by later
> access
>
> If we meet any failure in the prefetch path, we will exit it and
> not try other ptes to avoid become heavy path
>
> Signed-off-by: Xiao Guangrong <xiaoguangrong(a)cn.fujitsu.com>
> ---
> arch/x86/kvm/mmu.c | 36 +++++++++++++++++++++
> arch/x86/kvm/paging_tmpl.h | 76 ++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 112 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 92ff099..941c86b 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -89,6 +89,8 @@ module_param(oos_shadow, bool, 0644);
> }
> #endif
>
> +#define PTE_PREFETCH_NUM 16
> +
> #define PT_FIRST_AVAIL_BITS_SHIFT 9
> #define PT64_SECOND_AVAIL_BITS_SHIFT 52
>
> @@ -2041,6 +2043,39 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
> {
> }
>
> +static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
> +{
> + struct kvm_mmu_page *sp;
> + int index, i;
> +
> + sp = page_header(__pa(sptep));
> + WARN_ON(!sp->role.direct);
> + index = sptep - sp->spt;
> +
> + for (i = index + 1; i < min(PT64_ENT_PER_PAGE,
> + index + PTE_PREFETCH_NUM); i++) {
> + gfn_t gfn;
> + pfn_t pfn;
> + u64 *spte = sp->spt + i;
> +
> + if (*spte != shadow_trap_nonpresent_pte)
> + continue;
> +
> + gfn = sp->gfn + (i << ((sp->role.level - 1) * PT64_LEVEL_BITS));
> +
> + pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
> + if (is_error_pfn(pfn)) {
> + kvm_release_pfn_clean(pfn);
> + break;
> + }
> + if (pte_prefetch_topup_memory_cache(vcpu))
> + break;
> +
> + mmu_set_spte(vcpu, spte, ACC_ALL, ACC_ALL, 0, 0, 1, NULL,
> + sp->role.level, gfn, pfn, true, false);

Can only map with level > 1 if the host page matches the size.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Xiao Guangrong on


Avi Kivity wrote:

>> + if (*spte != shadow_trap_nonpresent_pte)
>> + continue;
>> +
>> + gfn = sp->gfn + (i<< ((sp->role.level - 1) * PT64_LEVEL_BITS));
>>
>

Avi,

Thanks for your comment.

> Can calculate outside the loop and use +=.
>

It's nice, will do it in the next version.

> Can this in fact work for level != PT_PAGE_TABLE_LEVEL? We might start
> at PT_PAGE_DIRECTORY_LEVEL but get 4k pages while iterating.

Ah, i forgot it. We can't assume that the host also support huge page for
next gfn, as Marcelo's suggestion, we should "only map with level > 1 if
the host page matches the size".

Um, the problem is, when we get host page size, we should hold 'mm->mmap_sem',
it can't used in atomic context and it's also a slow path, we hope pte prefetch
path is fast.

How about only allow prefetch for sp.leve = 1 now? i'll improve it in the future,
i think it need more time :-)

>
>> +
>> + pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
>> + if (is_error_pfn(pfn)) {
>> + kvm_release_pfn_clean(pfn);
>> + break;
>> + }
>> + if (pte_prefetch_topup_memory_cache(vcpu))
>> + break;
>> +
>> + mmu_set_spte(vcpu, spte, ACC_ALL, ACC_ALL, 0, 0, 1, NULL,
>> + sp->role.level, gfn, pfn, true, false);
>> + }
>> +}
>>
>
> Nice. Direct prefetch should usually succeed.
>
> Can later augment to call get_users_pages_fast(..., PTE_PREFETCH_NUM,
> ...) to reduce gup overhead.

But we can't assume the gfn's hva is consecutive, for example, gfn and gfn+1
maybe in the different slots.

>
>>
>> +static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep)
>> +{
>> + struct kvm_mmu_page *sp;
>> + pt_element_t *table = NULL;
>> + int offset = 0, shift, index, i;
>> +
>> + sp = page_header(__pa(sptep));
>> + index = sptep - sp->spt;
>> +
>> + if (PTTYPE == 32) {
>> + shift = PAGE_SHIFT - (PT_LEVEL_BITS -
>> + PT64_LEVEL_BITS) * sp->role.level;
>> + offset = sp->role.quadrant<< shift;
>> + }
>> +
>> + for (i = index + 1; i< min(PT64_ENT_PER_PAGE,
>> + index + PTE_PREFETCH_NUM); i++) {
>> + struct page *page;
>> + pt_element_t gpte;
>> + unsigned pte_access;
>> + u64 *spte = sp->spt + i;
>> + gfn_t gfn;
>> + pfn_t pfn;
>> + int dirty;
>> +
>> + if (*spte != shadow_trap_nonpresent_pte)
>> + continue;
>> +
>> + pte_access = sp->role.access;
>> + if (sp->role.direct) {
>> + dirty = 1;
>> + gfn = sp->gfn + (i<< ((sp->role.level - 1) *
>> + PT64_LEVEL_BITS));
>> + goto gfn_mapping;
>> + }
>>
>
> Should just call direct_pte_prefetch.
>

OK, will fix it.

>> +
>> + if (!table) {
>> + page = gfn_to_page_atomic(vcpu->kvm, sp->gfn);
>> + if (is_error_page(page)) {
>> + kvm_release_page_clean(page);
>> + break;
>> + }
>> + table = kmap_atomic(page, KM_USER0);
>> + table = (pt_element_t *)((char *)table + offset);
>> + }
>>
>
> Why not kvm_read_guest_atomic()? Can do it outside the loop.

Do you mean that read all prefetched sptes at one time?
If prefetch one spte fail, the later sptes that we read is waste, so i
choose read next spte only when current spte is prefetched successful.

But i not have strong opinion on it since it's fast to read all sptes at
one time, at the worst case, only 16 * 8 = 128 bytes we need to read.

>
>> +
>> + gpte = table[i];
>> + if (!(gpte& PT_ACCESSED_MASK))
>> + continue;
>> +
>> + if (!is_present_gpte(gpte)) {
>> + if (!sp->unsync)
>> + *spte = shadow_notrap_nonpresent_pte;
>>
>
> Need __set_spte().

Oops, fix it.

>
>> + continue;
>> + }
>> + dirty = is_dirty_gpte(gpte);
>> + gfn = (gpte& PT64_BASE_ADDR_MASK)>> PAGE_SHIFT;
>> + pte_access = pte_access& FNAME(gpte_access)(vcpu, gpte);
>> +gfn_mapping:
>> + pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
>> + if (is_error_pfn(pfn)) {
>> + kvm_release_pfn_clean(pfn);
>> + break;
>> + }
>> +
>> + if (pte_prefetch_topup_memory_cache(vcpu))
>> + break;
>> + mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
>> + dirty, NULL, sp->role.level, gfn, pfn,
>> + true, false);
>> + }
>> + if (table)
>> + kunmap_atomic((char *)table - offset, KM_USER0);
>> +}
>>
>
> I think lot of code can be shared with the pte prefetch in invlpg.
>

Yes, please allow me to cleanup those code after my future patchset:

[PATCH v4 9/9] KVM MMU: optimize sync/update unsync-page

it's the last part in the 'allow multiple shadow pages' patchset.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Avi Kivity on
On 06/17/2010 10:25 AM, Xiao Guangrong wrote:
>
>
>> Can this in fact work for level != PT_PAGE_TABLE_LEVEL? We might start
>> at PT_PAGE_DIRECTORY_LEVEL but get 4k pages while iterating.
>>
> Ah, i forgot it. We can't assume that the host also support huge page for
> next gfn, as Marcelo's suggestion, we should "only map with level> 1 if
> the host page matches the size".
>
> Um, the problem is, when we get host page size, we should hold 'mm->mmap_sem',
> it can't used in atomic context and it's also a slow path, we hope pte prefetch
> path is fast.
>
> How about only allow prefetch for sp.leve = 1 now? i'll improve it in the future,
> i think it need more time :-)
>

I don't think prefetch for level > 1 is worthwhile. One fault per 2MB
is already very good, no need to optimize it further.

>>> +
>>> + pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
>>> + if (is_error_pfn(pfn)) {
>>> + kvm_release_pfn_clean(pfn);
>>> + break;
>>> + }
>>> + if (pte_prefetch_topup_memory_cache(vcpu))
>>> + break;
>>> +
>>> + mmu_set_spte(vcpu, spte, ACC_ALL, ACC_ALL, 0, 0, 1, NULL,
>>> + sp->role.level, gfn, pfn, true, false);
>>> + }
>>> +}
>>>
>>>
>> Nice. Direct prefetch should usually succeed.
>>
>> Can later augment to call get_users_pages_fast(..., PTE_PREFETCH_NUM,
>> ...) to reduce gup overhead.
>>
> But we can't assume the gfn's hva is consecutive, for example, gfn and gfn+1
> maybe in the different slots.
>

Right. We could limit it to one slot then for simplicity.

>
>>> +
>>> + if (!table) {
>>> + page = gfn_to_page_atomic(vcpu->kvm, sp->gfn);
>>> + if (is_error_page(page)) {
>>> + kvm_release_page_clean(page);
>>> + break;
>>> + }
>>> + table = kmap_atomic(page, KM_USER0);
>>> + table = (pt_element_t *)((char *)table + offset);
>>> + }
>>>
>>>
>> Why not kvm_read_guest_atomic()? Can do it outside the loop.
>>
> Do you mean that read all prefetched sptes at one time?
>

Yes.

> If prefetch one spte fail, the later sptes that we read is waste, so i
> choose read next spte only when current spte is prefetched successful.
>
> But i not have strong opinion on it since it's fast to read all sptes at
> one time, at the worst case, only 16 * 8 = 128 bytes we need to read.
>

In general batching is worthwhile, the cost of the extra bytes is low
compared to the cost of bringing in the cacheline and error checking.

btw, you could align the prefetch to 16 pte boundary. That would
improve performance for memory that is scanned backwards.

So we can change the fault path to always fault 16 ptes, aligned on 16
pte boundary, with the needed pte called with specualtive=false.
>> I think lot of code can be shared with the pte prefetch in invlpg.
>>
>>
> Yes, please allow me to cleanup those code after my future patchset:
>
> [PATCH v4 9/9] KVM MMU: optimize sync/update unsync-page
>
> it's the last part in the 'allow multiple shadow pages' patchset.
>

Sure.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Xiao Guangrong on


Avi Kivity wrote:
> On 06/17/2010 10:25 AM, Xiao Guangrong wrote:
>>
>>
>>> Can this in fact work for level != PT_PAGE_TABLE_LEVEL? We might start
>>> at PT_PAGE_DIRECTORY_LEVEL but get 4k pages while iterating.
>>>
>> Ah, i forgot it. We can't assume that the host also support huge page for
>> next gfn, as Marcelo's suggestion, we should "only map with level> 1 if
>> the host page matches the size".
>>
>> Um, the problem is, when we get host page size, we should hold
>> 'mm->mmap_sem',
>> it can't used in atomic context and it's also a slow path, we hope pte
>> prefetch
>> path is fast.
>>
>> How about only allow prefetch for sp.leve = 1 now? i'll improve it in
>> the future,
>> i think it need more time :-)
>>
>
> I don't think prefetch for level > 1 is worthwhile. One fault per 2MB
> is already very good, no need to optimize it further.
>

OK

>>>> +
>>>> + pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
>>>> + if (is_error_pfn(pfn)) {
>>>> + kvm_release_pfn_clean(pfn);
>>>> + break;
>>>> + }
>>>> + if (pte_prefetch_topup_memory_cache(vcpu))
>>>> + break;
>>>> +
>>>> + mmu_set_spte(vcpu, spte, ACC_ALL, ACC_ALL, 0, 0, 1, NULL,
>>>> + sp->role.level, gfn, pfn, true, false);
>>>> + }
>>>> +}
>>>>
>>>>
>>> Nice. Direct prefetch should usually succeed.
>>>
>>> Can later augment to call get_users_pages_fast(..., PTE_PREFETCH_NUM,
>>> ...) to reduce gup overhead.
>>>
>> But we can't assume the gfn's hva is consecutive, for example, gfn and
>> gfn+1
>> maybe in the different slots.
>>
>
> Right. We could limit it to one slot then for simplicity.

OK, i'll do it.

>
>>
>>>> +
>>>> + if (!table) {
>>>> + page = gfn_to_page_atomic(vcpu->kvm, sp->gfn);
>>>> + if (is_error_page(page)) {
>>>> + kvm_release_page_clean(page);
>>>> + break;
>>>> + }
>>>> + table = kmap_atomic(page, KM_USER0);
>>>> + table = (pt_element_t *)((char *)table + offset);
>>>> + }
>>>>
>>>>
>>> Why not kvm_read_guest_atomic()? Can do it outside the loop.
>>>
>> Do you mean that read all prefetched sptes at one time?
>>
>
> Yes.
>
>> If prefetch one spte fail, the later sptes that we read is waste, so i
>> choose read next spte only when current spte is prefetched successful.
>>
>> But i not have strong opinion on it since it's fast to read all sptes at
>> one time, at the worst case, only 16 * 8 = 128 bytes we need to read.
>>
>
> In general batching is worthwhile, the cost of the extra bytes is low
> compared to the cost of bringing in the cacheline and error checking.
>

Agreed.

> btw, you could align the prefetch to 16 pte boundary. That would
> improve performance for memory that is scanned backwards.
>

Yeah, good idea.

> So we can change the fault path to always fault 16 ptes, aligned on 16
> pte boundary, with the needed pte called with specualtive=false.

Avi, i not understand it clearly, Could you please explain it? :-(
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/