From: Gleb Natapov
On Fri, Jul 16, 2010 at 10:13:07AM +0800, Lai Jiangshan wrote:
> On a page fault, we always call get_user_pages(write=1).
>
> Actually, we don't need to do this when it is not a write fault.
> get_user_pages(write=1) will cause a shared (KSM) page to be copied.
> If this page is not modified in the future, this copying and the copied page
> are just wasted. KSM may scan and merge them, which may cause thrashing.
>
But if the page is written to afterwards we will get another page fault.

> In this patch, if the page is RO in the host VMM and it is not a write fault
> in the guest, we use the RO page; otherwise we use a writable page.
>
Currently pages allocated for guest memory are required to be RW, so after your series
behaviour will remain exactly the same as before.

> Signed-off-by: Lai Jiangshan <laijs(a)cn.fujitsu.com>
> ---
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 8ba9b0d..6382140 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1832,6 +1832,45 @@ static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
> }
> }
>
> +/* get a current mapped page fast, and test whether the page is writable. */
> +static struct page *get_user_page_and_protection(unsigned long addr,
> +						 int *writable)
> +{
> +	struct page *page[1];
> +
> +	if (__get_user_pages_fast(addr, 1, 1, page) == 1) {
> +		*writable = 1;
> +		return page[0];
> +	}
> +	if (__get_user_pages_fast(addr, 1, 0, page) == 1) {
> +		*writable = 0;
> +		return page[0];
> +	}
> +	return NULL;
> +}
> +
> +static pfn_t kvm_get_pfn_for_page_fault(struct kvm *kvm, gfn_t gfn,
> +					int write_fault, int *host_writable)
> +{
> +	unsigned long addr;
> +	struct page *page;
> +
> +	if (!write_fault) {
> +		addr = gfn_to_hva(kvm, gfn);
> +		if (kvm_is_error_hva(addr)) {
> +			get_page(bad_page);
> +			return page_to_pfn(bad_page);
> +		}
> +
> +		page = get_user_page_and_protection(addr, host_writable);
> +		if (page)
> +			return page_to_pfn(page);
> +	}
> +
> +	*host_writable = 1;
> +	return kvm_get_pfn_for_gfn(kvm, gfn);
> +}
> +
kvm_get_pfn_for_gfn() returns fault_page if the page is mapped RO, so a caller
of kvm_get_pfn_for_page_fault() and a caller of kvm_get_pfn_for_gfn() will get
different results for the same page. Not good.
The kvm_get_pfn_for_page_fault() logic should be folded into
kvm_get_pfn_for_gfn().
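
Roughly, the folded version could look like this (untested sketch only; the
slow-path call at the end stands in for whatever kvm_get_pfn_for_gfn() does
today, and its name is made up):

static pfn_t kvm_get_pfn_for_gfn(struct kvm *kvm, gfn_t gfn,
				 int write_fault, int *host_writable)
{
	unsigned long addr = gfn_to_hva(kvm, gfn);
	struct page *page[1];

	if (kvm_is_error_hva(addr)) {
		get_page(bad_page);
		return page_to_pfn(bad_page);
	}

	/* Fast path: reuse the current mapping, writable if possible. */
	if (__get_user_pages_fast(addr, 1, 1, page) == 1) {
		*host_writable = 1;
		return page_to_pfn(page[0]);
	}

	/* Read fault: a read-only mapping is good enough, no COW needed. */
	if (!write_fault && __get_user_pages_fast(addr, 1, 0, page) == 1) {
		*host_writable = 0;
		return page_to_pfn(page[0]);
	}

	/* Slow path: same behaviour as the existing lookup (may COW). */
	*host_writable = 1;
	return kvm_get_pfn_for_gfn_slow(kvm, addr);	/* made-up name */
}

That way every caller goes through the same function, and a given page cannot
come back as fault_page from one path and as a real pfn from the other.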

> static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
> bool can_unsync)
> {
> @@ -2085,6 +2124,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
> int level;
> pfn_t pfn;
> unsigned long mmu_seq;
> + int host_writable;
>
> level = mapping_level(vcpu, gfn);
>
> @@ -2099,7 +2139,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
>
> mmu_seq = vcpu->kvm->mmu_notifier_seq;
> smp_rmb();
> - pfn = kvm_get_pfn_for_gfn(vcpu->kvm, gfn);
> + pfn = kvm_get_pfn_for_page_fault(vcpu->kvm, gfn, write, &host_writable);
>
> /* mmio */
> if (is_error_pfn(pfn))
> @@ -2109,7 +2149,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
> if (mmu_notifier_retry(vcpu, mmu_seq))
> goto out_unlock;
> kvm_mmu_free_some_pages(vcpu);
> - r = __direct_map(vcpu, v, write, level, gfn, pfn, true);
> + r = __direct_map(vcpu, v, write, level, gfn, pfn, host_writable);
> spin_unlock(&vcpu->kvm->mmu_lock);
>
>
> @@ -2307,6 +2347,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
> int level;
> gfn_t gfn = gpa >> PAGE_SHIFT;
> unsigned long mmu_seq;
> + int write_fault = error_code & PFERR_WRITE_MASK;
> + int host_writable;
>
> ASSERT(vcpu);
> ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
> @@ -2321,15 +2363,16 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
>
> mmu_seq = vcpu->kvm->mmu_notifier_seq;
> smp_rmb();
> - pfn = kvm_get_pfn_for_gfn(vcpu->kvm, gfn);
> + pfn = kvm_get_pfn_for_page_fault(vcpu->kvm, gfn, write_fault,
> + &host_writable);
> if (is_error_pfn(pfn))
> return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
> spin_lock(&vcpu->kvm->mmu_lock);
> if (mmu_notifier_retry(vcpu, mmu_seq))
> goto out_unlock;
> kvm_mmu_free_some_pages(vcpu);
> - r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
> - level, gfn, pfn, true);
> + r = __direct_map(vcpu, gpa, write_fault,
> + level, gfn, pfn, host_writable);
> spin_unlock(&vcpu->kvm->mmu_lock);
>
> return r;
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index a9dbaa0..1874f51 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -430,6 +430,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
> pfn_t pfn;
> int level = PT_PAGE_TABLE_LEVEL;
> unsigned long mmu_seq;
> + int host_writable;
>
> pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
> kvm_mmu_audit(vcpu, "pre page fault");
> @@ -461,7 +462,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
>
> mmu_seq = vcpu->kvm->mmu_notifier_seq;
> smp_rmb();
> - pfn = kvm_get_pfn_for_gfn(vcpu->kvm, walker.gfn);
> + pfn = kvm_get_pfn_for_page_fault(vcpu->kvm, walker.gfn, write_fault,
> + &host_writable);
>
> /* mmio */
> if (is_error_pfn(pfn))
> @@ -472,7 +474,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
> goto out_unlock;
> kvm_mmu_free_some_pages(vcpu);
> sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
> - level, &write_pt, pfn, true);
> + level, &write_pt, pfn, host_writable);
> (void)sptep;
> pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
> sptep, *sptep, write_pt);
> diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
> index 738e659..a4ce19f 100644
> --- a/arch/x86/mm/gup.c
> +++ b/arch/x86/mm/gup.c
> @@ -8,6 +8,7 @@
> #include <linux/mm.h>
> #include <linux/vmstat.h>
> #include <linux/highmem.h>
> +#include <linux/module.h>
>
> #include <asm/pgtable.h>
>
> @@ -274,6 +275,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
>
> return nr;
> }
> +EXPORT_SYMBOL_GPL(__get_user_pages_fast);
>
> /**
> * get_user_pages_fast() - pin user pages in memory

--
Gleb.
From: Marcelo Tosatti
On Fri, Jul 16, 2010 at 10:19:36AM +0300, Gleb Natapov wrote:
> On Fri, Jul 16, 2010 at 10:13:07AM +0800, Lai Jiangshan wrote:
> > On a page fault, we always call get_user_pages(write=1).
> >
> > Actually, we don't need to do this when it is not a write fault.
> > get_user_pages(write=1) will cause a shared (KSM) page to be copied.
> > If this page is not modified in the future, this copying and the copied page
> > are just wasted. KSM may scan and merge them, which may cause thrashing.
> >
> But if the page is written to afterwards we will get another page fault.
>
> > In this patch, if the page is RO in the host VMM and it is not a write fault
> > in the guest, we use the RO page; otherwise we use a writable page.
> >
> Currently pages allocated for guest memory are required to be RW, so after your series
> behaviour will remain exactly the same as before.

Except KSM pages.

> > Signed-off-by: Lai Jiangshan <laijs(a)cn.fujitsu.com>
> > ---
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index 8ba9b0d..6382140 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -1832,6 +1832,45 @@ static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
> > }
> > }
> >
> > +/* get a current mapped page fast, and test whether the page is writable. */
> > +static struct page *get_user_page_and_protection(unsigned long addr,
> > + int *writable)
> > +{
> > + struct page *page[1];
> > +
> > + if (__get_user_pages_fast(addr, 1, 1, page) == 1) {
> > + *writable = 1;
> > + return page[0];
> > + }
> > + if (__get_user_pages_fast(addr, 1, 0, page) == 1) {
> > + *writable = 0;
> > + return page[0];
> > + }
> > + return NULL;
> > +}
> > +
> > +static pfn_t kvm_get_pfn_for_page_fault(struct kvm *kvm, gfn_t gfn,
> > + int write_fault, int *host_writable)
> > +{
> > + unsigned long addr;
> > + struct page *page;
> > +
> > + if (!write_fault) {
> > + addr = gfn_to_hva(kvm, gfn);
> > + if (kvm_is_error_hva(addr)) {
> > + get_page(bad_page);
> > + return page_to_pfn(bad_page);
> > + }
> > +
> > + page = get_user_page_and_protection(addr, host_writable);
> > + if (page)
> > + return page_to_pfn(page);
> > + }
> > +
> > + *host_writable = 1;
> > + return kvm_get_pfn_for_gfn(kvm, gfn);
> > +}
> > +
> kvm_get_pfn_for_gfn() returns fault_page if the page is mapped RO, so a caller
> of kvm_get_pfn_for_page_fault() and a caller of kvm_get_pfn_for_gfn() will get
> different results for the same page. Not good.
> The kvm_get_pfn_for_page_fault() logic should be folded into
> kvm_get_pfn_for_gfn().

Agreed. Please keep gfn_to_pfn related code in virt/kvm/kvm_main.c.

From: Gleb Natapov
On Fri, Jul 16, 2010 at 08:26:12PM -0300, Marcelo Tosatti wrote:
> On Fri, Jul 16, 2010 at 10:19:36AM +0300, Gleb Natapov wrote:
> > On Fri, Jul 16, 2010 at 10:13:07AM +0800, Lai Jiangshan wrote:
> > > On a page fault, we always call get_user_pages(write=1).
> > >
> > > Actually, we don't need to do this when it is not a write fault.
> > > get_user_pages(write=1) will cause a shared (KSM) page to be copied.
> > > If this page is not modified in the future, this copying and the copied page
> > > are just wasted. KSM may scan and merge them, which may cause thrashing.
> > >
> > But if the page is written to afterwards we will get another page fault.
> >
> > > In this patch, if the page is RO in the host VMM and it is not a write fault
> > > in the guest, we use the RO page; otherwise we use a writable page.
> > >
> > Currently pages allocated for guest memory are required to be RW, so after your series
> > behaviour will remain exactly the same as before.
>
> Except KSM pages.
>
A KSM page will be COWed by __get_user_pages_fast(addr, 1, 1, page) in
get_user_page_and_protection() just like it is COWed now, no?

> > > Signed-off-by: Lai Jiangshan <laijs(a)cn.fujitsu.com>
> > > ---
> > > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > > index 8ba9b0d..6382140 100644
> > > --- a/arch/x86/kvm/mmu.c
> > > +++ b/arch/x86/kvm/mmu.c
> > > @@ -1832,6 +1832,45 @@ static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
> > > }
> > > }
> > >
> > > +/* get a current mapped page fast, and test whether the page is writable. */
> > > +static struct page *get_user_page_and_protection(unsigned long addr,
> > > + int *writable)
> > > +{
> > > + struct page *page[1];
> > > +
> > > + if (__get_user_pages_fast(addr, 1, 1, page) == 1) {
> > > + *writable = 1;
> > > + return page[0];
> > > + }
> > > + if (__get_user_pages_fast(addr, 1, 0, page) == 1) {
> > > + *writable = 0;
> > > + return page[0];
> > > + }
> > > + return NULL;
> > > +}
> > > +
> > > +static pfn_t kvm_get_pfn_for_page_fault(struct kvm *kvm, gfn_t gfn,
> > > + int write_fault, int *host_writable)
> > > +{
> > > + unsigned long addr;
> > > + struct page *page;
> > > +
> > > + if (!write_fault) {
> > > + addr = gfn_to_hva(kvm, gfn);
> > > + if (kvm_is_error_hva(addr)) {
> > > + get_page(bad_page);
> > > + return page_to_pfn(bad_page);
> > > + }
> > > +
> > > + page = get_user_page_and_protection(addr, host_writable);
> > > + if (page)
> > > + return page_to_pfn(page);
> > > + }
> > > +
> > > + *host_writable = 1;
> > > + return kvm_get_pfn_for_gfn(kvm, gfn);
> > > +}
> > > +
> > kvm_get_pfn_for_gfn() returns fault_page if the page is mapped RO, so a caller
> > of kvm_get_pfn_for_page_fault() and a caller of kvm_get_pfn_for_gfn() will get
> > different results for the same page. Not good.
> > The kvm_get_pfn_for_page_fault() logic should be folded into
> > kvm_get_pfn_for_gfn().
>
> Agreed. Please keep gfn_to_pfn related code in virt/kvm/kvm_main.c.

--
Gleb.
From: Avi Kivity
On 07/17/2010 07:31 AM, Gleb Natapov wrote:
>>>
>>> Currently pages allocated for guest memory are required to be RW, so after your series
>>> behaviour will remain exactly the same as before.
>>>
>> Except KSM pages.
>>
>>
> A KSM page will be COWed by __get_user_pages_fast(addr, 1, 1, page) in
> get_user_page_and_protection() just like it is COWed now, no?
>

Well, we don't want to COW it on write faults.

The optimal behaviour is:

- write faults: COW and instantiate a writeable spte
- read faults: instantiate a readable spte; if likely(page is writeable),
  make it a writeable spte; if likely(page is dirty) make it a dirty spte
- speculative spte instantiations: if likely(page is present) instantiate
  a pte; if accessed, mark it accessed, if writeable, mark it writeable;
  if dirty, mark it dirty
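
To make those three cases concrete, here is a standalone toy model (purely
illustrative; the bit names, the struct and the two helpers are invented for
this sketch and are not the real spte layout or real KVM code):

#include <stdbool.h>
#include <stdint.h>

/* Invented bit names, for illustration only. */
#define SPTE_PRESENT	(UINT64_C(1) << 0)
#define SPTE_WRITABLE	(UINT64_C(1) << 1)
#define SPTE_DIRTY	(UINT64_C(1) << 2)
#define SPTE_ACCESSED	(UINT64_C(1) << 3)

struct host_pte_state {
	bool present, writable, dirty, accessed;
};

/* Real fault: the page has already been resolved (and COWed if needed). */
static uint64_t spte_for_fault(bool write_fault, struct host_pte_state h)
{
	uint64_t spte = SPTE_PRESENT;

	if (write_fault)		/* COW done; map it writeable */
		return spte | SPTE_WRITABLE;

	if (h.writable)			/* likely(page is writeable) */
		spte |= SPTE_WRITABLE;
	if (h.dirty)			/* likely(page is dirty) */
		spte |= SPTE_DIRTY;
	return spte;
}

/* Speculative instantiation: only mirror what the host pte already has. */
static uint64_t spte_speculative(struct host_pte_state h)
{
	uint64_t spte;

	if (!h.present)			/* nothing to instantiate */
		return 0;

	spte = SPTE_PRESENT;
	if (h.accessed)
		spte |= SPTE_ACCESSED;
	if (h.writable)
		spte |= SPTE_WRITABLE;
	if (h.dirty)
		spte |= SPTE_DIRTY;
	return spte;
}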

--
error compiling committee.c: too many arguments to function

From: Gleb Natapov
On Sun, Jul 18, 2010 at 06:14:11PM +0300, Avi Kivity wrote:
> On 07/17/2010 07:31 AM, Gleb Natapov wrote:
> >>>
> >>>Currently pages allocated for guest memory are required to be RW, so after your series
> >>>behaviour will remain exactly the same as before.
> >>Except KSM pages.
> >>
> >A KSM page will be COWed by __get_user_pages_fast(addr, 1, 1, page) in
> >get_user_page_and_protection() just like it is COWed now, no?
>
> Well, we don't want to COW it on write faults.
>
> The optimal behaviour is:
>
> - write faults: COW and instantiate a writeable spte
So do we or don't we want to COW on write faults?

--
Gleb.