From: Andrea Arcangeli on
Hi,

with mainline + my exec-migrate race fix + your patch1 in this series,
and the below patches, THP should work safe also on top of new
anon-vma code. I'm keeping them incremental but I'll keep them applied
also in aa.git as these patches should work for both the new and old
anon-vma semantics (if there are rejects they're minor). aa.git will
stick longer with old anon-vma code to avoid testing too much stuff at
the same time.

I'm sending the changes below for review.

I think I'll also try to use quilt guards to maintain two trees one
with new anon-vma ready for merging and one with the old
anon-vma. aa.git origin/master will stick to the old anon-vma code.

Thanks,
Andrea

---
Subject: update to the new anon-vma semantics

From: Andrea Arcangeli <aarcange(a)redhat.com>

The new anon-vma code broke for example wait_split_huge_page because the
vma->anon_vma->lock isn't necessarily the one that split_huge_page holds.
split_huge_page holds the page->mapping/anon_vma->lock if "page" is a shared
readonly hugepage.

The code that works with the new anon-vma code also works with the old one, so
it's better to apply it unconditionally so it gets more testing also on top of
the old anon-vma code.

Signed-off-by: Andrea Arcangeli <aarcange(a)redhat.com>
---

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -74,24 +74,13 @@ extern int handle_pte_fault(struct mm_st
pte_t *pte, pmd_t *pmd, unsigned int flags);
extern int split_huge_page(struct page *page);
extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
+extern void wait_split_huge_page(struct mm_struct *mm, pmd_t *pmd);
#define split_huge_page_pmd(__mm, __pmd) \
do { \
pmd_t *____pmd = (__pmd); \
if (unlikely(pmd_trans_huge(*____pmd))) \
__split_huge_page_pmd(__mm, ____pmd); \
} while (0)
-#define wait_split_huge_page(__anon_vma, __pmd) \
- do { \
- pmd_t *____pmd = (__pmd); \
- spin_unlock_wait(&(__anon_vma)->lock); \
- /* \
- * spin_unlock_wait() is just a loop in C and so the \
- * CPU can reorder anything around it. \
- */ \
- smp_mb(); \
- BUG_ON(pmd_trans_splitting(*____pmd) || \
- pmd_trans_huge(*____pmd)); \
- } while (0)
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
#if HPAGE_PMD_ORDER > MAX_ORDER
@@ -118,7 +107,7 @@ static inline int split_huge_page(struct
}
#define split_huge_page_pmd(__mm, __pmd) \
do { } while (0)
-#define wait_split_huge_page(__anon_vma, __pmd) \
+#define wait_split_huge_page(__mm, __pmd) \
do { } while (0)
#define PageTransHuge(page) 0
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -162,6 +162,9 @@ int try_to_munlock(struct page *);
*/
struct anon_vma *page_lock_anon_vma(struct page *page);
void page_unlock_anon_vma(struct anon_vma *anon_vma);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void wait_page_lock_anon_vma(struct page *page);
+#endif
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);

/*
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -314,7 +314,7 @@ int copy_huge_pmd(struct mm_struct *dst_
spin_unlock(&src_mm->page_table_lock);
spin_unlock(&dst_mm->page_table_lock);

- wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
+ wait_split_huge_page(src_mm, src_pmd); /* src_vma */
goto out;
}
src_page = pmd_page(pmd);
@@ -551,8 +551,7 @@ int zap_huge_pmd(struct mmu_gather *tlb,
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
spin_unlock(&tlb->mm->page_table_lock);
- wait_split_huge_page(vma->anon_vma,
- pmd);
+ wait_split_huge_page(tlb->mm, pmd);
} else {
struct page *page;
pgtable_t pgtable;
@@ -879,3 +878,35 @@ void __split_huge_page_pmd(struct mm_str
put_page(page);
BUG_ON(pmd_trans_huge(*pmd));
}
+
+void wait_split_huge_page(struct mm_struct *mm, pmd_t *pmd)
+{
+ struct page *page;
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_trans_huge(*pmd))) {
+ spin_unlock(&mm->page_table_lock);
+ VM_BUG_ON(pmd_trans_splitting(*pmd));
+ return;
+ }
+ page = pmd_page(*pmd);
+ get_page(page);
+ spin_unlock(&mm->page_table_lock);
+
+ /*
+ * The vma->anon_vma->lock is the wrong lock if the page is shared,
+ * the anon_vma->lock pointed by page->mapping is the right one.
+ */
+ wait_page_lock_anon_vma(page);
+
+ put_page(page);
+
+ /*
+ * spin_unlock_wait() is just a loop in C and so the
+ * CPU can reorder anything around it.
+ */
+ smp_mb();
+
+ BUG_ON(pmd_trans_huge(*pmd));
+ VM_BUG_ON(pmd_trans_splitting(*pmd));
+}
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -400,7 +400,7 @@ int __pte_alloc(struct mm_struct *mm, st
pmd_t *pmd, unsigned long address)
{
pgtable_t new = pte_alloc_one(mm, address);
- int wait_split_huge_page;
+ int need_wait_split_huge_page;
if (!new)
return -ENOMEM;

@@ -420,18 +420,18 @@ int __pte_alloc(struct mm_struct *mm, st
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

spin_lock(&mm->page_table_lock);
- wait_split_huge_page = 0;
+ need_wait_split_huge_page = 0;
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
mm->nr_ptes++;
pmd_populate(mm, pmd, new);
new = NULL;
} else if (unlikely(pmd_trans_splitting(*pmd)))
- wait_split_huge_page = 1;
+ need_wait_split_huge_page = 1;
spin_unlock(&mm->page_table_lock);
if (new)
pte_free(mm, new);
- if (wait_split_huge_page)
- wait_split_huge_page(vma->anon_vma, pmd);
+ if (need_wait_split_huge_page)
+ wait_split_huge_page(mm, pmd);
return 0;
}

@@ -1302,7 +1302,7 @@ struct page *follow_page(struct vm_area_
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
spin_unlock(&mm->page_table_lock);
- wait_split_huge_page(vma->anon_vma, pmd);
+ wait_split_huge_page(mm, pmd);
} else {
page = follow_trans_huge_pmd(mm, address,
pmd, flags);
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -225,6 +225,30 @@ void page_unlock_anon_vma(struct anon_vm
rcu_read_unlock();
}

+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * Getting a lock on a stable anon_vma from a page off the LRU is
+ * tricky: page_lock_anon_vma rely on RCU to guard against the races.
+ */
+void wait_page_lock_anon_vma(struct page *page)
+{
+ struct anon_vma *anon_vma;
+ unsigned long anon_mapping;
+
+ rcu_read_lock();
+ anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
+ if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+ goto out;
+ if (!page_mapped(page))
+ goto out;
+
+ anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+ spin_unlock_wait(&anon_vma->lock);
+out:
+ rcu_read_unlock();
+}
+#endif
+
/*
* At what user virtual address is page expected in @vma?
* Returns virtual address or -EFAULT if page's index/offset is not




----
Subject: adapt mincore to anon_vma chain semantics

From: Andrea Arcangeli <aarcange(a)redhat.com>

wait_split_huge_page interface changed.

Signed-off-by: Andrea Arcangeli <aarcange(a)redhat.com>
---

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -905,7 +905,7 @@ int mincore_huge_pmd(struct vm_area_stru
ret = !pmd_trans_splitting(*pmd);
spin_unlock(&vma->vm_mm->page_table_lock);
if (unlikely(!ret))
- wait_split_huge_page(vma->anon_vma, pmd);
+ wait_split_huge_page(vma->vm_mm, pmd);
else {
/*
* All logical pages in the range are present
-----
Subject: adapt mprotect to anon_vma chain semantics

From: Andrea Arcangeli <aarcange(a)redhat.com>

wait_split_huge_page interface changed.

Signed-off-by: Andrea Arcangeli <aarcange(a)redhat.com>
---

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -929,7 +929,7 @@ int change_huge_pmd(struct vm_area_struc
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
spin_unlock(&mm->page_table_lock);
- wait_split_huge_page(vma->anon_vma, pmd);
+ wait_split_huge_page(mm, pmd);
} else {
pmd_t entry;

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Johannes Weiner on
On Fri, Apr 30, 2010 at 08:28:53PM +0200, Andrea Arcangeli wrote:
> Subject: adapt mprotect to anon_vma chain semantics
>
> From: Andrea Arcangeli <aarcange(a)redhat.com>
>
> wait_split_huge_page interface changed.
>
> Signed-off-by: Andrea Arcangeli <aarcange(a)redhat.com>
> ---
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -929,7 +929,7 @@ int change_huge_pmd(struct vm_area_struc
> if (likely(pmd_trans_huge(*pmd))) {
> if (unlikely(pmd_trans_splitting(*pmd))) {
> spin_unlock(&mm->page_table_lock);
> - wait_split_huge_page(vma->anon_vma, pmd);
> + wait_split_huge_page(mm, pmd);

That makes mprotect-vma-arg obsolete, I guess.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Andrea Arcangeli on
On Sat, May 01, 2010 at 03:51:10PM +0200, Johannes Weiner wrote:
> On Fri, Apr 30, 2010 at 08:28:53PM +0200, Andrea Arcangeli wrote:
> > Subject: adapt mprotect to anon_vma chain semantics
> >
> > From: Andrea Arcangeli <aarcange(a)redhat.com>
> >
> > wait_split_huge_page interface changed.
> >
> > Signed-off-by: Andrea Arcangeli <aarcange(a)redhat.com>
> > ---
> >
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -929,7 +929,7 @@ int change_huge_pmd(struct vm_area_struc
> > if (likely(pmd_trans_huge(*pmd))) {
> > if (unlikely(pmd_trans_splitting(*pmd))) {
> > spin_unlock(&mm->page_table_lock);
> > - wait_split_huge_page(vma->anon_vma, pmd);
> > + wait_split_huge_page(mm, pmd);
>
> That makes mprotect-vma-arg obsolete, I guess.

Well it's needed for flush_tlb_range. Also normally we could run a
single invlpg on x86 to invalidate huge pmd tlbs, but I read some
errata for some x86, and I didn't want to take risks plus this is
common code so I can't just run a common code flush_tlb_page. In
mincore_huge_pmd probably we could pass vma->vm_mm instead of vma (as
there is not flush_tlb_range), I can change it if you prefer.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Johannes Weiner on
On Mon, May 03, 2010 at 05:33:01PM +0200, Andrea Arcangeli wrote:
> On Sat, May 01, 2010 at 03:51:10PM +0200, Johannes Weiner wrote:
> > On Fri, Apr 30, 2010 at 08:28:53PM +0200, Andrea Arcangeli wrote:
> > > Subject: adapt mprotect to anon_vma chain semantics
> > >
> > > From: Andrea Arcangeli <aarcange(a)redhat.com>
> > >
> > > wait_split_huge_page interface changed.
> > >
> > > Signed-off-by: Andrea Arcangeli <aarcange(a)redhat.com>
> > > ---
> > >
> > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > > --- a/mm/huge_memory.c
> > > +++ b/mm/huge_memory.c
> > > @@ -929,7 +929,7 @@ int change_huge_pmd(struct vm_area_struc
> > > if (likely(pmd_trans_huge(*pmd))) {
> > > if (unlikely(pmd_trans_splitting(*pmd))) {
> > > spin_unlock(&mm->page_table_lock);
> > > - wait_split_huge_page(vma->anon_vma, pmd);
> > > + wait_split_huge_page(mm, pmd);
> >
> > That makes mprotect-vma-arg obsolete, I guess.
>
> Well it's needed for flush_tlb_range.

I must have been blind, sorry for the noise.

> In mincore_huge_pmd probably we could pass vma->vm_mm instead of
> vma (as there is not flush_tlb_range), I can change it if you prefer.

Although not strictly required, it's probably nicer to keep the
function signatures in this code alike. So everything fine with
me as it stands :)

Hannes
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Andrea Arcangeli on
On Tue, May 04, 2010 at 01:41:32AM +0200, Johannes Weiner wrote:
> Although not strictly required, it's probably nicer to keep the
> function signatures in this code alike. So everything fine with
> me as it stands :)

Too late I already optimized mincore...
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/