From: Mel Gorman on
On Sun, Jul 18, 2010 at 07:18:31PM +0900, Minchan Kim wrote:
> Kukjin reported an oops that happens when he changes min_free_kbytes:
> http://www.spinics.net/lists/arm-kernel/msg92894.html
> It is caused by the memory map layout on sparsemem.
>

First off, thanks for working on this.

> The system has the following memory map:
> section 0              section 1              section 2
> 0x20000000-0x25000000  0x40000000-0x50000000  0x50000000-0x58000000
> SECTION_SIZE_BITS 28 (256M)
>
> It means section 0 is an incompletely filled section.
> Nonetheless, the current pfn_valid of sparsemem checks pfn loosely:
> it checks only the mem_section's validity, but ARM can free the mem_map
> over a hole to save memory. So in the above case, a pfn at 0x25000000 can
> pass pfn_valid's check. That's not what we want.
>
> We could match the section size to the smallest valid size (e.g., 16M in
> the above case), but Russell doesn't like that due to mem_section's memory
> overhead with other configurations (e.g., a 512K section).
>
> I tried adding a valid pfn range to mem_section, but nobody liked it
> due to the size overhead.

Also IIRC, it was vulnerable to a hole being punched in the middle of a
section.

> This patch was suggested by KAMEZAWA-san.
> I just fixed a compile error and changed some naming.
>
> This patch stores the address of the mem_section in the pg->private field
> of the page struct backing the section's memmap: a match means the page is
> used for the section's memmap; otherwise, the page is used for another
> purpose and the memmap has a hole there.
>
> This patch is based on mmotm-2010-07-01-12-19.
>
> Signed-off-by: Minchan Kim <minchan.kim(a)gmail.com>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu(a)jp.fujitsu.com>
> Reported-by: Kukjin Kim <kgene.kim(a)samsung.com>
> ---
> arch/arm/mm/init.c | 9 ++++++++-
> include/linux/mmzone.h | 21 ++++++++++++++++++++-
> mm/Kconfig | 5 +++++
> mm/sparse.c | 41 +++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 74 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
> index cfe4c5e..4586f40 100644
> --- a/arch/arm/mm/init.c
> +++ b/arch/arm/mm/init.c
> @@ -234,6 +234,11 @@ static void __init arm_bootmem_free(struct meminfo *mi)
> arch_adjust_zones(zone_size, zhole_size);
>
> free_area_init_node(0, zone_size, min, zhole_size);
> +
> + for_each_bank(i, mi) {
> + mark_memmap_hole(bank_pfn_start(&mi->bank[i]),
> + bank_pfn_end(&mi->bank[i]), true);
> + }
> }

Why do we need to mark banks both valid and invalid? Is it not enough to
just mark the holes in free_memmap() and assume it is valid otherwise?
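
One possible shape of that variant, as a sketch only, assuming pg->private
can be guaranteed to stay zero for pages backing a live memmap (which is
exactly what would need checking), with free_memmap() stamping the holes:

	static inline int page_valid(struct mem_section *ms, unsigned long pfn)
	{
		struct page *__pg = virt_to_page(pfn_to_page(pfn));

		/* live memmap pages keep private == 0; holes get stamped */
		return __pg->private == 0;
	}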

>
> #ifndef CONFIG_SPARSEMEM
> @@ -386,8 +391,10 @@ free_memmap(unsigned long start_pfn, unsigned long end_pfn)
> * If there are free pages between these,
> * free the section of the memmap array.
> */
> - if (pg < pgend)
> + if (pg < pgend) {
> + mark_memmap_hole(pg >> PAGE_SHIFT, pgend >> PAGE_SHIFT, false);
> free_bootmem(pg, pgend - pg);
> + }
> }
>
> /*
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 9ed9c45..2ed9728 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -15,6 +15,7 @@
> #include <linux/seqlock.h>
> #include <linux/nodemask.h>
> #include <linux/pageblock-flags.h>
> +#include <linux/mm_types.h>
> #include <generated/bounds.h>
> #include <asm/atomic.h>
> #include <asm/page.h>
> @@ -1047,11 +1048,29 @@ static inline struct mem_section *__pfn_to_section(unsigned long pfn)
> return __nr_to_section(pfn_to_section_nr(pfn));
> }
>
> +void mark_memmap_hole(unsigned long start, unsigned long end, bool valid);
> +

The naming is confusing with the "valid" parameter.

What's a "valid hole"? I can see that one being a cause of head
scratching in the future :)

> +#ifdef CONFIG_SPARSEMEM_HAS_HOLE

Why not use CONFIG_ARCH_HAS_HOLES_MEMORYMODEL ?

> +static inline int page_valid(struct mem_section *ms, unsigned long pfn)
> +{
> + struct page *page = pfn_to_page(pfn);
> + struct page *__pg = virt_to_page(page);
> + return __pg->private == (unsigned long)ms;
> +}
> +#else
> +static inline int page_valid(struct mem_section *ms, unsigned long pfn)
> +{
> + return 1;
> +}
> +#endif
> +
> static inline int pfn_valid(unsigned long pfn)
> {
> + struct mem_section *ms;
> if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
> return 0;
> - return valid_section(__nr_to_section(pfn_to_section_nr(pfn)));
> + ms = __nr_to_section(pfn_to_section_nr(pfn));
> + return valid_section(ms) && page_valid(ms, pfn);
> }

So it appears here that we unconditionally check page_valid(), but we know
which sections had holes in them at the time we called mark_memmap_hole(). Can
the sections with holes be tagged so that only some sections need to call
page_valid()? As it is, ARM will be taking a performance hit just in case
the section has holes, but it should only need to take that hit in the
corner case where a section is not fully populated.
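
Something along these lines, as a rough sketch; SECTION_HAS_HOLES is a
hypothetical spare bit in section_mem_map, and finding one is the hard
part:

	#define SECTION_HAS_HOLES	(1UL<<2)	/* hypothetical */

	static inline int pfn_valid(unsigned long pfn)
	{
		struct mem_section *ms;

		if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
			return 0;
		ms = __nr_to_section(pfn_to_section_nr(pfn));
		if (!valid_section(ms))
			return 0;
		/* only partially populated sections probe the memmap */
		if (ms->section_mem_map & SECTION_HAS_HOLES)
			return page_valid(ms, pfn);
		return 1;
	}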

>
> static inline int pfn_present(unsigned long pfn)
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 527136b..959ac1d 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -128,6 +128,11 @@ config SPARSEMEM_VMEMMAP
> pfn_to_page and page_to_pfn operations. This is the most
> efficient option when sufficient kernel resources are available.
>
> +config SPARSEMEM_HAS_HOLE
> + bool "allow holes in sparsemem's memmap"
> + depends on ARM && SPARSEMEM && !SPARSEMEM_VMEMMAP
> + default n
> +
> # eventually, we can have this option just 'select SPARSEMEM'
> config MEMORY_HOTPLUG
> bool "Allow for memory hot-add"
> diff --git a/mm/sparse.c b/mm/sparse.c
> index 95ac219..76d5012 100644
> --- a/mm/sparse.c
> +++ b/mm/sparse.c
> @@ -615,6 +615,47 @@ void __init sparse_init(void)
> free_bootmem(__pa(usemap_map), size);
> }
>
> +#ifdef CONFIG_SPARSEMEM_HAS_HOLE
> +/*
> + * Fill memmap's pg->private with a pointer to mem_section.
> + * pfn_valid() will check this later. (see include/linux/mmzone.h)
> + * Every arch should call
> + * mark_memmap_hole(start, end, true) # for all allocated mem_map
> + * and, after that,
> + * mark_memmap_hole(start, end, false) # for all holes in mem_map.
> + * please see usage in ARM.
> + */
> +void mark_memmap_hole(unsigned long start, unsigned long end, bool valid)
> +{
> + struct mem_section *ms;
> + unsigned long pos, next;
> + struct page *pg;
> + void *memmap, *mapend;
> +
> + for (pos = start; pos < end; pos = next) {
> + next = (pos + PAGES_PER_SECTION) & PAGE_SECTION_MASK;
> + ms = __pfn_to_section(pos);
> + if (!valid_section(ms))
> + continue;
> +
> + for (memmap = (void*)pfn_to_page(pos),
> + /* The last page in section */
> + mapend = pfn_to_page(next-1);
> + memmap < mapend; memmap += PAGE_SIZE) {
> + pg = virt_to_page(memmap);
> + if (valid)
> + pg->private = (unsigned long)ms;
> + else
> + pg->private = 0;
> + }
> + }
> +}
> +#else
> +void mark_memmap_hole(unsigned long start, unsigned long end, bool valid)
> +{
> +}
> +#endif
> +

The patch should also delete memmap_valid_within() and replace it with a
call to pfn_valid_within(). The reason memmap_valid_within() existed was
because sparsemem had holes punched in it but I'd rather not see use of
that function grow.
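
For reference, this is the helper in mm/mmzone.c that could then go away
(quoting from memory, so double-check the exact guards):

	int memmap_valid_within(unsigned long pfn,
				struct page *page, struct zone *zone)
	{
		if (page_to_pfn(page) != pfn)
			return 0;

		if (page_zone(page) != zone)
			return 0;

		return 1;
	}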

> #ifdef CONFIG_MEMORY_HOTPLUG
> #ifdef CONFIG_SPARSEMEM_VMEMMAP
> static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
> --
> 1.7.0.5
>

--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
--
From: Minchan Kim on
On Tue, Jul 20, 2010 at 12:15:58PM +0200, Johannes Weiner wrote:
> Hi,
>
> On Sun, Jul 18, 2010 at 07:18:31PM +0900, Minchan Kim wrote:
> > Kukjin reported an oops that happens when he changes min_free_kbytes:
> > http://www.spinics.net/lists/arm-kernel/msg92894.html
> > It is caused by the memory map layout on sparsemem.
> >
> > The system has the following memory map:
> > section 0              section 1              section 2
> > 0x20000000-0x25000000  0x40000000-0x50000000  0x50000000-0x58000000
> > SECTION_SIZE_BITS 28 (256M)
> >
> > It means section 0 is an incompletely filled section.
> > Nonetheless, the current pfn_valid of sparsemem checks pfn loosely:
> > it checks only the mem_section's validity, but ARM can free the mem_map
> > over a hole to save memory. So in the above case, a pfn at 0x25000000 can
> > pass pfn_valid's check. That's not what we want.
> >
> > We could match the section size to the smallest valid size (e.g., 16M in
> > the above case), but Russell doesn't like that due to mem_section's memory
> > overhead with other configurations (e.g., a 512K section).
> >
> > I tried adding a valid pfn range to mem_section, but nobody liked it
> > due to the size overhead. This patch was suggested by KAMEZAWA-san.
> > I just fixed a compile error and changed some naming.
>
> I did not like it, because it messes up the whole concept of a
> section.

Yes, but ARM has already broken that concept, so we can't ignore it.

>
> But most importantly, we already have a crutch for ARM in place,
> namely memmap_valid_within(). Looking at Kukjin's bug report,
> wouldn't it be enough to use that check in
> setup_zone_migrate_reserve()?

I did that, but I don't think it's a fundamental solution. It would
establish a bad new rule that every whole-pfn walker has to call
memmap_valid_within().

I just ran "grep -nRH 'start_pfn' mm/". If we add the check to
setup_zone_migrate_reserve(), then look at kmemleak (kmemleak_scan),
compaction (isolate_migratepages) and so on; I am not sure how many
there are. I suspect they have the same problem as
setup_zone_migrate_reserve(). Should we add memmap_valid_within()
to every whole-pfn walker?
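
For illustration, this is the pattern every such walker would have to
adopt (a rough sketch modelled on the existing caller in mm/vmstat.c):

	unsigned long pfn, end_pfn = zone->zone_start_pfn + zone->spanned_pages;
	struct page *page;

	for (pfn = zone->zone_start_pfn; pfn < end_pfn; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		/* the crutch: pfn_valid() alone is not enough on ARM */
		if (!memmap_valid_within(pfn, page, zone))
			continue;
		/* ... walker body ... */
	}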

>
> Your approach makes every pfn_valid() more expensive, although the
> extensive checks are not needed everywhere (check the comment
> above memmap_valid_within): vm_normal_page() for example can probably
> assume that a PTE won't point to a hole within the memory map.

I agree. But I think that's the trade-off for an architecture with memmap
holes: it wants a model that doesn't fit sparsemem's design, so that's a
cost it has to pay. None of us mm folks want such a model, but ARM has
been using it, so we can't ignore it. I want to support those
architectures without affecting the ones whose memmap has no holes.
On that point, this patch adds no overhead on non-hole architectures, and
it doesn't introduce a new rule, as I said above. Also, developers of
hole architectures no longer need to override pfn_valid to detect hole
pfn ranges.

>
> OTOH, if the ARM people do not care, we could probably go with your
> approach, encode it all into pfn_valid(), and also get rid of
> memmap_valid_within() completely. But I would prefer doing a bugfix
> first and such a conceptual change in a different patch, would you
> agree?

Hmm, I am not sure. Can the memmap_valid_within problem happen only with
sparsemem? AFAIR, it can also happen with a punched hole (not a hole at
either end) on FLATMEM. If that's right, maybe we should expand this
patch to CONFIG_ARCH_HAS_HOLES_MEMORYMODEL rather than CONFIG_SPARSEMEM.

And note that this patch only checks the validity of the memmap, not of
the pages themselves. If one page of the memmap holds a mix of valid and
invalid struct page descriptors, it can't tell them apart, so pfn walkers
need to check PageReserved in addition to pfn_valid. As I reviewed in the
examples above, some of them don't, but that's another thing we have to
fix.
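
Roughly (sizes are approximate): one page of memmap covers
PAGE_SIZE / sizeof(struct page) pfns, e.g. 4096 / 32 = 128 pfns on 32-bit
ARM, so a bank boundary inside that window leaves trailing descriptors
that still pass pfn_valid() and need a second-level check:

	if (pfn_valid(pfn)) {
		struct page *page = pfn_to_page(pfn);

		/* descriptor exists, but the memory behind it may not */
		if (PageReserved(page))
			continue;
		/* ... safe to touch the page ... */
	}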

Thanks for careful review, Hannes. :)

--
Kind regards,
Minchan Kim
--
From: Minchan Kim on
On Tue, Jul 20, 2010 at 04:28:09PM +0100, Mel Gorman wrote:
> On Sun, Jul 18, 2010 at 07:18:31PM +0900, Minchan Kim wrote:
> > Kukjin reported an oops that happens when he changes min_free_kbytes:
> > http://www.spinics.net/lists/arm-kernel/msg92894.html
> > It is caused by the memory map layout on sparsemem.
> >
>
> First off, thanks for working on this.
>
> > The system has the following memory map:
> > section 0              section 1              section 2
> > 0x20000000-0x25000000  0x40000000-0x50000000  0x50000000-0x58000000
> > SECTION_SIZE_BITS 28 (256M)
> >
> > It means section 0 is an incompletely filled section.
> > Nonetheless, the current pfn_valid of sparsemem checks pfn loosely:
> > it checks only the mem_section's validity, but ARM can free the mem_map
> > over a hole to save memory. So in the above case, a pfn at 0x25000000 can
> > pass pfn_valid's check. That's not what we want.
> >
> > We could match the section size to the smallest valid size (e.g., 16M in
> > the above case), but Russell doesn't like that due to mem_section's memory
> > overhead with other configurations (e.g., a 512K section).
> >
> > I tried adding a valid pfn range to mem_section, but nobody liked it
> > due to the size overhead.
>
> Also IIRC, it was vulnerable to a hole being punched in the middle of a
> section.
>
> > This patch was suggested by KAMEZAWA-san.
> > I just fixed a compile error and changed some naming.
> >
> > This patch stores the address of the mem_section in the pg->private field
> > of the page struct backing the section's memmap: a match means the page is
> > used for the section's memmap; otherwise, the page is used for another
> > purpose and the memmap has a hole there.
> >
> > This patch is based on mmotm-2010-07-01-12-19.
> >
> > Signed-off-by: Minchan Kim <minchan.kim(a)gmail.com>
> > Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu(a)jp.fujitsu.com>
> > Reported-by: Kukjin Kim <kgene.kim(a)samsung.com>
> > ---
> > arch/arm/mm/init.c | 9 ++++++++-
> > include/linux/mmzone.h | 21 ++++++++++++++++++++-
> > mm/Kconfig | 5 +++++
> > mm/sparse.c | 41 +++++++++++++++++++++++++++++++++++++++++
> > 4 files changed, 74 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
> > index cfe4c5e..4586f40 100644
> > --- a/arch/arm/mm/init.c
> > +++ b/arch/arm/mm/init.c
> > @@ -234,6 +234,11 @@ static void __init arm_bootmem_free(struct meminfo *mi)
> > arch_adjust_zones(zone_size, zhole_size);
> >
> > free_area_init_node(0, zone_size, min, zhole_size);
> > +
> > + for_each_bank(i, mi) {
> > + mark_memmap_hole(bank_pfn_start(&mi->bank[i]),
> > + bank_pfn_end(&mi->bank[i]), true);
> > + }
> > }
>
> Why do we need to mark banks both valid and invalid? Is it not enough to
> just mark the holes in free_memmap() and assume it is valid otherwise?
>
Good point.
If we can make sure pg->private is zero, we can fix it.
I will check it.

> >
> > #ifndef CONFIG_SPARSEMEM
> > @@ -386,8 +391,10 @@ free_memmap(unsigned long start_pfn, unsigned long end_pfn)
> > * If there are free pages between these,
> > * free the section of the memmap array.
> > */
> > - if (pg < pgend)
> > + if (pg < pgend) {
> > + mark_memmap_hole(pg >> PAGE_SHIFT, pgend >> PAGE_SHIFT, false);
> > free_bootmem(pg, pgend - pg);
> > + }
> > }
> >
> > /*
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index 9ed9c45..2ed9728 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
> > @@ -15,6 +15,7 @@
> > #include <linux/seqlock.h>
> > #include <linux/nodemask.h>
> > #include <linux/pageblock-flags.h>
> > +#include <linux/mm_types.h>
> > #include <generated/bounds.h>
> > #include <asm/atomic.h>
> > #include <asm/page.h>
> > @@ -1047,11 +1048,29 @@ static inline struct mem_section *__pfn_to_section(unsigned long pfn)
> > return __nr_to_section(pfn_to_section_nr(pfn));
> > }
> >
> > +void mark_memmap_hole(unsigned long start, unsigned long end, bool valid);
> > +
>
> The naming is confusing with the "valid" parameter.
>
> What's a "valid hole"? I can see that one being a cause of head
> scratching in the future :)

Okay. If we call it only from free_memmap, we can rename it along these
lines: mark_invalid_memmap(start, end).
Will change.

>
> > +#ifdef CONFIG_SPARSEMEM_HAS_HOLE
>
> Why not use CONFIG_ARCH_HAS_HOLES_MEMORYMODEL ?

As I mentioned in my previous mail (the reply to Hannes), if the problem
can happen on FLATMEM, then CONFIG_ARCH_HAS_HOLES_MEMORYMODEL is the
right choice. Please confirm this, Mel. :)

>
> > +static inline int page_valid(struct mem_section *ms, unsigned long pfn)
> > +{
> > + struct page *page = pfn_to_page(pfn);
> > + struct page *__pg = virt_to_page(page);
> > + return __pg->private == (unsigned long)ms;
> > +}
> > +#else
> > +static inline int page_valid(struct mem_section *ms, unsigned long pfn)
> > +{
> > + return 1;
> > +}
> > +#endif
> > +
> > static inline int pfn_valid(unsigned long pfn)
> > {
> > + struct mem_section *ms;
> > if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
> > return 0;
> > - return valid_section(__nr_to_section(pfn_to_section_nr(pfn)));
> > + ms = __nr_to_section(pfn_to_section_nr(pfn));
> > + return valid_section(ms) && page_valid(ms, pfn);
> > }
>
> So it appears here that we unconditionally check page_valid(), but we know
> which sections had holes in them at the time we called mark_memmap_hole(). Can
> the sections with holes be tagged so that only some sections need to call
> page_valid()? As it is, ARM will be taking a performance hit just in case
> the section has holes, but it should only need to take that hit in the
> corner case where a section is not fully populated.

In fact, I tried that with SECTION_MAP_LAST_BIT, as you suggested, but got
stuck. The problem is that we can only use 2 bits of section_mem_map, and
both are already used with different meanings.

I have a dumb question: is there any case where a section has
SECTION_MARKED_PRESENT but not SECTION_HAS_MEM_MAP, other than while the
section is being populated? I mean, can't we remove SECTION_MARKED_PRESENT
entirely? If so, we could use that bit for marking a hole section. If not,
it seems we could use a low bit of pageblock_flags, although that's not
great.
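
For reference, a sketch of why only two bits are free: the mem_map pointer
and the flags share one word, encoded (as I read include/linux/mmzone.h) as

	#define SECTION_MARKED_PRESENT	(1UL<<0)
	#define SECTION_HAS_MEM_MAP	(1UL<<1)
	#define SECTION_MAP_LAST_BIT	(1UL<<2)
	#define SECTION_MAP_MASK	(~(SECTION_MAP_LAST_BIT-1))

so a third flag bit would require the encoded mem_map pointer to be
aligned to at least 8 bytes.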

Thanks for careful review, Mel.

>
> >
> > static inline int pfn_present(unsigned long pfn)
> > diff --git a/mm/Kconfig b/mm/Kconfig
> > index 527136b..959ac1d 100644
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -128,6 +128,11 @@ config SPARSEMEM_VMEMMAP
> > pfn_to_page and page_to_pfn operations. This is the most
> > efficient option when sufficient kernel resources are available.
> >
> > +config SPARSEMEM_HAS_HOLE
> > + bool "allow holes in sparsemem's memmap"
> > + depends on ARM && SPARSEMEM && !SPARSEMEM_VMEMMAP
> > + default n
> > +
> > # eventually, we can have this option just 'select SPARSEMEM'
> > config MEMORY_HOTPLUG
> > bool "Allow for memory hot-add"
> > diff --git a/mm/sparse.c b/mm/sparse.c
> > index 95ac219..76d5012 100644
> > --- a/mm/sparse.c
> > +++ b/mm/sparse.c
> > @@ -615,6 +615,47 @@ void __init sparse_init(void)
> > free_bootmem(__pa(usemap_map), size);
> > }
> >
> > +#ifdef CONFIG_SPARSEMEM_HAS_HOLE
> > +/*
> > + * Fill memmap's pg->private with a pointer to mem_section.
> > + * pfn_valid() will check this later. (see include/linux/mmzone.h)
> > + * Every arch should call
> > + * mark_memmap_hole(start, end, true) # for all allocated mem_map
> > + * and, after that,
> > + * mark_memmap_hole(start, end, false) # for all holes in mem_map.
> > + * please see usage in ARM.
> > + */
> > +void mark_memmap_hole(unsigned long start, unsigned long end, bool valid)
> > +{
> > + struct mem_section *ms;
> > + unsigned long pos, next;
> > + struct page *pg;
> > + void *memmap, *mapend;
> > +
> > + for (pos = start; pos < end; pos = next) {
> > + next = (pos + PAGES_PER_SECTION) & PAGE_SECTION_MASK;
> > + ms = __pfn_to_section(pos);
> > + if (!valid_section(ms))
> > + continue;
> > +
> > + for (memmap = (void*)pfn_to_page(pos),
> > + /* The last page in section */
> > + mapend = pfn_to_page(next-1);
> > + memmap < mapend; memmap += PAGE_SIZE) {
> > + pg = virt_to_page(memmap);
> > + if (valid)
> > + pg->private = (unsigned long)ms;
> > + else
> > + pg->private = 0;
> > + }
> > + }
> > +}
> > +#else
> > +void mark_memmap_hole(unsigned long start, unsigned long end, bool valid)
> > +{
> > +}
> > +#endif
> > +
>
> The patch should also delete memmap_valid_within() and replace it with a
> call to pfn_valid_within(). The reason memmap_valid_within() existed was
> because sparsemem had holes punched in it but I'd rather not see use of
> that function grow.
>
> > #ifdef CONFIG_MEMORY_HOTPLUG
> > #ifdef CONFIG_SPARSEMEM_VMEMMAP
> > static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
> > --
> > 1.7.0.5
> >
>
> --
> Mel Gorman
> Part-time Phd Student Linux Technology Center
> University of Limerick IBM Dublin Software Lab

--
Kind regards,
Minchan Kim
--