From: Daisuke Nishimura
On Wed, 3 Mar 2010 23:03:19 +0100, Andrea Righi <arighi(a)develer.com> wrote:
> On Wed, Mar 03, 2010 at 05:21:32PM +0900, KAMEZAWA Hiroyuki wrote:
> > On Wed, 3 Mar 2010 15:15:49 +0900
> > KAMEZAWA Hiroyuki <kamezawa.hiroyu(a)jp.fujitsu.com> wrote:
> >
> > > Agreed.
> > > Let's see how we can write the code in a clean way. (we have time ;)
> > > For now, disabling IRQs while holding lock_page_cgroup() seems like a bit
> > > of overkill to me. What I really want is lockless code... but that seems
> > > impossible under the current implementation.
> > >
> > > I wonder whether the fact that "the page is never uncharged under us" can
> > > give us some chances... Hmm.
> > >
> >
> > How about this? Basically, I don't like duplicating information... so the
> > number of new pcg_flags may be reducible.
> >
> > I'd be glad if this can serve as a hint for Andrea-san.
> >
> > ==
> > ---
> > include/linux/page_cgroup.h | 44 ++++++++++++++++++++-
> > mm/memcontrol.c | 91 +++++++++++++++++++++++++++++++++++++++++++-
> > 2 files changed, 132 insertions(+), 3 deletions(-)
> >
> > Index: mmotm-2.6.33-Mar2/include/linux/page_cgroup.h
> > ===================================================================
> > --- mmotm-2.6.33-Mar2.orig/include/linux/page_cgroup.h
> > +++ mmotm-2.6.33-Mar2/include/linux/page_cgroup.h
> > @@ -39,6 +39,11 @@ enum {
> > PCG_CACHE, /* charged as cache */
> > PCG_USED, /* this object is in use. */
> > PCG_ACCT_LRU, /* page has been accounted for */
> > + PCG_MIGRATE_LOCK, /* used for mutual exclusion of account migration */
> > + PCG_ACCT_DIRTY,
> > + PCG_ACCT_WB,
> > + PCG_ACCT_WB_TEMP,
> > + PCG_ACCT_UNSTABLE,
> > };
> >
> > #define TESTPCGFLAG(uname, lname) \
> > @@ -73,6 +78,23 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU)
> > TESTPCGFLAG(AcctLRU, ACCT_LRU)
> > TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
> >
> > +SETPCGFLAG(AcctDirty, ACCT_DIRTY);
> > +CLEARPCGFLAG(AcctDirty, ACCT_DIRTY);
> > +TESTPCGFLAG(AcctDirty, ACCT_DIRTY);
> > +
> > +SETPCGFLAG(AcctWB, ACCT_WB);
> > +CLEARPCGFLAG(AcctWB, ACCT_WB);
> > +TESTPCGFLAG(AcctWB, ACCT_WB);
> > +
> > +SETPCGFLAG(AcctWBTemp, ACCT_WB_TEMP);
> > +CLEARPCGFLAG(AcctWBTemp, ACCT_WB_TEMP);
> > +TESTPCGFLAG(AcctWBTemp, ACCT_WB_TEMP);
> > +
> > +SETPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE);
> > +CLEARPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE);
> > +TESTPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE);
> > +
> > +
> > static inline int page_cgroup_nid(struct page_cgroup *pc)
> > {
> > return page_to_nid(pc->page);
> > @@ -82,7 +104,9 @@ static inline enum zone_type page_cgroup
> > {
> > return page_zonenum(pc->page);
> > }
> > -
> > +/*
> > + * lock_page_cgroup() should not be held under mapping->tree_lock
> > + */
> > static inline void lock_page_cgroup(struct page_cgroup *pc)
> > {
> > bit_spin_lock(PCG_LOCK, &pc->flags);
> > @@ -93,6 +117,24 @@ static inline void unlock_page_cgroup(st
> > bit_spin_unlock(PCG_LOCK, &pc->flags);
> > }
> >
> > +/*
> > + * Lock order is
> > + * lock_page_cgroup()
> > + * lock_page_cgroup_migrate()
> > + * This lock is not for charge/uncharge but for moving account information,
> > + * i.e. overwriting pc->mem_cgroup. The lock owner should guarantee by itself
> > + * that the page is not uncharged while this lock is held.
> > + */
> > +static inline void lock_page_cgroup_migrate(struct page_cgroup *pc)
> > +{
> > + bit_spin_lock(PCG_MIGRATE_LOCK, &pc->flags);
> > +}
> > +
> > +static inline void unlock_page_cgroup_migrate(struct page_cgroup *pc)
> > +{
> > + bit_spin_unlock(PCG_MIGRATE_LOCK, &pc->flags);
> > +}
> > +
> > #else /* CONFIG_CGROUP_MEM_RES_CTLR */
> > struct page_cgroup;
> >
> > Index: mmotm-2.6.33-Mar2/mm/memcontrol.c
> > ===================================================================
> > --- mmotm-2.6.33-Mar2.orig/mm/memcontrol.c
> > +++ mmotm-2.6.33-Mar2/mm/memcontrol.c
> > @@ -87,6 +87,10 @@ enum mem_cgroup_stat_index {
> > MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
> > MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
> > MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */
> > + MEM_CGROUP_STAT_DIRTY,
> > + MEM_CGROUP_STAT_WBACK,
> > + MEM_CGROUP_STAT_WBACK_TEMP,
> > + MEM_CGROUP_STAT_UNSTABLE_NFS,
> >
> > MEM_CGROUP_STAT_NSTATS,
> > };
> > @@ -1360,6 +1364,86 @@ done:
> > }
> >
> > /*
> > + * Update the file cache's status for a memcg. Before calling this,
> > + * mapping->tree_lock should be held and preemption must be disabled.
> > + * Then, it's guaranteed that the page is not uncharged while we
> > + * access the page_cgroup. We can make use of that.
> > + */
> > +void mem_cgroup_update_stat_locked(struct page *page, int idx, bool set)
> > +{
> > + struct page_cgroup *pc;
> > + struct mem_cgroup *mem;
> > +
> > + pc = lookup_page_cgroup(page);
> > + /* Not accounted ? */
> > + if (!PageCgroupUsed(pc))
> > + return;
> > + lock_page_cgroup_migrate(pc);
> > + /*
> > + * It's guaranteed that this page is never uncharged.
> > + * The only racy problem is moving the accounting among memcgs.
> > + */
> > + switch (idx) {
> > + case MEM_CGROUP_STAT_DIRTY:
> > + if (set)
> > + SetPageCgroupAcctDirty(pc);
> > + else
> > + ClearPageCgroupAcctDirty(pc);
> > + break;
> > + case MEM_CGROUP_STAT_WBACK:
> > + if (set)
> > + SetPageCgroupAcctWB(pc);
> > + else
> > + ClearPageCgroupAcctWB(pc);
> > + break;
> > + case MEM_CGROUP_STAT_WBACK_TEMP:
> > + if (set)
> > + SetPageCgroupAcctWBTemp(pc);
> > + else
> > + ClearPageCgroupAcctWBTemp(pc);
> > + break;
> > + case MEM_CGROUP_STAT_UNSTABLE_NFS:
> > + if (set)
> > + SetPageCgroupAcctUnstableNFS(pc);
> > + else
> > + ClearPageCgroupAcctUnstableNFS(pc);
> > + break;
> > + default:
> > + BUG();
> > + break;
> > + }
> > + mem = pc->mem_cgroup;
> > + if (set)
> > + __this_cpu_inc(mem->stat->count[idx]);
> > + else
> > + __this_cpu_dec(mem->stat->count[idx]);
> > + unlock_page_cgroup_migrate(pc);
> > +}
> > +
> > +static void move_acct_information(struct mem_cgroup *from,
> > + struct mem_cgroup *to,
> > + struct page_cgroup *pc)
> > +{
> > + /* preemption is disabled, the migrate lock is held. */
> > + if (PageCgroupAcctDirty(pc)) {
> > + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_DIRTY]);
> > + __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_DIRTY]);
> > + }
> > + if (PageCgroupAcctWB(pc)) {
> > + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_WBACK]);
> > + __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_WBACK]);
> > + }
> > + if (PageCgroupAcctWBTemp(pc)) {
> > + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_WBACK_TEMP]);
> > + __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_WBACK_TEMP]);
> > + }
> > + if (PageCgroupAcctUnstableNFS(pc)) {
> > + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
> > + __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
> > + }
> > +}
> > +
> > +/*
> > * size of first charge trial. "32" comes from vmscan.c's magic value.
> > * TODO: maybe necessary to use big numbers in big irons.
> > */
> > @@ -1794,15 +1878,16 @@ static void __mem_cgroup_move_account(st
> > VM_BUG_ON(!PageCgroupUsed(pc));
> > VM_BUG_ON(pc->mem_cgroup != from);
> >
> > + preempt_disable();
> > + lock_page_cgroup_migrate(pc);
> > page = pc->page;
> > if (page_mapped(page) && !PageAnon(page)) {
> > /* Update mapped_file data for mem_cgroup */
> > - preempt_disable();
> > __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> > __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> > - preempt_enable();
> > }
> > mem_cgroup_charge_statistics(from, pc, false);
> > + move_acct_information(from, to, pc);
>
> Kame-san, a question. According to is_target_pte_for_mc(), it seems we
> don't move file pages across cgroups for now. If !PageAnon(page) we just
> return 0 and the page won't be selected for migration in
> mem_cgroup_move_charge_pte_range().
>
You're right. Moving file pages at task migration is on my TODO list.

> So, if I've understood well the code is correct in perspective, but
> right now it's unnecessary. File pages are not moved on task migration
> across cgroups and, at the moment, there's no way for file page
> accounted statistics to go negative.
>
> Or am I missing something?
>
__mem_cgroup_move_account() will be called not only at task migration
but also at rmdir(), so I think it would be better to handle file pages anyway.


Thanks,
Daisuke Nishimura.
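
To make the intended usage concrete: a call site would take mapping->tree_lock
(which disables IRQs and hence preemption, and guarantees the page stays
charged) and then flip the per-page flag and the per-memcg counter together.
A minimal sketch of such a caller, using the helpers from the patch above;
the function name and the radix-tree step are illustrative, not part of the
patch:

static void set_page_dirty_acct(struct address_space *mapping,
				struct page *page)
{
	/* tree_lock disables IRQs, so preemption is off as required */
	spin_lock_irq(&mapping->tree_lock);
	/* ... radix-tree dirty tagging would go here ... */
	mem_cgroup_update_stat_locked(page, MEM_CGROUP_STAT_DIRTY, true);
	spin_unlock_irq(&mapping->tree_lock);
}

Clearing works the same way with "false"; mem_cgroup_update_stat_locked()
then takes lock_page_cgroup_migrate() internally, so the per-page flag and
the per-memcg counter cannot be split apart by a concurrent account move.
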
From: KAMEZAWA Hiroyuki
On Wed, 3 Mar 2010 23:03:19 +0100
Andrea Righi <arighi(a)develer.com> wrote:

> On Wed, Mar 03, 2010 at 05:21:32PM +0900, KAMEZAWA Hiroyuki wrote:
> > On Wed, 3 Mar 2010 15:15:49 +0900
> > KAMEZAWA Hiroyuki <kamezawa.hiroyu(a)jp.fujitsu.com> wrote:

> > + preempt_disable();
> > + lock_page_cgroup_migrate(pc);
> > page = pc->page;
> > if (page_mapped(page) && !PageAnon(page)) {
> > /* Update mapped_file data for mem_cgroup */
> > - preempt_disable();
> > __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> > __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
> > - preempt_enable();
> > }
> > mem_cgroup_charge_statistics(from, pc, false);
> > + move_acct_information(from, to, pc);
>
> Kame-san, a question. According to is_target_pte_for_mc(), it seems we
> don't move file pages across cgroups for now.

Yes. It's just in the plan at this point.

> If !PageAnon(page) we just return 0 and the page won't be selected for migration in
> mem_cgroup_move_charge_pte_range().
>
> So, if I've understood correctly, the code is right looking forward, but
> right now it's unnecessary. File pages are not moved across cgroups on
> task migration and, at the moment, there's no way for the file page
> accounting statistics to go negative.
>
> Or am I missing something?
>

At rmdir(), the remaining file caches in a cgroup are moved to
its parent, so all of a cgroup's file caches end up in the parent.

This behavior avoids losing too many file caches when removing a cgroup.

Thanks,
-Kame
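
For reference, the nesting that the move path ends up with under this scheme,
distilled from the quoted __mem_cgroup_move_account() hunk; the
pc->mem_cgroup update and the second charge_statistics() call come from the
surrounding (unquoted) function and are assumptions here:

static void move_account_sketch(struct page_cgroup *pc,
				struct mem_cgroup *from, struct mem_cgroup *to)
{
	lock_page_cgroup(pc);		/* taken by the caller in the real code */
	preempt_disable();
	lock_page_cgroup_migrate(pc);	/* excludes mem_cgroup_update_stat_locked() */

	mem_cgroup_charge_statistics(from, pc, false);
	move_acct_information(from, to, pc);	/* dirty/WB/unstable counters follow */
	pc->mem_cgroup = to;		/* assumed: the actual account move */
	mem_cgroup_charge_statistics(to, pc, true);

	unlock_page_cgroup_migrate(pc);
	preempt_enable();
	unlock_page_cgroup(pc);
}

Without move_acct_information() here, a dirty page reparented at rmdir() would
have incremented the child's dirty counter, while the later clear would
decrement the parent's counter, which was never incremented, so it could go
negative.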


From: Andrea Righi
On Thu, Mar 04, 2010 at 11:18:28AM -0500, Vivek Goyal wrote:
> On Thu, Mar 04, 2010 at 11:40:15AM +0100, Andrea Righi wrote:
>
> [..]
> > diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> > index 5a0f8f3..c5d14ea 100644
> > --- a/mm/page-writeback.c
> > +++ b/mm/page-writeback.c
> > @@ -137,13 +137,16 @@ static struct prop_descriptor vm_dirties;
> > */
> > static int calc_period_shift(void)
> > {
> > + struct dirty_param dirty_param;
> > unsigned long dirty_total;
> >
> > - if (vm_dirty_bytes)
> > - dirty_total = vm_dirty_bytes / PAGE_SIZE;
> > + get_dirty_param(&dirty_param);
> > +
> > + if (dirty_param.dirty_bytes)
> > + dirty_total = dirty_param.dirty_bytes / PAGE_SIZE;
> > else
> > - dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
> > - 100;
> > + dirty_total = (dirty_param.dirty_ratio *
> > + determine_dirtyable_memory()) / 100;
> > return 2 + ilog2(dirty_total - 1);
> > }
> >
> > @@ -408,41 +411,46 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
> > */
> > unsigned long determine_dirtyable_memory(void)
> > {
> > - unsigned long x;
> > -
> > - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
> > + unsigned long memory;
> > + s64 memcg_memory;
> >
> > + memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
> > if (!vm_highmem_is_dirtyable)
> > - x -= highmem_dirtyable_memory(x);
> > -
> > - return x + 1; /* Ensure that we never return 0 */
> > + memory -= highmem_dirtyable_memory(memory);
> > + if (mem_cgroup_has_dirty_limit())
> > + return memory + 1;
>
> Should the above be?
> if (!mem_cgroup_has_dirty_limit())
> return memory + 1;

Very true.

I'll post another patch with this and Kirill's fixes.

Thanks,
-Andrea

>
> Vivek
>
> > + memcg_memory = mem_cgroup_page_stat(MEMCG_NR_DIRTYABLE_PAGES);
> > + return min((unsigned long)memcg_memory, memory + 1);
> > }
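
With the inverted check applied, the function from the quoted patch would read
as below (a sketch against the quoted hunk; mem_cgroup_has_dirty_limit() and
mem_cgroup_page_stat() are the helpers introduced earlier in this series):

unsigned long determine_dirtyable_memory(void)
{
	unsigned long memory;
	s64 memcg_memory;

	memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
	if (!vm_highmem_is_dirtyable)
		memory -= highmem_dirtyable_memory(memory);
	/* No per-memcg dirty limit: fall back to the global value. */
	if (!mem_cgroup_has_dirty_limit())
		return memory + 1;	/* Ensure that we never return 0 */
	memcg_memory = mem_cgroup_page_stat(MEMCG_NR_DIRTYABLE_PAGES);
	return min((unsigned long)memcg_memory, memory + 1);
}

This way a cgroup without its own dirty limit keeps the old global behavior,
and one with a limit is clamped to its own dirtyable pages.
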
From: Vivek Goyal
On Thu, Mar 04, 2010 at 11:40:15AM +0100, Andrea Righi wrote:

[..]
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 5a0f8f3..c5d14ea 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -137,13 +137,16 @@ static struct prop_descriptor vm_dirties;
> */
> static int calc_period_shift(void)
> {
> + struct dirty_param dirty_param;
> unsigned long dirty_total;
>
> - if (vm_dirty_bytes)
> - dirty_total = vm_dirty_bytes / PAGE_SIZE;
> + get_dirty_param(&dirty_param);
> +
> + if (dirty_param.dirty_bytes)
> + dirty_total = dirty_param.dirty_bytes / PAGE_SIZE;
> else
> - dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
> - 100;
> + dirty_total = (dirty_param.dirty_ratio *
> + determine_dirtyable_memory()) / 100;
> return 2 + ilog2(dirty_total - 1);
> }
>
> @@ -408,41 +411,46 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
> */
> unsigned long determine_dirtyable_memory(void)
> {
> - unsigned long x;
> -
> - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
> + unsigned long memory;
> + s64 memcg_memory;
>
> + memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
> if (!vm_highmem_is_dirtyable)
> - x -= highmem_dirtyable_memory(x);
> -
> - return x + 1; /* Ensure that we never return 0 */
> + memory -= highmem_dirtyable_memory(memory);
> + if (mem_cgroup_has_dirty_limit())
> + return memory + 1;

Should the above be?
if (!mem_cgroup_has_dirty_limit())
return memory + 1;

Vivek

> + memcg_memory = mem_cgroup_page_stat(MEMCG_NR_DIRTYABLE_PAGES);
> + return min((unsigned long)memcg_memory, memory + 1);
> }
From: Vivek Goyal
On Thu, Mar 04, 2010 at 11:40:15AM +0100, Andrea Righi wrote:

[..]
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 5a0f8f3..c5d14ea 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -137,13 +137,16 @@ static struct prop_descriptor vm_dirties;
> */
> static int calc_period_shift(void)
> {
> + struct dirty_param dirty_param;
> unsigned long dirty_total;
>
> - if (vm_dirty_bytes)
> - dirty_total = vm_dirty_bytes / PAGE_SIZE;
> + get_dirty_param(&dirty_param);
> +
> + if (dirty_param.dirty_bytes)
> + dirty_total = dirty_param.dirty_bytes / PAGE_SIZE;
> else
> - dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
> - 100;
> + dirty_total = (dirty_param.dirty_ratio *
> + determine_dirtyable_memory()) / 100;
> return 2 + ilog2(dirty_total - 1);
> }
>

Hmm... I have been staring at this for some time and I think something is
wrong. I don't fully understand how the floating proportions work, but this
function seems to be calculating the period over which we need to measure
the proportions (the vm_completions and vm_dirties proportions).

And we update this period (shift) when the admin updates dirty_ratio or
dirty_bytes etc. In that case we recalculate the global dirty limit, take
its log2, and use that as the period over which we monitor and calculate
the proportions.
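
For concreteness, a quick userspace replica of that computation with assumed
sample numbers (a 20% dirty ratio and 1 GiB of dirtyable memory in 4 KiB
pages, neither taken from this thread):

#include <stdio.h>

/* floor(log2(n)) for nonzero n, mirroring the kernel's ilog2() */
static int ilog2_sketch(unsigned long n)
{
	int bit = -1;

	while (n) {
		n >>= 1;
		bit++;
	}
	return bit;
}

int main(void)
{
	unsigned long vm_dirty_ratio = 20;	/* assumed default */
	unsigned long dirtyable = 262144;	/* assumed: 1 GiB / 4 KiB */
	unsigned long dirty_total = vm_dirty_ratio * dirtyable / 100;

	/* prints "period shift = 17" for these inputs */
	printf("period shift = %d\n", 2 + ilog2_sketch(dirty_total - 1));
	return 0;
}

Note the shift depends only on the dirty limit and the dirtyable memory; with
the patch, get_dirty_param() presumably makes that a per-cgroup quantity.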

If so, then it should be global and not per-cgroup (because all our
accounting of bdi completions is global and not per-cgroup).

PeterZ can tell us more about it. I am just raising the flag here to be
sure.

Thanks
Vivek