From: KAMEZAWA Hiroyuki on
On Wed, 3 Mar 2010 11:12:38 +0900
Daisuke Nishimura <nishimura(a)mxp.nes.nec.co.jp> wrote:

> > diff --git a/mm/filemap.c b/mm/filemap.c
> > index fe09e51..f85acae 100644
> > --- a/mm/filemap.c
> > +++ b/mm/filemap.c
> > @@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page)
> > * having removed the page entirely.
> > */
> > if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
> > + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, -1);
> > dec_zone_page_state(page, NR_FILE_DIRTY);
> > dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
> > }
> (snip)
> > @@ -1096,6 +1113,7 @@ int __set_page_dirty_no_writeback(struct page *page)
> > void account_page_dirtied(struct page *page, struct address_space *mapping)
> > {
> > if (mapping_cap_account_dirty(mapping)) {
> > + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, 1);
> > __inc_zone_page_state(page, NR_FILE_DIRTY);
> > __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
> > task_dirty_inc(current);
> As far as I can see, those two functions (at least) call mem_cgroup_update_stat(),
> which acquires the page cgroup lock, under mapping->tree_lock.
> But as I fixed before in commit e767e056, the page cgroup lock must not be acquired
> under mapping->tree_lock.
> Hmm, we should either call mem_cgroup_update_stat() outside mapping->tree_lock,
> or add local_irq_save/restore() around lock/unlock_page_cgroup() to avoid deadlock.
>
Ah, good catch! But hmmmmmm...
This account_page_dirtied() seems to be called with IRQs disabled.
As for __remove_from_page_cache(), I think page_cgroup should have its own DIRTY flag;
then mem_cgroup_uncharge_page() can handle it automatically.
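Roughly, I mean something like this (not tested at all; the flag and the helper are
placeholder names I just made up, only MEM_CGROUP_STAT_FILE_DIRTY is from Andrea's
patch):

	/*
	 * Sketch: remember "this page was accounted as dirty" in page_cgroup,
	 * so that uncharge can drop the per-memcg counter by itself, without
	 * a hook under mapping->tree_lock in __remove_from_page_cache().
	 */
	static void mem_cgroup_uncharge_acct_dirty(struct page_cgroup *pc)
	{
		/* called under lock_page_cgroup(), so preemption is already off */
		if (TestClearPageCgroupAcctDirty(pc))
			__this_cpu_dec(pc->mem_cgroup->stat->count[MEM_CGROUP_STAT_FILE_DIRTY]);
	}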

But there is no guarantee that the following never happens:

	lock_page_cgroup()
	    <=== interrupt
	        -> mapping->tree_lock
	            -> mem_cgroup_update_stat() -> lock_page_cgroup()   /* deadlock */

even if mapping->tree_lock itself is always taken with IRQs disabled.
Then, if we add local_irq_save(), we have to add it to every lock_page_cgroup().

Then, hm... some kind of new trick? Something like this, perhaps
(the following patch is not tested!!):

==
---
include/linux/page_cgroup.h | 14 ++++++++++++++
mm/memcontrol.c | 27 +++++++++++++++++----------
2 files changed, 31 insertions(+), 10 deletions(-)

Index: mmotm-2.6.33-Feb11/include/linux/page_cgroup.h
===================================================================
--- mmotm-2.6.33-Feb11.orig/include/linux/page_cgroup.h
+++ mmotm-2.6.33-Feb11/include/linux/page_cgroup.h
@@ -39,6 +39,7 @@ enum {
PCG_CACHE, /* charged as cache */
PCG_USED, /* this object is in use. */
PCG_ACCT_LRU, /* page has been accounted for */
+ PCG_MIGRATE, /* page cgroup is under memcg account migration */
};

#define TESTPCGFLAG(uname, lname) \
@@ -73,6 +74,8 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU)
TESTPCGFLAG(AcctLRU, ACCT_LRU)
TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)

+TESTPCGFLAG(Migrate, MIGRATE)
+
static inline int page_cgroup_nid(struct page_cgroup *pc)
{
return page_to_nid(pc->page);
@@ -93,6 +96,17 @@ static inline void unlock_page_cgroup(st
bit_spin_unlock(PCG_LOCK, &pc->flags);
}

+static inline unsigned long page_cgroup_migration_lock(struct page_cgroup *pc)
+{
+ unsigned long flags;
+ local_irq_save(flags);
+ bit_spin_lock(PCG_MIGRATE, &pc->flags);
+ return flags;
+}
+static inline void
+page_cgroup_migration_unlock(struct page_cgroup *pc, unsigned long flags)
+{
+ bit_spin_unlock(PCG_MIGRATE, &pc->flags);
+ local_irq_restore(flags);
+}
#else /* CONFIG_CGROUP_MEM_RES_CTLR */
struct page_cgroup;

Index: mmotm-2.6.33-Feb11/mm/memcontrol.c
===================================================================
--- mmotm-2.6.33-Feb11.orig/mm/memcontrol.c
+++ mmotm-2.6.33-Feb11/mm/memcontrol.c
@@ -1321,7 +1321,7 @@ bool mem_cgroup_handle_oom(struct mem_cg
* Currently used to update mapped file statistics, but the routine can be
* generalized to update other statistics as well.
*/
-void mem_cgroup_update_file_mapped(struct page *page, int val)
+void mem_cgroup_update_file_mapped(struct page *page, int val, int locked)
{
struct mem_cgroup *mem;
struct page_cgroup *pc;
@@ -1329,22 +1329,27 @@ void mem_cgroup_update_file_mapped(struc
pc = lookup_page_cgroup(page);
if (unlikely(!pc))
return;
-
- lock_page_cgroup(pc);
+ /*
+ * if locked==1, mapping->tree_lock is held. We don't have to take
+ * care of charge/uncharge. just think about migration.
+ */
+ if (!locked)
+ lock_page_cgroup(pc);
+ else
+ page_cgroup_migration_lock(pc);
mem = pc->mem_cgroup;
- if (!mem)
+ if (!mem || !PageCgroupUsed(pc))
goto done;
-
- if (!PageCgroupUsed(pc))
- goto done;
-
/*
* Preemption is already disabled. We can use __this_cpu_xxx
*/
__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);

done:
- unlock_page_cgroup(pc);
+ if (!locked)
+ unlock_page_cgroup(pc);
+ else
+ page_cgroup_migration_unlock(pc);
}

/*
@@ -1785,7 +1790,8 @@ static void __mem_cgroup_move_account(st
VM_BUG_ON(!PageCgroupLocked(pc));
VM_BUG_ON(!PageCgroupUsed(pc));
VM_BUG_ON(pc->mem_cgroup != from);
-
+
+ page_cgroup_migration_lock(pc);
page = pc->page;
if (page_mapped(page) && !PageAnon(page)) {
/* Update mapped_file data for mem_cgroup */
@@ -1802,6 +1808,7 @@ static void __mem_cgroup_move_account(st
/* caller should have done css_get */
pc->mem_cgroup = to;
mem_cgroup_charge_statistics(to, pc, true);
+ page_cgroup_migration_unlock(pc);
/*
* We charges against "to" which may not have any tasks. Then, "to"
* can be under rmdir(). But in current implementation, caller of




Thanks,
-Kame

From: Daisuke Nishimura on
On Wed, 3 Mar 2010 12:29:06 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu(a)jp.fujitsu.com> wrote:
> On Wed, 3 Mar 2010 11:12:38 +0900
> Daisuke Nishimura <nishimura(a)mxp.nes.nec.co.jp> wrote:
>
> > > diff --git a/mm/filemap.c b/mm/filemap.c
> > > index fe09e51..f85acae 100644
> > > --- a/mm/filemap.c
> > > +++ b/mm/filemap.c
> > > @@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page)
> > > * having removed the page entirely.
> > > */
> > > if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
> > > + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, -1);
> > > dec_zone_page_state(page, NR_FILE_DIRTY);
> > > dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
> > > }
> > (snip)
> > > @@ -1096,6 +1113,7 @@ int __set_page_dirty_no_writeback(struct page *page)
> > > void account_page_dirtied(struct page *page, struct address_space *mapping)
> > > {
> > > if (mapping_cap_account_dirty(mapping)) {
> > > + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, 1);
> > > __inc_zone_page_state(page, NR_FILE_DIRTY);
> > > __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
> > > task_dirty_inc(current);
> > As far as I can see, those two functions (at least) call mem_cgroup_update_stat(),
> > which acquires the page cgroup lock, under mapping->tree_lock.
> > But as I fixed before in commit e767e056, the page cgroup lock must not be acquired
> > under mapping->tree_lock.
> > Hmm, we should either call mem_cgroup_update_stat() outside mapping->tree_lock,
> > or add local_irq_save/restore() around lock/unlock_page_cgroup() to avoid deadlock.
> >
> Ah, good catch! But hmmmmmm...
> This account_page_dirtied() seems to be called with IRQs disabled.
> As for __remove_from_page_cache(), I think page_cgroup should have its own DIRTY flag;
> then mem_cgroup_uncharge_page() can handle it automatically.
>
> But there is no guarantee that the following never happens:
>
> 	lock_page_cgroup()
> 	    <=== interrupt
> 	        -> mapping->tree_lock
> 	            -> mem_cgroup_update_stat() -> lock_page_cgroup()   /* deadlock */
>
> even if mapping->tree_lock itself is always taken with IRQs disabled.
> Then, if we add local_irq_save(), we have to add it to every lock_page_cgroup().
>
> Then, hm... some kind of new trick? Something like this, perhaps
> (the following patch is not tested!!):
>
If we can verify that every caller of mem_cgroup_update_stat() either always holds
or never holds tree_lock, this direction will work fine.
But if we can't, we have to add local_irq_save() to lock_page_cgroup() like below.

===
include/linux/page_cgroup.h | 8 ++++++--
mm/memcontrol.c | 43 +++++++++++++++++++++++++------------------
2 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 30b0813..51da916 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -83,15 +83,19 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
return page_zonenum(pc->page);
}

-static inline void lock_page_cgroup(struct page_cgroup *pc)
+static inline void __lock_page_cgroup(struct page_cgroup *pc)
{
bit_spin_lock(PCG_LOCK, &pc->flags);
}
+#define lock_page_cgroup(pc, flags) \
+ do { local_irq_save(flags); __lock_page_cgroup(pc); } while (0)

-static inline void unlock_page_cgroup(struct page_cgroup *pc)
+static inline void __unlock_page_cgroup(struct page_cgroup *pc)
{
bit_spin_unlock(PCG_LOCK, &pc->flags);
}
+#define unlock_page_cgroup(pc, flags) \
+ do { __unlock_page_cgroup(pc); local_irq_restore(flags); } while (0)

#else /* CONFIG_CGROUP_MEM_RES_CTLR */
struct page_cgroup;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 00ed4b1..40b9be4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1327,12 +1327,13 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
{
struct mem_cgroup *mem;
struct page_cgroup *pc;
+ unsigned long flags;

pc = lookup_page_cgroup(page);
if (unlikely(!pc))
return;

- lock_page_cgroup(pc);
+ lock_page_cgroup(pc, flags);
mem = pc->mem_cgroup;
if (!mem)
goto done;
@@ -1346,7 +1347,7 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);

done:
- unlock_page_cgroup(pc);
+ unlock_page_cgroup(pc, flags);
}

/*
@@ -1680,11 +1681,12 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
struct page_cgroup *pc;
unsigned short id;
swp_entry_t ent;
+ unsigned long flags;

VM_BUG_ON(!PageLocked(page));

pc = lookup_page_cgroup(page);
- lock_page_cgroup(pc);
+ lock_page_cgroup(pc, flags);
if (PageCgroupUsed(pc)) {
mem = pc->mem_cgroup;
if (mem && !css_tryget(&mem->css))
@@ -1698,7 +1700,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
mem = NULL;
rcu_read_unlock();
}
- unlock_page_cgroup(pc);
+ unlock_page_cgroup(pc, flags);
return mem;
}

@@ -1711,13 +1713,15 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
struct page_cgroup *pc,
enum charge_type ctype)
{
+ unsigned long flags;
+
/* try_charge() can return NULL to *memcg, taking care of it. */
if (!mem)
return;

- lock_page_cgroup(pc);
+ lock_page_cgroup(pc, flags);
if (unlikely(PageCgroupUsed(pc))) {
- unlock_page_cgroup(pc);
+ unlock_page_cgroup(pc, flags);
mem_cgroup_cancel_charge(mem);
return;
}
@@ -1747,7 +1751,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,

mem_cgroup_charge_statistics(mem, pc, true);

- unlock_page_cgroup(pc);
+ unlock_page_cgroup(pc, flags);
/*
* "charge_statistics" updated event counter. Then, check it.
* Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
@@ -1817,12 +1821,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
{
int ret = -EINVAL;
- lock_page_cgroup(pc);
+ unsigned long flags;
+ lock_page_cgroup(pc, flags);
if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
__mem_cgroup_move_account(pc, from, to, uncharge);
ret = 0;
}
- unlock_page_cgroup(pc);
+ unlock_page_cgroup(pc, flags);
/*
* check events
*/
@@ -1949,17 +1954,17 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
*/
if (!(gfp_mask & __GFP_WAIT)) {
struct page_cgroup *pc;
-
+ unsigned long flags;

pc = lookup_page_cgroup(page);
if (!pc)
return 0;
- lock_page_cgroup(pc);
+ lock_page_cgroup(pc, flags);
if (PageCgroupUsed(pc)) {
- unlock_page_cgroup(pc);
+ unlock_page_cgroup(pc, flags);
return 0;
}
- unlock_page_cgroup(pc);
+ unlock_page_cgroup(pc, flags);
}

if (unlikely(!mm && !mem))
@@ -2141,6 +2146,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
struct page_cgroup *pc;
struct mem_cgroup *mem = NULL;
struct mem_cgroup_per_zone *mz;
+ unsigned long flags;

if (mem_cgroup_disabled())
return NULL;
@@ -2155,7 +2161,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
if (unlikely(!pc || !PageCgroupUsed(pc)))
return NULL;

- lock_page_cgroup(pc);
+ lock_page_cgroup(pc, flags);

mem = pc->mem_cgroup;

@@ -2194,7 +2200,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
*/

mz = page_cgroup_zoneinfo(pc);
- unlock_page_cgroup(pc);
+ unlock_page_cgroup(pc, flags);

memcg_check_events(mem, page);
/* at swapout, this memcg will be accessed to record to swap */
@@ -2204,7 +2210,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
return mem;

unlock_out:
- unlock_page_cgroup(pc);
+ unlock_page_cgroup(pc, flags);
return NULL;
}

@@ -2392,17 +2398,18 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
struct page_cgroup *pc;
struct mem_cgroup *mem = NULL;
int ret = 0;
+ unsigned long flags;

if (mem_cgroup_disabled())
return 0;

pc = lookup_page_cgroup(page);
- lock_page_cgroup(pc);
+ lock_page_cgroup(pc, flags);
if (PageCgroupUsed(pc)) {
mem = pc->mem_cgroup;
css_get(&mem->css);
}
- unlock_page_cgroup(pc);
+ unlock_page_cgroup(pc, flags);

if (mem) {
ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);


> ==
> ---
> include/linux/page_cgroup.h | 14 ++++++++++++++
> mm/memcontrol.c | 27 +++++++++++++++++----------
> 2 files changed, 31 insertions(+), 10 deletions(-)
>
> Index: mmotm-2.6.33-Feb11/include/linux/page_cgroup.h
> ===================================================================
> --- mmotm-2.6.33-Feb11.orig/include/linux/page_cgroup.h
> +++ mmotm-2.6.33-Feb11/include/linux/page_cgroup.h
> @@ -39,6 +39,7 @@ enum {
> PCG_CACHE, /* charged as cache */
> PCG_USED, /* this object is in use. */
> PCG_ACCT_LRU, /* page has been accounted for */
> + PCG_MIGRATE, /* page cgroup is under memcg account migration */
> };
>
> #define TESTPCGFLAG(uname, lname) \
> @@ -73,6 +74,8 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU)
> TESTPCGFLAG(AcctLRU, ACCT_LRU)
> TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
>
> +TESTPCGFLAG(Migrate, MIGRATE)
> +
> static inline int page_cgroup_nid(struct page_cgroup *pc)
> {
> return page_to_nid(pc->page);
> @@ -93,6 +96,17 @@ static inline void unlock_page_cgroup(st
> bit_spin_unlock(PCG_LOCK, &pc->flags);
> }
>
> +static inline unsigned long page_cgroup_migration_lock(struct page_cgroup *pc)
> +{
> + unsigned long flags;
> + local_irq_save(flags);
> + bit_spin_lock(PCG_MIGRATE, &pc->flags);
> + return flags;
> +}
> +static inline void
> +page_cgroup_migration_unlock(struct page_cgroup *pc, unsigned long flags)
> +{
> + bit_spin_unlock(PCG_MIGRATE, &pc->flags);
> + local_irq_restore(flags);
> +}
> #else /* CONFIG_CGROUP_MEM_RES_CTLR */
> struct page_cgroup;
>
> Index: mmotm-2.6.33-Feb11/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.33-Feb11.orig/mm/memcontrol.c
> +++ mmotm-2.6.33-Feb11/mm/memcontrol.c
> @@ -1321,7 +1321,7 @@ bool mem_cgroup_handle_oom(struct mem_cg
> * Currently used to update mapped file statistics, but the routine can be
> * generalized to update other statistics as well.
> */
> -void mem_cgroup_update_file_mapped(struct page *page, int val)
> +void mem_cgroup_update_file_mapped(struct page *page, int val, int locked)
> {
> struct mem_cgroup *mem;
> struct page_cgroup *pc;
> @@ -1329,22 +1329,27 @@ void mem_cgroup_update_file_mapped(struc
> pc = lookup_page_cgroup(page);
> if (unlikely(!pc))
> return;
> -
> - lock_page_cgroup(pc);
> + /*
> + * if locked==1, mapping->tree_lock is held. We don't have to take
> + * care of charge/uncharge. just think about migration.
> + */
> + if (!locked)
> + lock_page_cgroup(pc);
> + else
> + page_cgroup_migration_lock(pc);
> mem = pc->mem_cgroup;
> - if (!mem)
> + if (!mem || !PageCgroupUsed(pc))
> goto done;
> -
> - if (!PageCgroupUsed(pc))
> - goto done;
> -
> /*
> * Preemption is already disabled. We can use __this_cpu_xxx
> */
> __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
>
> done:
> - unlock_page_cgroup(pc);
> + if (!locked)
> + unlock_page_cgroup(pc);
> + else
> + page_cgroup_migration_unlock(pc);
> }
>
> /*
> @@ -1785,7 +1790,8 @@ static void __mem_cgroup_move_account(st
> VM_BUG_ON(!PageCgroupLocked(pc));
> VM_BUG_ON(!PageCgroupUsed(pc));
> VM_BUG_ON(pc->mem_cgroup != from);
> -
> +
> + page_cgroup_migration_lock(pc);
> page = pc->page;
> if (page_mapped(page) && !PageAnon(page)) {
> /* Update mapped_file data for mem_cgroup */
> @@ -1802,6 +1808,7 @@ static void __mem_cgroup_move_account(st
> /* caller should have done css_get */
> pc->mem_cgroup = to;
> mem_cgroup_charge_statistics(to, pc, true);
> + page_cgroup_migration_unlock(pc);
> /*
> * We charges against "to" which may not have any tasks. Then, "to"
> * can be under rmdir(). But in current implementation, caller of
>
>
>
>
> Thanks,
> -Kame
>
From: KAMEZAWA Hiroyuki on
On Wed, 3 Mar 2010 15:01:37 +0900
Daisuke Nishimura <nishimura(a)mxp.nes.nec.co.jp> wrote:

> On Wed, 3 Mar 2010 12:29:06 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu(a)jp.fujitsu.com> wrote:
> > On Wed, 3 Mar 2010 11:12:38 +0900
> > Daisuke Nishimura <nishimura(a)mxp.nes.nec.co.jp> wrote:
> >
> > > > diff --git a/mm/filemap.c b/mm/filemap.c
> > > > index fe09e51..f85acae 100644
> > > > --- a/mm/filemap.c
> > > > +++ b/mm/filemap.c
> > > > @@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page)
> > > > * having removed the page entirely.
> > > > */
> > > > if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
> > > > + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, -1);
> > > > dec_zone_page_state(page, NR_FILE_DIRTY);
> > > > dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
> > > > }
> > > (snip)
> > > > @@ -1096,6 +1113,7 @@ int __set_page_dirty_no_writeback(struct page *page)
> > > > void account_page_dirtied(struct page *page, struct address_space *mapping)
> > > > {
> > > > if (mapping_cap_account_dirty(mapping)) {
> > > > + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, 1);
> > > > __inc_zone_page_state(page, NR_FILE_DIRTY);
> > > > __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
> > > > task_dirty_inc(current);
> > > As far as I can see, those two functions (at least) call mem_cgroup_update_stat(),
> > > which acquires the page cgroup lock, under mapping->tree_lock.
> > > But as I fixed before in commit e767e056, the page cgroup lock must not be acquired
> > > under mapping->tree_lock.
> > > Hmm, we should either call mem_cgroup_update_stat() outside mapping->tree_lock,
> > > or add local_irq_save/restore() around lock/unlock_page_cgroup() to avoid deadlock.
> > >
> > Ah, good catch! But hmmmmmm...
> > This account_page_dirtied() seems to be called with IRQs disabled.
> > As for __remove_from_page_cache(), I think page_cgroup should have its own DIRTY flag;
> > then mem_cgroup_uncharge_page() can handle it automatically.
> >
> > But there is no guarantee that the following never happens:
> >
> > 	lock_page_cgroup()
> > 	    <=== interrupt
> > 	        -> mapping->tree_lock
> > 	            -> mem_cgroup_update_stat() -> lock_page_cgroup()   /* deadlock */
> >
> > even if mapping->tree_lock itself is always taken with IRQs disabled.
> > Then, if we add local_irq_save(), we have to add it to every lock_page_cgroup().
> >
> > Then, hm... some kind of new trick? Something like this, perhaps
> > (the following patch is not tested!!):
> >
> If we can verify that every caller of mem_cgroup_update_stat() either always holds
> or never holds tree_lock, this direction will work fine.
> But if we can't, we have to add local_irq_save() to lock_page_cgroup() like below.
>

Agreed.
Let's see how we can write the code in a clean way (we have time ;).
For now, disabling IRQs around lock_page_cgroup() seems like a bit of
overkill to me. What I really want is lockless code... but that seems
impossible under the current implementation.

I wonder whether the fact that "the page is never uncharged under us" can give us
some chances... Hmm.

Thanks,
-Kame

From: KAMEZAWA Hiroyuki on
On Wed, 3 Mar 2010 15:15:49 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu(a)jp.fujitsu.com> wrote:

> Agreed.
> Let's see how we can write the code in a clean way (we have time ;).
> For now, disabling IRQs around lock_page_cgroup() seems like a bit of
> overkill to me. What I really want is lockless code... but that seems
> impossible under the current implementation.
>
> I wonder whether the fact that "the page is never uncharged under us" can give us
> some chances... Hmm.
>

How about this? Basically, I don't like duplicating information... so the
number of new pcg_flags may be reducible.

I hope this can be a hint for Andrea-san.

==
---
include/linux/page_cgroup.h | 44 ++++++++++++++++++++-
mm/memcontrol.c | 91 +++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 132 insertions(+), 3 deletions(-)

Index: mmotm-2.6.33-Mar2/include/linux/page_cgroup.h
===================================================================
--- mmotm-2.6.33-Mar2.orig/include/linux/page_cgroup.h
+++ mmotm-2.6.33-Mar2/include/linux/page_cgroup.h
@@ -39,6 +39,11 @@ enum {
PCG_CACHE, /* charged as cache */
PCG_USED, /* this object is in use. */
PCG_ACCT_LRU, /* page has been accounted for */
+ PCG_MIGRATE_LOCK, /* used for mutual exclusion of account migration */
+ PCG_ACCT_DIRTY,
+ PCG_ACCT_WB,
+ PCG_ACCT_WB_TEMP,
+ PCG_ACCT_UNSTABLE,
};

#define TESTPCGFLAG(uname, lname) \
@@ -73,6 +78,23 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU)
TESTPCGFLAG(AcctLRU, ACCT_LRU)
TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)

+SETPCGFLAG(AcctDirty, ACCT_DIRTY);
+CLEARPCGFLAG(AcctDirty, ACCT_DIRTY);
+TESTPCGFLAG(AcctDirty, ACCT_DIRTY);
+
+SETPCGFLAG(AcctWB, ACCT_WB);
+CLEARPCGFLAG(AcctWB, ACCT_WB);
+TESTPCGFLAG(AcctWB, ACCT_WB);
+
+SETPCGFLAG(AcctWBTemp, ACCT_WB_TEMP);
+CLEARPCGFLAG(AcctWBTemp, ACCT_WB_TEMP);
+TESTPCGFLAG(AcctWBTemp, ACCT_WB_TEMP);
+
+SETPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE);
+CLEARPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE);
+TESTPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE);
+
+
static inline int page_cgroup_nid(struct page_cgroup *pc)
{
return page_to_nid(pc->page);
@@ -82,7 +104,9 @@ static inline enum zone_type page_cgroup
{
return page_zonenum(pc->page);
}
-
+/*
+ * lock_page_cgroup() should not be held under mapping->tree_lock
+ */
static inline void lock_page_cgroup(struct page_cgroup *pc)
{
bit_spin_lock(PCG_LOCK, &pc->flags);
@@ -93,6 +117,24 @@ static inline void unlock_page_cgroup(st
bit_spin_unlock(PCG_LOCK, &pc->flags);
}

+/*
+ * Lock order is
+ * lock_page_cgroup()
+ * lock_page_cgroup_migrate()
+ * This lock is not for charge/uncharge but for account moving, i.e. for
+ * overwriting pc->mem_cgroup. The lock owner must guarantee by itself that
+ * the page is not uncharged while it holds this lock.
+ */
+static inline void lock_page_cgroup_migrate(struct page_cgroup *pc)
+{
+ bit_spin_lock(PCG_MIGRATE_LOCK, &pc->flags);
+}
+
+static inline void unlock_page_cgroup_migrate(struct page_cgroup *pc)
+{
+ bit_spin_unlock(PCG_MIGRATE_LOCK, &pc->flags);
+}
+
#else /* CONFIG_CGROUP_MEM_RES_CTLR */
struct page_cgroup;

Index: mmotm-2.6.33-Mar2/mm/memcontrol.c
===================================================================
--- mmotm-2.6.33-Mar2.orig/mm/memcontrol.c
+++ mmotm-2.6.33-Mar2/mm/memcontrol.c
@@ -87,6 +87,10 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */
+ MEM_CGROUP_STAT_DIRTY,
+ MEM_CGROUP_STAT_WBACK,
+ MEM_CGROUP_STAT_WBACK_TEMP,
+ MEM_CGROUP_STAT_UNSTABLE_NFS,

MEM_CGROUP_STAT_NSTATS,
};
@@ -1360,6 +1364,86 @@ done:
}

/*
+ * Update file cache status for memcg. Before calling this,
+ * mapping->tree_lock must be held and preemption must be disabled.
+ * Then, it is guaranteed that the page is not uncharged while we
+ * access page_cgroup; we can make use of that.
+ */
+void mem_cgroup_update_stat_locked(struct page *page, int idx, bool set)
+{
+ struct page_cgroup *pc;
+ struct mem_cgroup *mem;
+
+ pc = lookup_page_cgroup(page);
+ /* Not accounted ? */
+ if (!PageCgroupUsed(pc))
+ return;
+ lock_page_cgroup_migrate(pc);
+ /*
+ * It's guaranteed that this page is never uncharged under us.
+ * The only racy problem is account moving among memcgs.
+ */
+ switch (idx) {
+ case MEM_CGROUP_STAT_DIRTY:
+ if (set)
+ SetPageCgroupAcctDirty(pc);
+ else
+ ClearPageCgroupAcctDirty(pc);
+ break;
+ case MEM_CGROUP_STAT_WBACK:
+ if (set)
+ SetPageCgroupAcctWB(pc);
+ else
+ ClearPageCgroupAcctWB(pc);
+ break;
+ case MEM_CGROUP_STAT_WBACK_TEMP:
+ if (set)
+ SetPageCgroupAcctWBTemp(pc);
+ else
+ ClearPageCgroupAcctWBTemp(pc);
+ break;
+ case MEM_CGROUP_STAT_UNSTABLE_NFS:
+ if (set)
+ SetPageCgroupAcctUnstableNFS(pc);
+ else
+ ClearPageCgroupAcctUnstableNFS(pc);
+ break;
+ default:
+ BUG();
+ break;
+ }
+ mem = pc->mem_cgroup;
+ if (set)
+ __this_cpu_inc(mem->stat->count[idx]);
+ else
+ __this_cpu_dec(mem->stat->count[idx]);
+ unlock_page_cgroup_migrate(pc);
+}
+
+static void move_acct_information(struct mem_cgroup *from,
+ struct mem_cgroup *to,
+ struct page_cgroup *pc)
+{
+ /* preemption is disabled, migration_lock is held. */
+ if (PageCgroupAcctDirty(pc)) {
+ __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_DIRTY]);
+ __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_DIRTY]);
+ }
+ if (PageCgroupAcctWB(pc)) {
+ __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_WBACK]);
+ __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_WBACK]);
+ }
+ if (PageCgroupAcctWBTemp(pc)) {
+ __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_WBACK_TEMP]);
+ __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_WBACK_TEMP]);
+ }
+ if (PageCgroupAcctUnstableNFS(pc)) {
+ __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
+ __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_UNSTABLE_NFS]);
+ }
+}
+
+/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
* TODO: maybe necessary to use big numbers in big irons.
*/
@@ -1794,15 +1878,16 @@ static void __mem_cgroup_move_account(st
VM_BUG_ON(!PageCgroupUsed(pc));
VM_BUG_ON(pc->mem_cgroup != from);

+ preempt_disable();
+ lock_page_cgroup_migrate(pc);
page = pc->page;
if (page_mapped(page) && !PageAnon(page)) {
/* Update mapped_file data for mem_cgroup */
- preempt_disable();
__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
- preempt_enable();
}
mem_cgroup_charge_statistics(from, pc, false);
+ move_acct_information(from, to, pc);
if (uncharge)
/* This is not "cancel", but cancel_charge does all we need. */
mem_cgroup_cancel_charge(from);
@@ -1810,6 +1895,8 @@ static void __mem_cgroup_move_account(st
/* caller should have done css_get */
pc->mem_cgroup = to;
mem_cgroup_charge_statistics(to, pc, true);
+ unlock_page_cgroup_migrate(pc);
+ preempt_enable();
/*
* We charges against "to" which may not have any tasks. Then, "to"
* can be under rmdir(). But in current implementation, caller of

From: Peter Zijlstra on
On Tue, 2010-03-02 at 23:14 +0100, Andrea Righi wrote:
>
> I agree mem_cgroup_has_dirty_limit() is nicer. But we must do that under
> RCU, so something like:
>
> rcu_read_lock();
> if (mem_cgroup_has_dirty_limit())
> mem_cgroup_get_page_stat()
> else
> global_page_state()
> rcu_read_unlock();
>
> That is bad when mem_cgroup_has_dirty_limit() always returns false
> (e.g., when memory cgroups are disabled). So I fall back to the old
> interface.

Why is it that mem_cgroup_has_dirty_limit() needs RCU when
mem_cgroup_get_page_stat() doesn't? That is, simply make
mem_cgroup_has_dirty_limit() not require RCU, just as
*_get_page_stat() doesn't.
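
I.e., something along these lines (just a sketch; it assumes
mem_cgroup_has_dirty_limit() can be implemented without needing rcu_read_lock(),
and the memcg stat item name here is made up for illustration):

	static unsigned long nr_dirty_pages(void)
	{
		/* no rcu_read_lock() around either call in this scheme */
		if (mem_cgroup_has_dirty_limit())
			return mem_cgroup_get_page_stat(MEMCG_NR_FILE_DIRTY);
		return global_page_state(NR_FILE_DIRTY);
	}
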

> What do you think about:
>
> mem_cgroup_lock();
> if (mem_cgroup_has_dirty_limit())
> mem_cgroup_get_page_stat()
> else
> global_page_state()
> mem_cgroup_unlock();
>
> Where mem_cgroup_read_lock/unlock() simply expand to nothing when
> memory cgroups are disabled.

I think you're engineering the wrong way around.

> >
> > That allows for a 0 dirty limit (which should work and basically makes
> > all io synchronous).
>
> IMHO it is better to reserve 0 for the special value "disabled" like the
> global settings. Synchronous IO can also be achieved using a dirty
> limit of 1.

Why?! 0 clearly states no writeback cache, IOW sync writes; a 1
byte/page writeback cache effectively reduces to the same thing, but it's
not the same thing conceptually. If you want to put the size and enable
into a single variable, pick -1 for disable or so.
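
Something along these lines, I mean (sketch only; the dirty_limit field and the
explicit memcg argument are made up for illustration):

	/* dirty_limit: -1 == no per-memcg limit, 0 == no writeback cache (sync writes) */
	static bool memcg_dirty_limit_enabled(struct mem_cgroup *memcg)
	{
		return memcg && memcg->dirty_limit >= 0;
	}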


