superblock: introduce per-sb cache shrinker infrastructure [Kernel]

Prev: ath: Failed to stop TX DMA in 100 msec after killing last frame
Next: [GIT PULL] LED updates for 2.6.35

From: Nick Piggin on 26 May 2010 12:50

On Tue, May 25, 2010 at 06:53:06PM +1000, Dave Chinner wrote:
> @@ -456,21 +456,16 @@ static void prune_one_dentry(struct dentry * dentry)
> * which flags are set. This means we don't need to maintain multiple
> * similar copies of this loop.
> */
> -static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
> +static void __shrink_dcache_sb(struct super_block *sb, int count, int flags)
> {
> LIST_HEAD(referenced);
> LIST_HEAD(tmp);
> struct dentry *dentry;
> - int cnt = 0;
>
> BUG_ON(!sb);
> - BUG_ON((flags & DCACHE_REFERENCED) && count == NULL);
> + BUG_ON((flags & DCACHE_REFERENCED) && count == -1);
> spin_lock(&dcache_lock);
> - if (count != NULL)
> - /* called from prune_dcache() and shrink_dcache_parent() */
> - cnt = *count;
> -restart:
> - if (count == NULL)
> + if (count == -1)
> list_splice_init(&sb->s_dentry_lru, &tmp);
> else {
> while (!list_empty(&sb->s_dentry_lru)) {
> @@ -492,13 +487,13 @@ restart:
> } else {
> list_move_tail(&dentry->d_lru, &tmp);
> spin_unlock(&dentry->d_lock);
> - cnt--;
> - if (!cnt)
> + if (--count == 0)
> break;
> }
> cond_resched_lock(&dcache_lock);
> }
> }
> +prune_more:
> while (!list_empty(&tmp)) {
> dentry = list_entry(tmp.prev, struct dentry, d_lru);
> dentry_lru_del_init(dentry);
> @@ -516,88 +511,29 @@ restart:
> /* dentry->d_lock was dropped in prune_one_dentry() */
> cond_resched_lock(&dcache_lock);
> }
> - if (count == NULL && !list_empty(&sb->s_dentry_lru))
> - goto restart;
> - if (count != NULL)
> - *count = cnt;
> + if (count == -1 && !list_empty(&sb->s_dentry_lru)) {
> + list_splice_init(&sb->s_dentry_lru, &tmp);
> + goto prune_more;
> + }

Nitpick but I prefer just the restart label wher it is previously. This
is moving setup for the next iteration into the "error" case.

> +static int prune_super(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
> +{
> + struct super_block *sb;
> + int count;
> +
> + sb = container_of(shrink, struct super_block, s_shrink);
> +
> + /*
> + * Deadlock avoidance. We may hold various FS locks, and we don't want
> + * to recurse into the FS that called us in clear_inode() and friends..
> + */
> + if (!(gfp_mask & __GFP_FS))
> + return -1;
> +
> + /*
> + * if we can't get the umount lock, then there's no point having the
> + * shrinker try again because the sb is being torn down.
> + */
> + if (!down_read_trylock(&sb->s_umount))
> + return -1;

Would you just elaborate on the lock order problem somewhere? (the
comment makes it look like we *could* take the mutex if we wanted
to).

> +
> + if (!sb->s_root) {
> + up_read(&sb->s_umount);
> + return -1;
> + }
> +
> + if (nr_to_scan) {
> + /* proportion the scan between the two cacheѕ */
> + int total;
> +
> + total = sb->s_nr_dentry_unused + sb->s_nr_inodes_unused + 1;
> + count = (nr_to_scan * sb->s_nr_dentry_unused) / total;
> +
> + /* prune dcache first as icache is pinned by it */
> + prune_dcache_sb(sb, count);
> + prune_icache_sb(sb, nr_to_scan - count);
> + }
> +
> + count = ((sb->s_nr_dentry_unused + sb->s_nr_inodes_unused) / 100)
> + * sysctl_vfs_cache_pressure;

Do you think truncating in the divisions is at all a problem? It
probably doesn't matter much I suppose.

> @@ -162,6 +213,7 @@ void deactivate_locked_super(struct super_block *s)
> struct file_system_type *fs = s->s_type;
> if (atomic_dec_and_test(&s->s_active)) {
> vfs_dq_off(s, 0);
> + unregister_shrinker(&s->s_shrink);
> fs->kill_sb(s);
> put_filesystem(fs);
> put_super(s);
> @@ -335,6 +387,7 @@ retry:
> list_add_tail(&s->s_list, &super_blocks);
> list_add(&s->s_instances, &type->fs_supers);
> spin_unlock(&sb_lock);
> + register_shrinker(&s->s_shrink);
> get_filesystem(type);
> return s;
> }
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 7b90c43..5bff2dc 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -382,6 +382,7 @@ struct inodes_stat_t {
> #include <linux/capability.h>
> #include <linux/semaphore.h>
> #include <linux/fiemap.h>
> +#include <linux/mm.h>
>
> #include <asm/atomic.h>
> #include <asm/byteorder.h>
> @@ -1385,8 +1386,14 @@ struct super_block {
> * generic_show_options()
> */
> char *s_options;
> +
> + struct shrinker s_shrink; /* per-sb shrinker handle */
> };
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

From: Dave Chinner on 26 May 2010 19:20

On Thu, May 27, 2010 at 02:41:16AM +1000, Nick Piggin wrote:
> On Tue, May 25, 2010 at 06:53:06PM +1000, Dave Chinner wrote:
> > @@ -456,21 +456,16 @@ static void prune_one_dentry(struct dentry * dentry)
> > * which flags are set. This means we don't need to maintain multiple
> > * similar copies of this loop.
> > */
> > -static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
> > +static void __shrink_dcache_sb(struct super_block *sb, int count, int flags)
> > {
> > LIST_HEAD(referenced);
> > LIST_HEAD(tmp);
> > struct dentry *dentry;
> > - int cnt = 0;
> >
> > BUG_ON(!sb);
> > - BUG_ON((flags & DCACHE_REFERENCED) && count == NULL);
> > + BUG_ON((flags & DCACHE_REFERENCED) && count == -1);
> > spin_lock(&dcache_lock);
> > - if (count != NULL)
> > - /* called from prune_dcache() and shrink_dcache_parent() */
> > - cnt = *count;
> > -restart:
> > - if (count == NULL)
> > + if (count == -1)
> > list_splice_init(&sb->s_dentry_lru, &tmp);
> > else {
> > while (!list_empty(&sb->s_dentry_lru)) {
> > @@ -492,13 +487,13 @@ restart:
> > } else {
> > list_move_tail(&dentry->d_lru, &tmp);
> > spin_unlock(&dentry->d_lock);
> > - cnt--;
> > - if (!cnt)
> > + if (--count == 0)
> > break;
> > }
> > cond_resched_lock(&dcache_lock);
> > }
> > }
> > +prune_more:
> > while (!list_empty(&tmp)) {
> > dentry = list_entry(tmp.prev, struct dentry, d_lru);
> > dentry_lru_del_init(dentry);
> > @@ -516,88 +511,29 @@ restart:
> > /* dentry->d_lock was dropped in prune_one_dentry() */
> > cond_resched_lock(&dcache_lock);
> > }
> > - if (count == NULL && !list_empty(&sb->s_dentry_lru))
> > - goto restart;
> > - if (count != NULL)
> > - *count = cnt;
> > + if (count == -1 && !list_empty(&sb->s_dentry_lru)) {
> > + list_splice_init(&sb->s_dentry_lru, &tmp);
> > + goto prune_more;
> > + }
>
> Nitpick but I prefer just the restart label wher it is previously. This
> is moving setup for the next iteration into the "error" case.

Ok, will fix.

> > +static int prune_super(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
> > +{
> > + struct super_block *sb;
> > + int count;
> > +
> > + sb = container_of(shrink, struct super_block, s_shrink);
> > +
> > + /*
> > + * Deadlock avoidance. We may hold various FS locks, and we don't want
> > + * to recurse into the FS that called us in clear_inode() and friends..
> > + */
> > + if (!(gfp_mask & __GFP_FS))
> > + return -1;
> > +
> > + /*
> > + * if we can't get the umount lock, then there's no point having the
> > + * shrinker try again because the sb is being torn down.
> > + */
> > + if (!down_read_trylock(&sb->s_umount))
> > + return -1;
>
> Would you just elaborate on the lock order problem somewhere? (the
> comment makes it look like we *could* take the mutex if we wanted
> to).

The shrinker is unregistered in deactivate_locked_super() which is
just before ->kill_sb is called. The sb->s_umount lock is held at
this point. hence is the shrinker is operating, we will deadlock if
we try to lock it like this:

unmount: shrinker:
down_read(&shrinker_lock);
down_write(&sb->s_umount)
unregister_shrinker()
down_write(&shrinker_lock)
prune_super()
down_read(&sb->s_umount);
(deadlock)

hence if we can't get the sb->s_umount lock in prune_super(), then
the superblock must be being unmounted and the shrinker should abort
as the ->kill_sb method will clean up everything after the shrinker
is unregistered. Hence the down_read_trylock().

> > + if (!sb->s_root) {
> > + up_read(&sb->s_umount);
> > + return -1;
> > + }
> > +
> > + if (nr_to_scan) {
> > + /* proportion the scan between the two cacheѕ */
> > + int total;
> > +
> > + total = sb->s_nr_dentry_unused + sb->s_nr_inodes_unused + 1;
> > + count = (nr_to_scan * sb->s_nr_dentry_unused) / total;
> > +
> > + /* prune dcache first as icache is pinned by it */
> > + prune_dcache_sb(sb, count);
> > + prune_icache_sb(sb, nr_to_scan - count);
> > + }
> > +
> > + count = ((sb->s_nr_dentry_unused + sb->s_nr_inodes_unused) / 100)
> > + * sysctl_vfs_cache_pressure;
>
> Do you think truncating in the divisions is at all a problem? It
> probably doesn't matter much I suppose.

Same code as currently exists. IIRC, the reasoning is that if we've
got less that 100 objects to reclaim, then we're unlikely to be able
to free up any memory from the caches, anyway.

Cheers,

Dave.
--
Dave Chinner
david(a)fromorbit.com
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

From: Nick Piggin on 26 May 2010 22:20

On Thu, May 27, 2010 at 09:12:14AM +1000, Dave Chinner wrote:
> On Thu, May 27, 2010 at 02:41:16AM +1000, Nick Piggin wrote:
> > On Tue, May 25, 2010 at 06:53:06PM +1000, Dave Chinner wrote:
> > > @@ -456,21 +456,16 @@ static void prune_one_dentry(struct dentry * dentry)
> > > + /*
> > > + * if we can't get the umount lock, then there's no point having the
> > > + * shrinker try again because the sb is being torn down.
> > > + */
> > > + if (!down_read_trylock(&sb->s_umount))
> > > + return -1;
> >
> > Would you just elaborate on the lock order problem somewhere? (the
> > comment makes it look like we *could* take the mutex if we wanted
> > to).
>
> The shrinker is unregistered in deactivate_locked_super() which is
> just before ->kill_sb is called. The sb->s_umount lock is held at
> this point. hence is the shrinker is operating, we will deadlock if
> we try to lock it like this:
>
> unmount: shrinker:
> down_read(&shrinker_lock);
> down_write(&sb->s_umount)
> unregister_shrinker()
> down_write(&shrinker_lock)
> prune_super()
> down_read(&sb->s_umount);
> (deadlock)
>
> hence if we can't get the sb->s_umount lock in prune_super(), then
> the superblock must be being unmounted and the shrinker should abort
> as the ->kill_sb method will clean up everything after the shrinker
> is unregistered. Hence the down_read_trylock().

You added it to the comment in your updated patch, that was the main
thing I wanted. Thanks.

> > > + if (!sb->s_root) {
> > > + up_read(&sb->s_umount);
> > > + return -1;
> > > + }
> > > +
> > > + if (nr_to_scan) {
> > > + /* proportion the scan between the two cacheѕ */
> > > + int total;
> > > +
> > > + total = sb->s_nr_dentry_unused + sb->s_nr_inodes_unused + 1;
> > > + count = (nr_to_scan * sb->s_nr_dentry_unused) / total;
> > > +
> > > + /* prune dcache first as icache is pinned by it */
> > > + prune_dcache_sb(sb, count);
> > > + prune_icache_sb(sb, nr_to_scan - count);
> > > + }
> > > +
> > > + count = ((sb->s_nr_dentry_unused + sb->s_nr_inodes_unused) / 100)
> > > + * sysctl_vfs_cache_pressure;
> >
> > Do you think truncating in the divisions is at all a problem? It
> > probably doesn't matter much I suppose.
>
> Same code as currently exists. IIRC, the reasoning is that if we've
> got less that 100 objects to reclaim, then we're unlikely to be able
> to free up any memory from the caches, anyway.

Yeah, which is why I stop short of saying you should change it in
this patch.

But I think we should ensure things can get reclaimed eventually.
100 objects could be 100 slabs, which could be anything from
half a meg to half a dozen. Multiplied by each of the caches.
Could be significant in small systems.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

From: Dave Chinner on 27 May 2010 00:10

On Thu, May 27, 2010 at 12:19:05PM +1000, Nick Piggin wrote:
> On Thu, May 27, 2010 at 09:12:14AM +1000, Dave Chinner wrote:
> > On Thu, May 27, 2010 at 02:41:16AM +1000, Nick Piggin wrote:
> > > > + count = ((sb->s_nr_dentry_unused + sb->s_nr_inodes_unused) / 100)
> > > > + * sysctl_vfs_cache_pressure;
> > >
> > > Do you think truncating in the divisions is at all a problem? It
> > > probably doesn't matter much I suppose.
> >
> > Same code as currently exists. IIRC, the reasoning is that if we've
> > got less that 100 objects to reclaim, then we're unlikely to be able
> > to free up any memory from the caches, anyway.
>
> Yeah, which is why I stop short of saying you should change it in
> this patch.
>
> But I think we should ensure things can get reclaimed eventually.
> 100 objects could be 100 slabs, which could be anything from
> half a meg to half a dozen. Multiplied by each of the caches.
> Could be significant in small systems.

True, but usually there are busy objects in the dentry and inode
slabs, so it shouldn't be a significant issue. We can probably
address such problems if they can be demonstrated to be an issue in
a separate patch set....

Cheers,

Dave.
--
Dave Chinner
david(a)fromorbit.com
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

From: Al Viro on 27 May 2010 00:10

On Thu, May 27, 2010 at 11:53:35AM +1000, Dave Chinner wrote:
> On Thu, May 27, 2010 at 09:12:14AM +1000, Dave Chinner wrote:
> > On Thu, May 27, 2010 at 02:41:16AM +1000, Nick Piggin wrote:
> ....
> > > Nitpick but I prefer just the restart label wher it is previously. This
> > > is moving setup for the next iteration into the "error" case.
> >
> > Ok, will fix.
> ....
> > > Would you just elaborate on the lock order problem somewhere? (the
> > > comment makes it look like we *could* take the mutex if we wanted
> > > to).
> >
> > The shrinker is unregistered in deactivate_locked_super() which is
> > just before ->kill_sb is called. The sb->s_umount lock is held at
> > this point. hence is the shrinker is operating, we will deadlock if
> > we try to lock it like this:
> >
> > unmount: shrinker:
> > down_read(&shrinker_lock);
> > down_write(&sb->s_umount)
> > unregister_shrinker()
> > down_write(&shrinker_lock)
> > prune_super()
> > down_read(&sb->s_umount);
> > (deadlock)
> >
> > hence if we can't get the sb->s_umount lock in prune_super(), then
> > the superblock must be being unmounted and the shrinker should abort
> > as the ->kill_sb method will clean up everything after the shrinker
> > is unregistered. Hence the down_read_trylock().

Um... Maybe I'm dumb, but what's wrong with doing unregistration from
deactivate_locked_super(), right after the call of ->kill_sb()? At that
point ->s_umount is already dropped, so we won't deadlock at all.
Shrinker rwsem will make sure that all shrinkers-in-progress will run
to completion, so we won't get a superblock freed under prune_super().
I don't particulary mind down_try_read() in prune_super(), but why not
make life obviously safer?

Am I missing something here?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

| Next | Last
Pages: 1 2 3
Prev: ath: Failed to stop TX DMA in 100 msec after killing last frame
Next: [GIT PULL] LED updates for 2.6.35