|
Prev: [GIT PULL] please pull infiniband.git
Next: [patch 2.6.26-rc6-mm] gpio: mcp23s08 handles multiple chips per chipselect
From: Andrea Righi on 18 Jun 2008 18:40 Carl Henrik Lunde wrote: > On Sat, Jun 07, 2008 at 12:27:29AM +0200, Andrea Righi wrote: >> This is the core io-throttle kernel infrastructure. It creates the basic >> interfaces to cgroups and implements the I/O measurement and throttling >> functions. > [...] >> +void cgroup_io_account(struct block_device *bdev, size_t bytes) > [...] >> + /* Account the I/O activity */ >> + node->req += bytes; >> + >> + /* Evaluate if we need to throttle the current process */ >> + delta = (long)jiffies - (long)node->last_request; >> + if (!delta) >> + goto out; >> + >> + t = msecs_to_jiffies(node->req / node->iorate); >> + if (!t) >> + goto out; >> + >> + sleep = t - delta; >> + if (unlikely(sleep > 0)) { >> + spin_unlock_irq(&iot->lock); >> + if (__cant_sleep()) >> + return; >> + pr_debug("io-throttle: task %p (%s) must sleep %lu jiffies\n", >> + current, current->comm, sleep); >> + schedule_timeout_killable(sleep); >> + return; >> + } >> + >> + /* Reset I/O accounting */ >> + node->req = 0; >> + node->last_request = jiffies; > [...] > > Did you consider using token bucket instead of this (leaky bucket?)? > > I've attached a patch which implements token bucket. Although not as > precise as the leaky bucket the performance is better at high bandwidth > streaming loads. Interesting! it could be great to have both available at runtime and just switch between leaky or token bucket, e.g. by echo-ing "leaky" or "token" to a file in the cgroup filesystem, ummm, block.limiting-algorithm? > > The leaky bucket stops at around 53 MB/s while token bucket works for > up to 64 MB/s. The baseline (no cgroups) is 66 MB/s. > > benchmark: > two streaming readers (fio) with block size 128k, bucket size 4 MB > 90% of the bandwidth was allocated to one process, the other gets 10% Thanks for posting the results, I'll look closely at your patch, test it as well and try merge your work. I also did some improvements in v2 in terms of scalability, in particular I've replaced the rbtree with a liked list, in order to remove the spinlocks and replace them by RCU to protect the list structure. I need to do some stress tests before, but I'll post a v3 soon. Some minor comments below for now. > diff --git a/block/blk-io-throttle.c b/block/blk-io-throttle.c > index 804df88..9ed0c7c 100644 > --- a/block/blk-io-throttle.c > +++ b/block/blk-io-throttle.c > @@ -40,7 +40,8 @@ struct iothrottle_node { > struct rb_node node; > dev_t dev; > unsigned long iorate; > - unsigned long req; > + long bucket_size; /* Max value for t */ > + long t; > unsigned long last_request; > }; > > @@ -180,18 +181,20 @@ static ssize_t iothrottle_read(struct cgroup *cont, > iothrottle_for_each(n, &iot->tree) { > struct iothrottle_node *node = > rb_entry(n, struct iothrottle_node, node); > - unsigned long delta = (long)jiffies - (long)node->last_request; > + unsigned long delta = (((long)jiffies - (long)node->last_request) * 1000) / HZ; Better to use jiffies_to_msecs() here. > > BUG_ON(!node->dev); > s += snprintf(s, nbytes - (s - buffer), > "=== device (%u,%u) ===\n" > "bandwidth-max: %lu KiB/sec\n" > - " requested: %lu bytes\n" > - " last request: %lu jiffies\n" > - " delta: %lu jiffies\n", > + "bucket size : %ld KiB\n" > + "bucket fill : %ld KiB (after last request)\n" > + "last request : %lu ms ago\n", > MAJOR(node->dev), MINOR(node->dev), > - node->iorate, node->req, > - node->last_request, delta); > + node->iorate, > + node->bucket_size / 1024, > + node->t / 1024, > + delta); > } > spin_unlock_irq(&iot->lock); > buffer[nbytes] = '\0'; > @@ -220,21 +223,33 @@ static inline dev_t devname2dev_t(const char *buf) > return ret; > } > > -static inline int iothrottle_parse_args(char *buf, size_t nbytes, > - dev_t *dev, unsigned long *val) > +static inline int iothrottle_parse_args(char *buf, size_t nbytes, dev_t *dev, > + unsigned long *iorate, > + unsigned long *bucket_size) > { > - char *p; > + char *ioratep, *bucket_sizep; > > - p = memchr(buf, ':', nbytes); > - if (!p) > + ioratep = memchr(buf, ':', nbytes); > + if (!ioratep) > return -EINVAL; > - *p++ = '\0'; > + *ioratep++ = '\0'; > + > + bucket_sizep = memchr(ioratep, ':', nbytes + ioratep - buf); > + if (!bucket_sizep) > + return -EINVAL; > + *bucket_sizep++ = '\0'; > > *dev = devname2dev_t(buf); > if (!*dev) > return -ENOTBLK; > > - return strict_strtoul(p, 10, val); > + if (strict_strtoul(ioratep, 10, iorate)) > + return -EINVAL; > + > + if (strict_strtoul(bucket_sizep, 10, bucket_size)) > + return -EINVAL; > + > + return 0; > } > > static ssize_t iothrottle_write(struct cgroup *cont, > @@ -247,7 +262,7 @@ static ssize_t iothrottle_write(struct cgroup *cont, > struct iothrottle_node *node, *tmpn = NULL; > char *buffer, *tmpp; > dev_t dev; > - unsigned long val; > + unsigned long iorate, bucket_size; > int ret; > > if (unlikely(!nbytes)) > @@ -265,7 +280,7 @@ static ssize_t iothrottle_write(struct cgroup *cont, > buffer[nbytes] = '\0'; > tmpp = strstrip(buffer); > > - ret = iothrottle_parse_args(tmpp, nbytes, &dev, &val); > + ret = iothrottle_parse_args(tmpp, nbytes, &dev, &iorate, &bucket_size); > if (ret) > goto out1; > > @@ -284,7 +299,7 @@ static ssize_t iothrottle_write(struct cgroup *cont, > iot = cgroup_to_iothrottle(cont); > > spin_lock_irq(&iot->lock); > - if (!val) { > + if (!iorate) { > /* Delete a block device limiting rule */ > iothrottle_delete_node(iot, dev); > ret = nbytes; > @@ -293,8 +308,9 @@ static ssize_t iothrottle_write(struct cgroup *cont, > node = iothrottle_search_node(iot, dev); > if (node) { > /* Update a block device limiting rule */ > - node->iorate = val; > - node->req = 0; > + node->iorate = iorate; > + node->bucket_size = bucket_size * 1024; > + node->t = 0; > node->last_request = jiffies; > ret = nbytes; > goto out3; > @@ -307,8 +323,9 @@ static ssize_t iothrottle_write(struct cgroup *cont, > node = tmpn; > tmpn = NULL; > > - node->iorate = val; > - node->req = 0; > + node->iorate = iorate; > + node->bucket_size = bucket_size * 1024; > + node->t = 0; > node->last_request = jiffies; > node->dev = dev; > ret = iothrottle_insert_node(iot, node); > @@ -355,7 +372,7 @@ void cgroup_io_account(struct block_device *bdev, size_t bytes) > { > struct iothrottle *iot; > struct iothrottle_node *node; > - unsigned long delta, t; > + unsigned long delta; > long sleep; > > if (unlikely(!bdev)) > @@ -370,36 +387,37 @@ void cgroup_io_account(struct block_device *bdev, size_t bytes) > spin_lock_irq(&iot->lock); > > node = iothrottle_search_node(iot, bdev->bd_inode->i_rdev); > - if (!node || !node->iorate) > - goto out; > - > - /* Account the I/O activity */ > - node->req += bytes; > + if (!node || !node->iorate) { > + spin_unlock_irq(&iot->lock); > + return; > + } > > - /* Evaluate if we need to throttle the current process */ > + /* Add tokens for time elapsed since last read */ > delta = (long)jiffies - (long)node->last_request; > - if (!delta) > - goto out; > + if (delta) { > + node->last_request = jiffies; > + node->t += (node->iorate * 1024 * delta) / HZ; The same here: node->t += node->iorate * 1024 * jiffies_to_msec(delta) * MSEC_PER_SEC; > > - t = msecs_to_jiffies(node->req / node->iorate); > - if (!t) > - goto out; > + if (node->t > node->bucket_size) > + node->t = node->bucket_size; > + } > > - sleep = t - delta; > - if (unlikely(sleep > 0)) { > - spin_unlock_irq(&iot->lock); > - if (__cant_sleep()) > - return; > - pr_debug("io-throttle: task %p (%s) must sleep %lu jiffies\n", > - current, current->comm, sleep); > - schedule_timeout_killable(sleep); > - return; > + /* Account the I/O activity */ > + node->t -= bytes; > + > + if (node->t < 0) { > + sleep = (-node->t) * HZ / (node->iorate * 1024); And again: sleep = msec_to_jiffies(-node->t / (node->iorate * 1024) * MSEC_PER_SEC); > + } else { > + sleep = 0; > } > > - /* Reset I/O accounting */ > - node->req = 0; > - node->last_request = jiffies; > -out: > spin_unlock_irq(&iot->lock); > + > + if (sleep && !__cant_sleep()) { > + pr_debug("io-throttle: %s[%d] must sleep %ld jiffies\n", > + current->comm, current->pid, sleep); > + > + schedule_timeout_killable(sleep); > + } > } > EXPORT_SYMBOL(cgroup_io_account); Thanks, -Andrea -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: Eric Rannaud on 22 Jun 2008 15:50 On Tue, 17 Jun 2008, Andrea Righi wrote: > > With this bandwidth controller, a cpu-intensive job which otherwise does > > not care about its IO > > performance needs to be pin-point accurate about IO bandwidth required in > > order to not suffer > > from cpu-throttling. IMHO, if a cgroup is exceeding its limit for a given > > resource, the throttling > > should be done _only_ for that resource. > > I understand your point of view. It would be nice if we could just > "disable" the i/o for a cgroup that exceeds its limit, instead of > scheduling some sleep()s, so the tasks running in this cgroup would be > able to continue their non-i/o operations as usual. > > However, how to do if the tasks continue to perform i/o ops under this > condition? we could just cache the i/o in memory and at the same time > reduce the i/o priority of those tasks' requests, but this would require > a lot of memory, more space in the page cache, and probably could lead > to potential OOM conditions. A safer approach IMHO is to force the tasks > to wait synchronously on each operation that directly or indirectly > generates i/o. The last one is the solution implemented by this > bandwidth controller. What about AIO? Is this approach going to make the task sleep as well? Would it better to return from aio_write()/_read() with EAGAIN? Thanks. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: Andrea Righi on 23 Jun 2008 06:40 Eric Rannaud wrote: > On Tue, 17 Jun 2008, Andrea Righi wrote: >>> With this bandwidth controller, a cpu-intensive job which otherwise does >>> not care about its IO >>> performance needs to be pin-point accurate about IO bandwidth required in >>> order to not suffer >>> from cpu-throttling. IMHO, if a cgroup is exceeding its limit for a given >>> resource, the throttling >>> should be done _only_ for that resource. >> I understand your point of view. It would be nice if we could just >> "disable" the i/o for a cgroup that exceeds its limit, instead of >> scheduling some sleep()s, so the tasks running in this cgroup would be >> able to continue their non-i/o operations as usual. >> >> However, how to do if the tasks continue to perform i/o ops under this >> condition? we could just cache the i/o in memory and at the same time >> reduce the i/o priority of those tasks' requests, but this would require >> a lot of memory, more space in the page cache, and probably could lead >> to potential OOM conditions. A safer approach IMHO is to force the tasks >> to wait synchronously on each operation that directly or indirectly >> generates i/o. The last one is the solution implemented by this >> bandwidth controller. > > What about AIO? Is this approach going to make the task sleep as well? > Would it better to return from aio_write()/_read() with EAGAIN? Good point. I should check, but it seems sleeps are incorrectly performed also for AIO requests. I agree the correct behaviour would be to return EAGAIN instead, as you suggested. I'll look at it if nobody comes up with a solution. Thanks, -Andrea -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: Andrew Morton on 25 Jun 2008 20:30 On Fri, 20 Jun 2008 12:05:34 +0200 Andrea Righi <righi.andrea(a)gmail.com> wrote: > This is the core io-throttle kernel infrastructure. It creates the basic > interfaces to cgroups and implements the I/O measurement and throttling > functions. > > Signed-off-by: Andrea Righi <righi.andrea(a)gmail.com> > --- > block/Makefile | 2 + > block/blk-io-throttle.c | 393 +++++++++++++++++++++++++++++++++++++++ > include/linux/blk-io-throttle.h | 12 ++ > include/linux/cgroup_subsys.h | 6 + > init/Kconfig | 10 + > 5 files changed, 423 insertions(+), 0 deletions(-) > create mode 100644 block/blk-io-throttle.c > create mode 100644 include/linux/blk-io-throttle.h > > diff --git a/block/Makefile b/block/Makefile > index 5a43c7d..8dec69b 100644 > --- a/block/Makefile > +++ b/block/Makefile > @@ -14,3 +14,5 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o > > obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o > obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o > + > +obj-$(CONFIG_CGROUP_IO_THROTTLE) += blk-io-throttle.o > diff --git a/block/blk-io-throttle.c b/block/blk-io-throttle.c > new file mode 100644 > index 0000000..4ec02bb > --- /dev/null > +++ b/block/blk-io-throttle.c > @@ -0,0 +1,393 @@ > +/* > + * blk-io-throttle.c > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public > + * License as published by the Free Software Foundation; either > + * version 2 of the License, or (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * General Public License for more details. > + * > + * You should have received a copy of the GNU General Public > + * License along with this program; if not, write to the > + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, > + * Boston, MA 021110-1307, USA. > + * > + * Copyright (C) 2008 Andrea Righi <righi.andrea(a)gmail.com> > + */ > + > +#include <linux/init.h> > +#include <linux/module.h> > +#include <linux/cgroup.h> > +#include <linux/slab.h> > +#include <linux/gfp.h> > +#include <linux/err.h> > +#include <linux/sched.h> > +#include <linux/fs.h> > +#include <linux/jiffies.h> > +#include <linux/hardirq.h> > +#include <linux/list.h> > +#include <linux/spinlock.h> > +#include <linux/uaccess.h> > +#include <linux/vmalloc.h> > +#include <linux/blk-io-throttle.h> > + > +#define ONE_SEC 1000000L /* # of microseconds in a second */ Remove this, use USEC_PER_SEC throughout. > +#define KBS(x) ((x) * ONE_SEC >> 10) Convert to lower-case-named C function, please. > + > +struct iothrottle_node { > + struct list_head node; > + dev_t dev; > + unsigned long iorate; > + unsigned long timestamp; > + atomic_long_t stat; > +}; Please document each field in structures. This is usually more useful and important than documenting the code whcih manipulates those fields. It is important that the units of fields such as iorate and timestamp and stamp be documented. > +struct iothrottle { > + struct cgroup_subsys_state css; > + /* protects the list below, not the single elements */ > + spinlock_t lock; > + struct list_head list; > +}; Looking elsewhere in the code it appears that some RCU-based locking is performed. That should be documented somewhere. Fully. At the definition site of the data whcih is RCU-protceted would be a good site. > +static inline struct iothrottle *cgroup_to_iothrottle(struct cgroup *cont) > +{ > + return container_of(cgroup_subsys_state(cont, iothrottle_subsys_id), > + struct iothrottle, css); > +} > + > +static inline struct iothrottle *task_to_iothrottle(struct task_struct *task) > +{ > + return container_of(task_subsys_state(task, iothrottle_subsys_id), > + struct iothrottle, css); > +} > + > +static inline struct iothrottle_node *iothrottle_search_node( > + const struct iothrottle *iot, > + dev_t dev) > +{ > + struct iothrottle_node *n; > + > + list_for_each_entry_rcu(n, &iot->list, node) > + if (n->dev == dev) > + return n; > + return NULL; > +} This will be too large for inlining. This function presumably has caller-provided locking requirements? They should be documented here. > +static inline void iothrottle_insert_node(struct iothrottle *iot, > + struct iothrottle_node *n) > +{ > + list_add_rcu(&n->node, &iot->list); > +} > + > +static inline struct iothrottle_node *iothrottle_replace_node( > + struct iothrottle *iot, > + struct iothrottle_node *old, > + struct iothrottle_node *new) > +{ > + list_replace_rcu(&old->node, &new->node); > + return old; > +} Dittoes. > +static inline struct iothrottle_node *iothrottle_delete_node( > + struct iothrottle *iot, > + dev_t dev) > +{ > + struct iothrottle_node *n; > + > + list_for_each_entry(n, &iot->list, node) > + if (n->dev == dev) { > + list_del_rcu(&n->node); > + return n; > + } > + return NULL; > +} Too large for inlining. Was list_for_each_entry_rcu() needed? Does this function have any caller-provided locking requirements? > +/* > + * Note: called from kernel/cgroup.c with cgroup_lock() held. > + */ > +static struct cgroup_subsys_state *iothrottle_create( > + struct cgroup_subsys *ss, struct cgroup *cont) static struct cgroup_subsys_state *iothrottle_create(struct cgroup_subsys *ss, struct cgroup *cont) would be more typical code layout (here and elsewhere) static struct cgroup_subsys_state * iothrottle_create(struct cgroup_subsys *ss, struct cgroup *cont) is another way. > +{ > + struct iothrottle *iot; > + > + iot = kmalloc(sizeof(*iot), GFP_KERNEL); > + if (unlikely(!iot)) > + return ERR_PTR(-ENOMEM); > + > + INIT_LIST_HEAD(&iot->list); > + spin_lock_init(&iot->lock); > + > + return &iot->css; > +} > + > +/* > + * Note: called from kernel/cgroup.c with cgroup_lock() held. > + */ > +static void iothrottle_destroy(struct cgroup_subsys *ss, struct cgroup *cont) > +{ > + struct iothrottle_node *n, *p; > + struct iothrottle *iot = cgroup_to_iothrottle(cont); > + > + /* > + * don't worry about locking here, at this point there must be not any > + * reference to the list. > + */ > + list_for_each_entry_safe(n, p, &iot->list, node) > + kfree(n); > + kfree(iot); > +} > + > +static ssize_t iothrottle_read(struct cgroup *cont, > + struct cftype *cft, > + struct file *file, > + char __user *userbuf, > + size_t nbytes, > + loff_t *ppos) > +{ > + struct iothrottle *iot; > + char *buffer; > + int s = 0; > + struct iothrottle_node *n; > + ssize_t ret; > + > + buffer = kmalloc(nbytes + 1, GFP_KERNEL); > + if (!buffer) > + return -ENOMEM; > + > + cgroup_lock(); > + if (cgroup_is_removed(cont)) { > + ret = -ENODEV; > + goto out; > + } > + > + iot = cgroup_to_iothrottle(cont); > + rcu_read_lock(); > + list_for_each_entry_rcu(n, &iot->list, node) { > + unsigned long delta, rate; > + > + BUG_ON(!n->dev); > + delta = jiffies_to_usecs((long)jiffies - (long)n->timestamp); > + rate = delta ? KBS(atomic_long_read(&n->stat) / delta) : 0; > + s += scnprintf(buffer + s, nbytes - s, > + "=== device (%u,%u) ===\n" > + " bandwidth limit: %lu KiB/sec\n" > + "current i/o usage: %lu KiB/sec\n", > + MAJOR(n->dev), MINOR(n->dev), > + n->iorate, rate); > + } > + rcu_read_unlock(); > + ret = simple_read_from_buffer(userbuf, nbytes, ppos, buffer, s); > +out: > + cgroup_unlock(); > + kfree(buffer); > + return ret; > +} This is a kernel->userspace interface. It is part of the kernel ABI. We will need to support in a back-compatible fashion for ever. Hence it is important. The entire proposed kernel<->userspace interface should be completely described in the changelog or the documentation so that we can understand and review what you are proposing. > +static inline dev_t devname2dev_t(const char *buf) > +{ > + struct block_device *bdev; > + dev_t ret; > + > + bdev = lookup_bdev(buf); > + if (IS_ERR(bdev)) > + return 0; > + > + BUG_ON(!bdev->bd_inode); > + ret = bdev->bd_inode->i_rdev; > + bdput(bdev); > + > + return ret; > +} Too large to inline. I get tired of telling people this. Please just remove all the inlining from all the patches. Then go back and selectively inline any functions which really do need to be inlined (overall reduction in generated .text is a good heuristic). How can this function not be racy? We're returning a dev_t which refers to a device upon which we have no reference. A better design might be to rework the who9le thing to operate on a `struct block_device *' upon whcih this code holds a reference, rather than using bare dev_t. I _guess_ it's OK doing an in-kernel filesystem lookup here. But did you consider just passing the dev_t in from userspace? It's just a stat(). Does all this code treat /dev/sda1 as a separate device from /dev/sda2? If so, that would be broken. > +static inline int iothrottle_parse_args(char *buf, size_t nbytes, > + dev_t *dev, unsigned long *val) > +{ > + char *p; > + > + p = memchr(buf, ':', nbytes); > + if (!p) > + return -EINVAL; > + *p++ = '\0'; > + > + /* i/o bandiwth is expressed in KiB/s */ typo. This comment is incorrect, isn't it? Or at least misleading. The bandwidth can be expressed in an exotically broad number of different ways. > + *val = ALIGN(memparse(p, &p), 1024) >> 10; > + if (*p) > + return -EINVAL; > + > + *dev = devname2dev_t(buf); > + if (!*dev) > + return -ENOTBLK; > + > + return 0; > +} uninline... I think the whole memparse() thing is over the top: +- BANDWIDTH is the maximum I/O bandwidth on DEVICE allowed by CGROUP (we can + use a suffix k, K, m, M, g or G to indicate bandwidth values in KB/s, MB/s + or GB/s), For starters, we don't _display_ the bacndwidth back to the user in the units with which it was written, so what's the point? Secondly, we hope and expect that humans won't be diorectly echoing raw data into kernel pseudo files. We shouild expect and plan for (or even write) front-end management applications. And such applications won't need these ornate designed-for-human interfaces. IOW: I'd suggest this interface be changed to accept a plain old 64-bit bytes-per-second and leave it at that. > +static ssize_t iothrottle_write(struct cgroup *cont, > + struct cftype *cft, > + struct file *file, > + const char __user *userbuf, > + size_t nbytes, loff_t *ppos) > +{ > + struct iothrottle *iot; > + struct iothrottle_node *n, *tmpn = NULL; > + char *buffer, *tmpp; Please avoid variables called tmp or things derived from it. Surely we can think of some more communucative identifier? > + dev_t dev; > + unsigned long val; > + int ret; > + > + if (!nbytes) > + return -EINVAL; > + > + /* Upper limit on largest io-throttle rule string user might write. */ > + if (nbytes > 1024) > + return -E2BIG; > + > + buffer = kmalloc(nbytes + 1, GFP_KERNEL); > + if (!buffer) > + return -ENOMEM; > + > + if (copy_from_user(buffer, userbuf, nbytes)) { > + ret = -EFAULT; > + goto out1; > + } > + > + buffer[nbytes] = '\0'; strncpy_from_user()? (I'm not sure that strncpy_from_user() does the null-termination as desired). > + tmpp = strstrip(buffer); > + > + ret = iothrottle_parse_args(tmpp, nbytes, &dev, &val); > + if (ret) > + goto out1; > + > + if (val) { > + tmpn = kmalloc(sizeof(*tmpn), GFP_KERNEL); > + if (!tmpn) { > + ret = -ENOMEM; > + goto out1; > + } > + atomic_long_set(&tmpn->stat, 0); > + tmpn->timestamp = jiffies; > + tmpn->iorate = val; > + tmpn->dev = dev; > + } > + > + cgroup_lock(); > + if (cgroup_is_removed(cont)) { > + ret = -ENODEV; > + goto out2; > + } > + > + iot = cgroup_to_iothrottle(cont); > + spin_lock(&iot->lock); > + if (!val) { > + /* Delete a block device limiting rule */ > + n = iothrottle_delete_node(iot, dev); > + goto out3; > + } > + n = iothrottle_search_node(iot, dev); > + if (n) { > + /* Update a block device limiting rule */ > + iothrottle_replace_node(iot, n, tmpn); > + goto out3; > + } > + /* Add a new block device limiting rule */ > + iothrottle_insert_node(iot, tmpn); > +out3: > + ret = nbytes; > + spin_unlock(&iot->lock); > + if (n) { > + synchronize_rcu(); > + kfree(n); > + } > +out2: > + cgroup_unlock(); > +out1: > + kfree(buffer); > + return ret; > +} > + > +static struct cftype files[] = { > + { > + .name = "bandwidth", > + .read = iothrottle_read, > + .write = iothrottle_write, > + }, > +}; > + > +static int iothrottle_populate(struct cgroup_subsys *ss, struct cgroup *cont) > +{ > + return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); > +} > + > +struct cgroup_subsys iothrottle_subsys = { > + .name = "blockio", > + .create = iothrottle_create, > + .destroy = iothrottle_destroy, > + .populate = iothrottle_populate, > + .subsys_id = iothrottle_subsys_id, > +}; > + > +static inline int __cant_sleep(void) > +{ > + return in_atomic() || in_interrupt() || irqs_disabled(); > +} err, no. I don't know what this is doing or why it was added, but whatever it is it's a hack and it all needs to go away. Please describe what problem this is trying to solve and let's take a look at it. That should have been covered in code comments anyway. But because it wasn't, I am presently unable to help. > +void cgroup_io_throttle(struct block_device *bdev, size_t bytes) > +{ > + struct iothrottle *iot; > + struct iothrottle_node *n; > + unsigned long delta, t; > + long sleep; > + > + if (unlikely(!bdev || !bytes)) > + return; > + > + iot = task_to_iothrottle(current); > + if (unlikely(!iot)) > + return; > + > + BUG_ON(!bdev->bd_inode); > + > + rcu_read_lock(); > + n = iothrottle_search_node(iot, bdev->bd_inode->i_rdev); > + if (!n || !n->iorate) > + goto out; > + > + /* Account the i/o activity */ > + atomic_long_add(bytes, &n->stat); > + > + /* Evaluate if we need to throttle the current process */ > + delta = (long)jiffies - (long)n->timestamp; > + if (!delta) > + goto out; > + > + t = usecs_to_jiffies(KBS(atomic_long_read(&n->stat) / n->iorate)); Are you sure that n->iorate cannot be set to zero between the above test and this division? Taking a copy into a local variable would fix that small race. > + if (!t) > + goto out; > + > + sleep = t - delta; > + if (unlikely(sleep > 0)) { > + rcu_read_unlock(); > + if (__cant_sleep()) > + return; > + pr_debug("io-throttle: task %p (%s) must sleep %lu jiffies\n", > + current, current->comm, delta); > + schedule_timeout_killable(sleep); > + return; > + } > + /* Reset i/o statistics */ > + atomic_long_set(&n->stat, 0); > + /* > + * NOTE: be sure i/o statistics have been resetted before updating the > + * timestamp, otherwise a very small time delta may possibly be read by > + * another CPU w.r.t. accounted i/o statistics, generating unnecessary > + * long sleeps. > + */ > + smp_wmb(); > + n->timestamp = jiffies; > +out: > + rcu_read_unlock(); > +} > +EXPORT_SYMBOL(cgroup_io_throttle); I'm confused. This code is using jiffies but the string "HZ" doesn't appears anywhere in the diff. Where are we converting from the kernel-internal HZ rate into suitable-for-exposing-to-userspace units? HZ can vary from 100 to 1000 (approx). What are the implications of this for the accuracy of this code? I have no comments on the overall design. I'm not sure that I understand it yet. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: Andrea Righi on 26 Jun 2008 18:40
Andrew, thanks for your time and the detailed revision. I'll try to fix everything you reported and better document the code according to your suggestions. I'll re-submit a new patchset version ASAP. A few comments below. Andrew Morton wrote: [snip] >> +static ssize_t iothrottle_read(struct cgroup *cont, >> + struct cftype *cft, >> + struct file *file, >> + char __user *userbuf, >> + size_t nbytes, >> + loff_t *ppos) >> +{ >> + struct iothrottle *iot; >> + char *buffer; >> + int s = 0; >> + struct iothrottle_node *n; >> + ssize_t ret; >> + >> + buffer = kmalloc(nbytes + 1, GFP_KERNEL); >> + if (!buffer) >> + return -ENOMEM; >> + >> + cgroup_lock(); >> + if (cgroup_is_removed(cont)) { >> + ret = -ENODEV; >> + goto out; >> + } >> + >> + iot = cgroup_to_iothrottle(cont); >> + rcu_read_lock(); >> + list_for_each_entry_rcu(n, &iot->list, node) { >> + unsigned long delta, rate; >> + >> + BUG_ON(!n->dev); >> + delta = jiffies_to_usecs((long)jiffies - (long)n->timestamp); >> + rate = delta ? KBS(atomic_long_read(&n->stat) / delta) : 0; >> + s += scnprintf(buffer + s, nbytes - s, >> + "=== device (%u,%u) ===\n" >> + " bandwidth limit: %lu KiB/sec\n" >> + "current i/o usage: %lu KiB/sec\n", >> + MAJOR(n->dev), MINOR(n->dev), >> + n->iorate, rate); >> + } >> + rcu_read_unlock(); >> + ret = simple_read_from_buffer(userbuf, nbytes, ppos, buffer, s); >> +out: >> + cgroup_unlock(); >> + kfree(buffer); >> + return ret; >> +} > > This is a kernel->userspace interface. It is part of the kernel ABI. > We will need to support in a back-compatible fashion for ever. Hence > it is important. The entire proposed kernel<->userspace interface > should be completely described in the changelog or the documentation so > that we can understand and review what you are proposing. and BTW I was actually wondering if the output could be changed with something less human readable and better parseable, I means just print only raw numbers. And describe the semantic in the documentation. >> +static inline dev_t devname2dev_t(const char *buf) >> +{ >> + struct block_device *bdev; >> + dev_t ret; >> + >> + bdev = lookup_bdev(buf); >> + if (IS_ERR(bdev)) >> + return 0; >> + >> + BUG_ON(!bdev->bd_inode); >> + ret = bdev->bd_inode->i_rdev; >> + bdput(bdev); >> + >> + return ret; >> +} > > Too large to inline. I get tired of telling people this. Please just > remove all the inlining from all the patches. Then go back and > selectively inline any functions which really do need to be inlined > (overall reduction in generated .text is a good heuristic). > > How can this function not be racy? We're returning a dev_t which > refers to a device upon which we have no reference. A better design > might be to rework the who9le thing to operate on a `struct > block_device *' upon whcih this code holds a reference, rather than > using bare dev_t. However, holding a reference wouldn't allow to unplug the device, i.e. a USB disk. As reported in Documentation/controllers/io-throttle.txt: WARNING: per-block device limiting rules always refer to the dev_t device number. If a block device is unplugged (i.e. a USB device) the limiting rules defined for that device persist and they are still valid if a new device is plugged in the system and it uses the same major and minor numbers. This would be a feature in my case, but I don't know if it would be a bug in general. > I _guess_ it's OK doing an in-kernel filesystem lookup here. But did > you consider just passing the dev_t in from userspace? It's just a > stat(). Yes, and that seems to be more reasonable, since we display major,minor numbers in the output. > Does all this code treat /dev/sda1 as a separate device from /dev/sda2? > If so, that would be broken. Yes, all the partitions are treated as separate devices with (potentially) different limiting rules, but I don't understand why it would be broken... dev_t has both minor and major numbers, so it would be possible to select single partitions as well. >> +static inline int iothrottle_parse_args(char *buf, size_t nbytes, >> + dev_t *dev, unsigned long *val) >> +{ >> + char *p; >> + >> + p = memchr(buf, ':', nbytes); >> + if (!p) >> + return -EINVAL; >> + *p++ = '\0'; >> + >> + /* i/o bandiwth is expressed in KiB/s */ > > typo. > > This comment is incorrect, isn't it? Or at least misleading. The > bandwidth can be expressed in an exotically broad number of different > ways. Yes. > >> + *val = ALIGN(memparse(p, &p), 1024) >> 10; >> + if (*p) >> + return -EINVAL; >> + >> + *dev = devname2dev_t(buf); >> + if (!*dev) >> + return -ENOTBLK; >> + >> + return 0; >> +} > > uninline... > > I think the whole memparse() thing is over the top: > > +- BANDWIDTH is the maximum I/O bandwidth on DEVICE allowed by CGROUP (we can > + use a suffix k, K, m, M, g or G to indicate bandwidth values in KB/s, MB/s > + or GB/s), > > For starters, we don't _display_ the bacndwidth back to the user in the > units with which it was written, so what's the point? > > Secondly, we hope and expect that humans won't be diorectly echoing raw > data into kernel pseudo files. We shouild expect and plan for (or even > write) front-end management applications. And such applications won't > need these ornate designed-for-human interfaces. > > IOW: I'd suggest this interface be changed to accept a plain old 64-bit > bytes-per-second and leave it at that. I agree. >> +void cgroup_io_throttle(struct block_device *bdev, size_t bytes) >> +{ >> + struct iothrottle *iot; >> + struct iothrottle_node *n; >> + unsigned long delta, t; >> + long sleep; >> + >> + if (unlikely(!bdev || !bytes)) >> + return; >> + >> + iot = task_to_iothrottle(current); >> + if (unlikely(!iot)) >> + return; >> + >> + BUG_ON(!bdev->bd_inode); >> + >> + rcu_read_lock(); >> + n = iothrottle_search_node(iot, bdev->bd_inode->i_rdev); >> + if (!n || !n->iorate) >> + goto out; >> + >> + /* Account the i/o activity */ >> + atomic_long_add(bytes, &n->stat); >> + >> + /* Evaluate if we need to throttle the current process */ >> + delta = (long)jiffies - (long)n->timestamp; >> + if (!delta) >> + goto out; >> + >> + t = usecs_to_jiffies(KBS(atomic_long_read(&n->stat) / n->iorate)); > > Are you sure that n->iorate cannot be set to zero between the above > test and this division? Taking a copy into a local variable would fix > that small race. n->iorate can only change via userspace->kernel interface, that just replaces the node in the list using the rcu way. AFAIK this shouldn't be racy, but better to do it using the local variable to avoid future bugs. >> + if (!t) >> + goto out; >> + >> + sleep = t - delta; >> + if (unlikely(sleep > 0)) { >> + rcu_read_unlock(); >> + if (__cant_sleep()) >> + return; >> + pr_debug("io-throttle: task %p (%s) must sleep %lu jiffies\n", >> + current, current->comm, delta); >> + schedule_timeout_killable(sleep); >> + return; >> + } >> + /* Reset i/o statistics */ >> + atomic_long_set(&n->stat, 0); >> + /* >> + * NOTE: be sure i/o statistics have been resetted before updating the >> + * timestamp, otherwise a very small time delta may possibly be read by >> + * another CPU w.r.t. accounted i/o statistics, generating unnecessary >> + * long sleeps. >> + */ >> + smp_wmb(); >> + n->timestamp = jiffies; >> +out: >> + rcu_read_unlock(); >> +} >> +EXPORT_SYMBOL(cgroup_io_throttle); > > I'm confused. This code is using jiffies but the string "HZ" doesn't > appears anywhere in the diff. Where are we converting from the > kernel-internal HZ rate into suitable-for-exposing-to-userspace units? > > HZ can vary from 100 to 1000 (approx). What are the implications of > this for the accuracy of this code? The code uses jiffies_to_usecs() and usecs_to_jiffies(), that should be ok, isn't it? -Andrea -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |