From: Michael S. Tsirkin on
On Fri, Sep 25, 2009 at 10:01:58AM -0700, Ira W. Snyder wrote:
> > + case VHOST_SET_VRING_KICK:
> > + r = copy_from_user(&f, argp, sizeof f);
> > + if (r < 0)
> > + break;
> > + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
> > + if (IS_ERR(eventfp))
> > + return PTR_ERR(eventfp);
> > + if (eventfp != vq->kick) {
> > + pollstop = filep = vq->kick;
> > + pollstart = vq->kick = eventfp;
> > + } else
> > + filep = eventfp;
> > + break;
> > + case VHOST_SET_VRING_CALL:
> > + r = copy_from_user(&f, argp, sizeof f);
> > + if (r < 0)
> > + break;
> > + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
> > + if (IS_ERR(eventfp))
> > + return PTR_ERR(eventfp);
> > + if (eventfp != vq->call) {
> > + filep = vq->call;
> > + ctx = vq->call_ctx;
> > + vq->call = eventfp;
> > + vq->call_ctx = eventfp ?
> > + eventfd_ctx_fileget(eventfp) : NULL;
> > + } else
> > + filep = eventfp;
> > + break;
> > + case VHOST_SET_VRING_ERR:
> > + r = copy_from_user(&f, argp, sizeof f);
> > + if (r < 0)
> > + break;
> > + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
> > + if (IS_ERR(eventfp))
> > + return PTR_ERR(eventfp);
> > + if (eventfp != vq->error) {
> > + filep = vq->error;
> > + vq->error = eventfp;
> > + ctx = vq->error_ctx;
> > + vq->error_ctx = eventfp ?
> > + eventfd_ctx_fileget(eventfp) : NULL;
> > + } else
> > + filep = eventfp;
> > + break;
>
> I'm not sure how these eventfds save a trip to userspace.
>
> AFAICT, eventfds cannot be used to signal another part of the kernel;
> they can only be used to wake up userspace.

Yes, they can. See irqfd code in virt/kvm/eventfd.c.

> In my system, when an IRQ for kick() comes in, I have an eventfd which
> gets signalled to notify userspace. When I want to send a call(), I have
> to use a special ioctl(), just like lguest does.
>
> Doesn't this mean that for call(), vhost is just going to signal an
> eventfd to wake up userspace, which is then going to call ioctl(), and
> then we're back in kernelspace? Seems like a wasted userspace
> round-trip.
>
> Or am I mis-reading this code?

Yes. The kernel can poll the eventfd and deliver an interrupt directly,
without involving userspace.
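
As a rough sketch of what that looks like from the driver's point of
view (the my_* names and the ioctl plumbing here are hypothetical; the
eventfd calls themselves are the same kernel API the patch above uses):

#include <linux/eventfd.h>
#include <linux/err.h>

/* Hypothetical per-device "call" context, handed in via an ioctl. */
static struct eventfd_ctx *my_call_ctx;

static int my_set_call_fd(int fd)
{
        struct eventfd_ctx *ctx = eventfd_ctx_fdget(fd);

        if (IS_ERR(ctx))
                return PTR_ERR(ctx);
        if (my_call_ctx)
                eventfd_ctx_put(my_call_ctx);
        my_call_ctx = ctx;
        return 0;
}

/* Called from kernel context (e.g. a workqueue) when the device wants
 * to raise a "call" event.  If the other end of the eventfd is wired
 * to kvm's irqfd, this injects a guest interrupt with no userspace
 * round-trip at all. */
static void my_signal_call(void)
{
        if (my_call_ctx)
                eventfd_signal(my_call_ctx, 1);
}

The kick direction works the same way in reverse: vhost attaches a poll
callback to the kick eventfd (vhost_poll_start() above), so a guest
notification wakes the vhost work item directly instead of bouncing
through a userspace thread.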

> PS - you can see my current code at:
> http://www.mmarray.org/~iws/virtio-phys/
>
> Thanks,
> Ira
>
> > + default:
> > + r = -ENOIOCTLCMD;
> > + }
> > +
> > + if (pollstop && vq->handle_kick)
> > + vhost_poll_stop(&vq->poll);
> > +
> > + if (ctx)
> > + eventfd_ctx_put(ctx);
> > + if (filep)
> > + fput(filep);
> > +
> > + if (pollstart && vq->handle_kick)
> > + vhost_poll_start(&vq->poll, vq->kick);
> > +
> > + mutex_unlock(&vq->mutex);
> > +
> > + if (pollstop && vq->handle_kick)
> > + vhost_poll_flush(&vq->poll);
> > + return 0;
> > +}
> > +
> > +long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg)
> > +{
> > + void __user *argp = (void __user *)arg;
> > + long r;
> > +
> > + mutex_lock(&d->mutex);
> > + /* If you are not the owner, you can become one */
> > + if (ioctl == VHOST_SET_OWNER) {
> > + r = vhost_dev_set_owner(d);
> > + goto done;
> > + }
> > +
> > + /* You must be the owner to do anything else */
> > + r = vhost_dev_check_owner(d);
> > + if (r)
> > + goto done;
> > +
> > + switch (ioctl) {
> > + case VHOST_SET_MEM_TABLE:
> > + r = vhost_set_memory(d, argp);
> > + break;
> > + default:
> > + r = vhost_set_vring(d, ioctl, argp);
> > + break;
> > + }
> > +done:
> > + mutex_unlock(&d->mutex);
> > + return r;
> > +}
> > +
> > +static const struct vhost_memory_region *find_region(struct vhost_memory *mem,
> > + __u64 addr, __u32 len)
> > +{
> > + struct vhost_memory_region *reg;
> > + int i;
> > + /* linear search is not brilliant, but we really have on the order of 6
> > + * regions in practice */
> > + for (i = 0; i < mem->nregions; ++i) {
> > + reg = mem->regions + i;
> > + if (reg->guest_phys_addr <= addr &&
> > + reg->guest_phys_addr + reg->memory_size - 1 >= addr)
> > + return reg;
> > + }
> > + return NULL;
> > +}
> > +
> > +int translate_desc(struct vhost_dev *dev, u64 addr, u32 len,
> > + struct iovec iov[], int iov_size)
> > +{
> > + const struct vhost_memory_region *reg;
> > + struct vhost_memory *mem;
> > + struct iovec *_iov;
> > + u64 s = 0;
> > + int ret = 0;
> > +
> > + rcu_read_lock();
> > +
> > + mem = rcu_dereference(dev->memory);
> > + while ((u64)len > s) {
> > + u64 size;
> > + if (ret >= iov_size) {
> > + ret = -ENOBUFS;
> > + break;
> > + }
> > + reg = find_region(mem, addr, len);
> > + if (!reg) {
> > + ret = -EFAULT;
> > + break;
> > + }
> > + _iov = iov + ret;
> > + size = reg->memory_size - addr + reg->guest_phys_addr;
> > + _iov->iov_len = min((u64)len, size);
> > + _iov->iov_base = (void *)
> > + (reg->userspace_addr + addr - reg->guest_phys_addr);
> > + s += size;
> > + addr += size;
> > + ++ret;
> > + }
> > +
> > + rcu_read_unlock();
> > + return ret;
> > +}
> > +
> > +/* Each buffer in the virtqueues is actually a chain of descriptors. This
> > + * function returns the next descriptor in the chain, or vq->vring.num if we're
> > + * at the end. */
> > +static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
> > +{
> > + unsigned int next;
> > +
> > + /* If this descriptor says it doesn't chain, we're done. */
> > + if (!(desc->flags & VRING_DESC_F_NEXT))
> > + return vq->num;
> > +
> > + /* Check they're not leading us off end of descriptors. */
> > + next = desc->next;
> > + /* Make sure compiler knows to grab that: we don't want it changing! */
> > + /* We will use the result as an index in an array, so most
> > + * architectures only need a compiler barrier here. */
> > + read_barrier_depends();
> > +
> > + if (next >= vq->num) {
> > + vq_err(vq, "Desc next is %u > %u", next, vq->num);
> > + return vq->num;
> > + }
> > +
> > + return next;
> > +}
> > +
> > +/* This looks in the virtqueue and for the first available buffer, and converts
> > + * it to an iovec for convenient access. Since descriptors consist of some
> > + * number of output then some number of input descriptors, it's actually two
> > + * iovecs, but we pack them into one and note how many of each there were.
> > + *
> > + * This function returns the descriptor number found, or vq->num (which
> > + * is never a valid descriptor number) if none was found. */
> > +unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
> > + struct iovec iov[],
> > + unsigned int *out_num, unsigned int *in_num)
> > +{
> > + struct vring_desc desc;
> > + unsigned int i, head;
> > + u16 last_avail_idx;
> > + int ret;
> > +
> > + /* Check it isn't doing very strange things with descriptor numbers. */
> > + last_avail_idx = vq->last_avail_idx;
> > + if (get_user(vq->avail_idx, &vq->avail->idx)) {
> > + vq_err(vq, "Failed to access avail idx at %p\n",
> > + &vq->avail->idx);
> > + return vq->num;
> > + }
> > +
> > + if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
> > + vq_err(vq, "Guest moved used index from %u to %u",
> > + last_avail_idx, vq->avail_idx);
> > + return vq->num;
> > + }
> > +
> > + /* If there's nothing new since last we looked, return invalid. */
> > + if (vq->avail_idx == last_avail_idx)
> > + return vq->num;
> > +
> > + /* Grab the next descriptor number they're advertising, and increment
> > + * the index we've seen. */
> > + if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
> > + vq_err(vq, "Failed to read head: idx %d address %p\n",
> > + last_avail_idx,
> > + &vq->avail->ring[last_avail_idx % vq->num]);
> > + return vq->num;
> > + }
> > +
> > + /* If their number is silly, that's an error. */
> > + if (head >= vq->num) {
> > + vq_err(vq, "Guest says index %u > %u is available",
> > + head, vq->num);
> > + return vq->num;
> > + }
> > +
> > + vq->last_avail_idx++;
> > +
> > + /* When we start there are none of either input nor output. */
> > + *out_num = *in_num = 0;
> > +
> > + i = head;
> > + do {
> > + unsigned iov_count = *in_num + *out_num;
> > + if (copy_from_user(&desc, vq->desc + i, sizeof desc)) {
> > + vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
> > + i, vq->desc + i);
> > + return vq->num;
> > + }
> > + ret = translate_desc(dev, desc.addr, desc.len, iov + iov_count,
> > + VHOST_NET_MAX_SG - iov_count);
> > + if (ret < 0) {
> > + vq_err(vq, "Translation failure %d descriptor idx %d\n",
> > + ret, i);
> > + return vq->num;
> > + }
> > + /* If this is an input descriptor, increment that count. */
> > + if (desc.flags & VRING_DESC_F_WRITE)
> > + *in_num += ret;
> > + else {
> > + /* If it's an output descriptor, they're all supposed
> > + * to come before any input descriptors. */
> > + if (*in_num) {
> > + vq_err(vq, "Descriptor has out after in: "
> > + "idx %d\n", i);
> > + return vq->num;
> > + }
> > + *out_num += ret;
> > + }
> > + } while ((i = next_desc(vq, &desc)) != vq->num);
> > + return head;
> > +}
> > +
> > +/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
> > +void vhost_discard_vq_desc(struct vhost_virtqueue *vq)
> > +{
> > + vq->last_avail_idx--;
> > +}
> > +
> > +/* After we've used one of their buffers, we tell them about it. We'll then
> > + * want to send them an interrupt, using vq->call. */
> > +int vhost_add_used(struct vhost_virtqueue *vq,
> > + unsigned int head, int len)
> > +{
> > + struct vring_used_elem *used;
> > +
> > + /* The virtqueue contains a ring of used buffers. Get a pointer to the
> > + * next entry in that used ring. */
> > + used = &vq->used->ring[vq->last_used_idx % vq->num];
> > + if (put_user(head, &used->id)) {
> > + vq_err(vq, "Failed to write used id");
> > + return -EFAULT;
> > + }
> > + if (put_user(len, &used->len)) {
> > + vq_err(vq, "Failed to write used len");
> > + return -EFAULT;
> > + }
> > + /* Make sure buffer is written before we update index. */
> > + wmb();
> > + if (put_user(vq->last_used_idx + 1, &vq->used->idx)) {
> > + vq_err(vq, "Failed to increment used idx");
> > + return -EFAULT;
> > + }
> > + vq->last_used_idx++;
> > + return 0;
> > +}
> > +
> > +/* This actually sends the interrupt for this virtqueue */
> > +void vhost_trigger_irq(struct vhost_dev *dev, struct vhost_virtqueue *vq)
> > +{
> > + __u16 flags = 0;
> > + if (get_user(flags, &vq->avail->flags)) {
> > + vq_err(vq, "Failed to get flags");
> > + return;
> > + }
> > +
> > + /* If they don't want an interrupt, don't send one, unless empty. */
> > + if ((flags & VRING_AVAIL_F_NO_INTERRUPT) &&
> > + (!vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) ||
> > + vq->avail_idx != vq->last_avail_idx))
> > + return;
> > +
> > + /* Send the Guest an interrupt tell them we used something up. */
> > + if (vq->call_ctx)
> > + eventfd_signal(vq->call_ctx, 1);
> > +}
> > +
> > +/* And here's the combo meal deal. Supersize me! */
> > +void vhost_add_used_and_trigger(struct vhost_dev *dev,
> > + struct vhost_virtqueue *vq,
> > + unsigned int head, int len)
> > +{
> > + vhost_add_used(vq, head, len);
> > + vhost_trigger_irq(dev, vq);
> > +}
> > +
> > +/* OK, now we need to know about added descriptors. */
> > +bool vhost_notify(struct vhost_virtqueue *vq)
> > +{
> > + int r;
> > + if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
> > + return false;
> > + vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
> > + r = put_user(vq->used_flags, &vq->used->flags);
> > + if (r)
> > + vq_err(vq, "Failed to disable notification: %d\n", r);
> > + /* They could have slipped one in as we were doing that: make
> > + * sure it's written, tell caller it needs to check again. */
> > + mb();
> > + return true;
> > +}
> > +
> > +/* We don't need to be notified again. */
> > +void vhost_no_notify(struct vhost_virtqueue *vq)
> > +{
> > + int r;
> > + if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
> > + return;
> > + vq->used_flags |= VRING_USED_F_NO_NOTIFY;
> > + r = put_user(vq->used_flags, &vq->used->flags);
> > + if (r)
> > + vq_err(vq, "Failed to enable notification: %d\n", r);
> > +}
> > +
> > +int vhost_init(void)
> > +{
> > + vhost_workqueue = create_workqueue("vhost");
> > + if (!vhost_workqueue)
> > + return -ENOMEM;
> > + return 0;
> > +}
> > +
> > +void vhost_cleanup(void)
> > +{
> > + destroy_workqueue(vhost_workqueue);
> > +}
> > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > new file mode 100644
> > index 0000000..8e13d06
> > --- /dev/null
> > +++ b/drivers/vhost/vhost.h
> > @@ -0,0 +1,122 @@
> > +#ifndef _VHOST_H
> > +#define _VHOST_H
> > +
> > +#include <linux/eventfd.h>
> > +#include <linux/vhost.h>
> > +#include <linux/mm.h>
> > +#include <linux/mutex.h>
> > +#include <linux/workqueue.h>
> > +#include <linux/poll.h>
> > +#include <linux/file.h>
> > +#include <linux/skbuff.h>
> > +#include <linux/uio.h>
> > +#include <linux/virtio_config.h>
> > +
> > +struct vhost_device;
> > +
> > +enum {
> > + VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2,
> > +};
> > +
> > +/* Poll a file (eventfd or socket) */
> > +/* Note: there's nothing vhost specific about this structure. */
> > +struct vhost_poll {
> > + poll_table table;
> > + wait_queue_head_t *wqh;
> > + wait_queue_t wait;
> > + /* struct which will handle all actual work. */
> > + struct work_struct work;
> > + unsigned long mask;
> > +};
> > +
> > +void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
> > + unsigned long mask);
> > +void vhost_poll_start(struct vhost_poll *poll, struct file *file);
> > +void vhost_poll_stop(struct vhost_poll *poll);
> > +void vhost_poll_flush(struct vhost_poll *poll);
> > +
> > +/* The virtqueue structure describes a queue attached to a device. */
> > +struct vhost_virtqueue {
> > + struct vhost_dev *dev;
> > +
> > + /* The actual ring of buffers. */
> > + struct mutex mutex;
> > + unsigned int num;
> > + struct vring_desc __user *desc;
> > + struct vring_avail __user *avail;
> > + struct vring_used __user *used;
> > + struct file *kick;
> > + struct file *call;
> > + struct file *error;
> > + struct eventfd_ctx *call_ctx;
> > + struct eventfd_ctx *error_ctx;
> > +
> > + struct vhost_poll poll;
> > +
> > + /* The routine to call when the Guest pings us, or timeout. */
> > + work_func_t handle_kick;
> > +
> > + /* Last available index we saw. */
> > + u16 last_avail_idx;
> > +
> > + /* Caches available index value from user. */
> > + u16 avail_idx;
> > +
> > + /* Last index we used. */
> > + u16 last_used_idx;
> > +
> > + /* Used flags */
> > + u16 used_flags;
> > +
> > + struct iovec iov[VHOST_NET_MAX_SG];
> > + struct iovec hdr[VHOST_NET_MAX_SG];
> > +};
> > +
> > +struct vhost_dev {
> > + /* Readers use RCU to access memory table pointer.
> > + * Writers use mutex below.*/
> > + struct vhost_memory *memory;
> > + struct mm_struct *mm;
> > + struct vhost_virtqueue *vqs;
> > + int nvqs;
> > + struct mutex mutex;
> > + unsigned acked_features;
> > +};
> > +
> > +long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
> > +long vhost_dev_check_owner(struct vhost_dev *);
> > +long vhost_dev_reset_owner(struct vhost_dev *);
> > +void vhost_dev_cleanup(struct vhost_dev *);
> > +long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, unsigned long arg);
> > +
> > +unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
> > + struct iovec iov[],
> > + unsigned int *out_num, unsigned int *in_num);
> > +void vhost_discard_vq_desc(struct vhost_virtqueue *);
> > +
> > +int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
> > +void vhost_trigger_irq(struct vhost_dev *, struct vhost_virtqueue *);
> > +void vhost_add_used_and_trigger(struct vhost_dev *, struct vhost_virtqueue *,
> > + unsigned int head, int len);
> > +void vhost_no_notify(struct vhost_virtqueue *);
> > +bool vhost_notify(struct vhost_virtqueue *);
> > +
> > +int vhost_init(void);
> > +void vhost_cleanup(void);
> > +
> > +#define vq_err(vq, fmt, ...) do { \
> > + pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \
> > + if ((vq)->error_ctx) \
> > + eventfd_signal((vq)->error_ctx, 1);\
> > + } while (0)
> > +
> > +enum {
> > + VHOST_FEATURES = 1 << VIRTIO_F_NOTIFY_ON_EMPTY,
> > +};
> > +
> > +static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
> > +{
> > + return dev->acked_features & (1 << bit);
> > +}
> > +
> > +#endif
> > diff --git a/include/linux/Kbuild b/include/linux/Kbuild
> > index dec2f18..975df9a 100644
> > --- a/include/linux/Kbuild
> > +++ b/include/linux/Kbuild
> > @@ -360,6 +360,7 @@ unifdef-y += uio.h
> > unifdef-y += unistd.h
> > unifdef-y += usbdevice_fs.h
> > unifdef-y += utsname.h
> > +unifdef-y += vhost.h
> > unifdef-y += videodev2.h
> > unifdef-y += videodev.h
> > unifdef-y += virtio_config.h
> > diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
> > index 0521177..781a8bb 100644
> > --- a/include/linux/miscdevice.h
> > +++ b/include/linux/miscdevice.h
> > @@ -30,6 +30,7 @@
> > #define HPET_MINOR 228
> > #define FUSE_MINOR 229
> > #define KVM_MINOR 232
> > +#define VHOST_NET_MINOR 233
> > #define MISC_DYNAMIC_MINOR 255
> >
> > struct device;
> > diff --git a/include/linux/vhost.h b/include/linux/vhost.h
> > new file mode 100644
> > index 0000000..3f441a9
> > --- /dev/null
> > +++ b/include/linux/vhost.h
> > @@ -0,0 +1,101 @@
> > +#ifndef _LINUX_VHOST_H
> > +#define _LINUX_VHOST_H
> > +/* Userspace interface for in-kernel virtio accelerators. */
> > +
> > +/* vhost is used to reduce the number of system calls involved in virtio.
> > + *
> > + * Existing virtio net code is used in the guest without modification.
> > + *
> > + * This header includes interface used by userspace hypervisor for
> > + * device configuration.
> > + */
> > +
> > +#include <linux/types.h>
> > +#include <linux/compiler.h>
> > +#include <linux/ioctl.h>
> > +#include <linux/virtio_config.h>
> > +#include <linux/virtio_ring.h>
> > +
> > +struct vhost_vring_state {
> > + unsigned int index;
> > + unsigned int num;
> > +};
> > +
> > +struct vhost_vring_file {
> > + unsigned int index;
> > + int fd;
> > +};
> > +
> > +struct vhost_vring_addr {
> > + unsigned int index;
> > + unsigned int padding;
> > + __u64 user_addr;
> > +};
> > +
> > +struct vhost_memory_region {
> > + __u64 guest_phys_addr;
> > + __u64 memory_size; /* bytes */
> > + __u64 userspace_addr;
> > + __u64 padding; /* read/write protection? */
> > +};
> > +
> > +struct vhost_memory {
> > + __u32 nregions;
> > + __u32 padding;
> > + struct vhost_memory_region regions[0];
> > +};
> > +
> > +/* ioctls */
> > +
> > +#define VHOST_VIRTIO 0xAF
> > +
> > +/* Features bitmask for forward compatibility. Transport bits are used for
> > + * vhost specific features. */
> > +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64)
> > +#define VHOST_ACK_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64)
> > +
> > +/* Set current process as the (exclusive) owner of this file descriptor. This
> > + * must be called before any other vhost command. Further calls to
> > + * VHOST_OWNER_SET fail until VHOST_OWNER_RESET is called. */
> > +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01)
> > +/* Give up ownership, and reset the device to default values.
> > + * Allows subsequent call to VHOST_OWNER_SET to succeed. */
> > +#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02)
> > +
> > +/* Set up/modify memory layout */
> > +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory)
> > +
> > +/* Ring setup. These parameters can not be modified while ring is running
> > + * (bound to a device). */
> > +/* Set number of descriptors in ring */
> > +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state)
> > +/* Start of array of descriptors (virtually contiguous) */
> > +#define VHOST_SET_VRING_DESC _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr)
> > +/* Used structure address */
> > +#define VHOST_SET_VRING_USED _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_addr)
> > +/* Available structure address */
> > +#define VHOST_SET_VRING_AVAIL _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_addr)
> > +/* Base value where queue looks for available descriptors */
> > +#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state)
> > +/* Get accessor: reads index, writes value in num */
> > +#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x14, struct vhost_vring_state)
> > +
> > +/* The following ioctls use eventfd file descriptors to signal and poll
> > + * for events. */
> > +
> > +/* Set eventfd to poll for added buffers */
> > +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
> > +/* Set eventfd to signal when buffers have beed used */
> > +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
> > +/* Set eventfd to signal an error */
> > +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
> > +
> > +/* VHOST_NET specific defines */
> > +
> > +/* Attach virtio net device to a raw socket. The socket must be already
> > + * bound to an ethernet device, this device will be used for transmit.
> > + * Pass -1 to unbind from the socket and the transmit device.
> > + * This can be used to stop the device (e.g. for migration). */
> > +#define VHOST_NET_SET_SOCKET _IOW(VHOST_VIRTIO, 0x30, int)
> > +
> > +#endif
> > --
> > 1.6.2.5
From: Avi Kivity on
On 09/26/2009 12:32 AM, Gregory Haskins wrote:
>>>
>>> I realize in retrospect that my choice of words above implies vbus _is_
>>> complete, but this is not what I was saying. What I was trying to
>>> convey is that vbus is _more_ complete. Yes, in either case some kind
>>> of glue needs to be written. The difference is that vbus implements
>>> more of the glue generally, and leaves less required to be customized
>>> for each iteration.
>>>
>>>
>>
>> No argument there. Since you care about non-virt scenarios and virtio
>> doesn't, naturally vbus is a better fit for them as the code stands.
>>
> Thanks for finally starting to acknowledge there's a benefit, at least.
>

I think I've mentioned vbus' finer grained layers as helpful here,
though I doubt the value of this. Hypervisors are added rarely, while
devices and drivers are added (and modified) much more often. I don't
buy the anything-to-anything promise.

> To be more precise, IMO virtio is designed to be a performance oriented
> ring-based driver interface that supports all types of hypervisors (e.g.
> shmem based kvm, and non-shmem based Xen). vbus is designed to be a
> high-performance generic shared-memory interconnect (for rings or
> otherwise) framework for environments where linux is the underpinning
> "host" (physical or virtual). They are distinctly different, but
> complementary (the former addresses the part of the front-end, and
> latter addresses the back-end, and a different part of the front-end).
>

They're not truly complementary since they're incompatible. A 2.6.27
guest, or Windows guest with the existing virtio drivers, won't work
over vbus. Further, non-shmem virtio can't work over vbus. Since
virtio is guest-oriented and host-agnostic, it can't ignore
non-shared-memory hosts (even though it's unlikely virtio will be
adopted there).

> In addition, the kvm-connector used in AlacrityVM's design strives to
> add value and improve performance via other mechanisms, such as dynamic
> allocation, interrupt coalescing (thus reducing exit-ratio, which is a
> serious issue in KVM)

Do you have measurements of inter-interrupt coalescing rates (excluding
intra-interrupt coalescing)?

> and priortizable/nestable signals.
>

That doesn't belong in a bus.

> Today there is a large performance disparity between what a KVM guest
> sees and what a native linux application sees on that same host. Just
> take a look at some of my graphs between "virtio", and "native", for
> example:
>
> http://developer.novell.com/wiki/images/b/b7/31-rc4_throughput.png
>

That's a red herring. The problem is not with virtio as an ABI, but
with its implementation in userspace. vhost-net should offer equivalent
performance to vbus.

> A dominant vbus design principle is to try to achieve the same IO
> performance for all "linux applications" whether they be literally
> userspace applications, or things like KVM vcpus or Ira's physical
> boards. It also aims to solve problems not previously expressible with
> current technologies (even virtio), like nested real-time.
>
> And even though you repeatedly insist otherwise, the neat thing here is
> that the two technologies mesh (at least under certain circumstances,
> like when virtio is deployed on a shared-memory friendly linux backend
> like KVM). I hope that my stack diagram below depicts that clearly.
>

Right, when you ignore the points where they don't fit, it's a perfect mesh.

>> But that's not a strong argument for vbus; instead of adding vbus you
>> could make virtio more friendly to non-virt
>>
> Actually, it _is_ a strong argument then because adding vbus is what
> helps makes virtio friendly to non-virt, at least for when performance
> matters.
>

As vhost-net shows, you can do that without vbus and without breaking
compatibility.



>> Right. virtio assumes that it's in a virt scenario and that the guest
>> architecture already has enumeration and hotplug mechanisms which it
>> would prefer to use. That happens to be the case for kvm/x86.
>>
> No, virtio doesn't assume that. Its stack provides the "virtio-bus"
> abstraction and what it does assume is that it will be wired up to
> something underneath. Kvm/x86 conveniently has pci, so the virtio-pci
> adapter was created to reuse much of that facility. For other things
> like lguest and s390, something new had to be created underneath to make
> up for the lack of pci-like support.
>

Right, I was wrong there. But it does allow you to have a 1:1 mapping
between native devices and virtio devices.


>>> So to answer your question, the difference is that the part that has to
>>> be customized in vbus should be a fraction of what needs to be
>>> customized with vhost because it defines more of the stack.
>>>
>> But if you want to use the native mechanisms, vbus doesn't have any
>> added value.
>>
> First of all, thats incorrect. If you want to use the "native"
> mechanisms (via the way the vbus-connector is implemented, for instance)
> you at least still have the benefit that the backend design is more
> broadly re-useable in more environments (like non-virt, for instance),
> because vbus does a proper job of defining the requisite
> layers/abstractions compared to vhost. So it adds value even in that
> situation.
>

Maybe. If vhost-net isn't sufficient I'm sure there will be patches sent.

> Second of all, with PV there is no such thing as "native". It's
> software so it can be whatever we want. Sure, you could argue that the
> guest may have built-in support for something like PCI protocol.
> However, PCI protocol itself isn't suitable for high-performance PV out
> of the can. So you will therefore invariably require new software
> layers on top anyway, even if part of the support is already included.
>

Of course there is such a thing as native: a pci-ready guest has tons of
support built into it that doesn't need to be retrofitted. Since
practically everyone (including Xen) does their paravirt drivers atop
pci, the claim that pci isn't suitable for high performance is incorrect.


> And lastly, why would you _need_ to use the so called "native"
> mechanism? The short answer is, "you don't". Any given system (guest
> or bare-metal) already have a wide-range of buses (try running "tree
> /sys/bus" in Linux). More importantly, the concept of adding new buses
> is widely supported in both the Windows and Linux driver model (and
> probably any other guest-type that matters). Therefore, despite claims
> to the contrary, its not hard or even unusual to add a new bus to the mix.
>

The short answer is "compatibility".


> In summary, vbus is simply one more bus of many, purpose built to
> support high-end IO in a virt-like model, giving controlled access to
> the linux-host underneath it. You can write a high-performance layer
> below the OS bus-model (vbus), or above it (virtio-pci) but either way
> you are modifying the stack to add these capabilities, so we might as
> well try to get this right.
>
> With all due respect, you are making a big deal out of a minor issue.
>

It's not minor to me.

>>> And, as
>>> eluded to in my diagram, both virtio-net and vhost (with some
>>> modifications to fit into the vbus framework) are potentially
>>> complementary, not competitors.
>>>
>>>
>> Only theoretically. The existing installed base would have to be thrown
>> away
>>
> "Thrown away" is pure hyperbole. The installed base, worse case, needs
> to load a new driver for a missing device.

Yes, we all know how fun this is. Especially if the device being changed is
your boot disk. You may not care about the pain caused to users, but I
do, so I will continue to insist on compatibility.

>> or we'd need to support both.
>>
>>
>>
> No matter what model we talk about, there's always going to be a "both"
> since the userspace virtio models are probably not going to go away (nor
> should they).
>

virtio allows you to have userspace-only, kernel-only, or
start-with-userspace-and-move-to-kernel-later, all transparent to the
guest. In many cases we'll stick with userspace-only.

>> All this is after kvm has decoded that vbus is addresses. It can't work
>> without someone outside vbus deciding that.
>>
> How the connector message is delivered is really not relevant. Some
> architectures will simply deliver the message point-to-point (like the
> original hypercall design for KVM, or something like Ira's rig), and
> some will need additional demuxing (like pci-bridge/pio based KVM).
> It's an implementation detail of the connector.
>
> However, the real point here is that something needs to establish a
> scoped namespace mechanism, add items to that namespace, and advertise
> the presence of the items to the guest. vbus has this facility built in
> to its stack. vhost doesn't, so it must come from elsewhere.
>

So we have: vbus needs a connector, vhost needs a connector. vbus
doesn't need userspace to program the addresses (but does need userspace
to instantiate the devices and to program the bus address decode);
vhost needs userspace to instantiate the devices and program the addresses.

>>> In fact, it's actually a simpler design to unify things this way because
>>> you avoid splitting the device model up. Consider how painful the vhost
>>> implementation would be if it didn't already have the userspace
>>> virtio-net to fall-back on. This is effectively what we face for new
>>> devices going forward if that model is to persist.
>>>
>>>
>>
>> It doesn't have just virtio-net, it has userspace-based hostplug
>>
> vbus has hotplug too: mkdir and rmdir
>

Does that work from nonprivileged processes? Does it work on Windows?

> As an added bonus, its device-model is modular. A developer can write a
> new device model, compile it, insmod it to the host kernel, hotplug it
> to the running guest with mkdir/ln, and then come back out again
> (hotunplug with rmdir, rmmod, etc). They may do this all without taking
> the guest down, and while eating QEMU based IO solutions for breakfast
> performance wise.
>
> Afaict, qemu can't do either of those things.
>

We've seen that herring before, and it's redder than ever.



>> Refactor instead of duplicating.
>>
> There is no duplicating. vbus has no equivalent today as virtio doesn't
> define these layers.
>

So define them if they're missing.


>>>
>>>
>>>> Use libraries (virtio-shmem.ko, libvhost.so).
>>>>
>>>>
>>> What do you suppose vbus is? vbus-proxy.ko = virtio-shmem.ko, and you
>>> dont need libvhost.so per se since you can just use standard kernel
>>> interfaces (like configfs/sysfs). I could create an .so going forward
>>> for the new ioctl-based interface, I suppose.
>>>
>>>
>> Refactor instead of rewriting.
>>
> There is no rewriting. vbus has no equivalent today as virtio doesn't
> define these layers.
>
> By your own admission, you said if you wanted that capability, use a
> library. What I think you are not understanding is vbus _is_ that
> library. So what is the problem, exactly?
>

It's not compatible. If you were truly worried about code duplication
in virtio, you'd refactor it to remove the duplication, without
affecting existing guests.

>>>> For kvm/x86 pci definitely remains king.
>>>>
>>>>
>>> For full virtualization, sure. I agree. However, we are talking about
>>> PV here. For PV, PCI is not a requirement and is a technical dead-end
>>> IMO.
>>>
>>> KVM seems to be the only virt solution that thinks otherwise (*), but I
>>> believe that is primarily a condition of its maturity. I aim to help
>>> advance things here.
>>>
>>> (*) citation: xen has xenbus, lguest has lguest-bus, vmware has some
>>> vmi-esq thing (I forget what its called) to name a few. Love 'em or
>>> hate 'em, most other hypervisors do something along these lines. I'd
>>> like to try to create one for KVM, but to unify them all (at least for
>>> the Linux-based host designs).
>>>
>>>
>> VMware are throwing VMI away (won't be supported in their new product,
>> and they've sent a patch to rip it off from Linux);
>>
> vmware only cares about x86 iiuc, so probably not a good example.
>

Well, you brought it up. Between you and me, I only care about x86 too.

>> Xen has to tunnel
>> xenbus in pci for full virtualization (which is where Windows is, and
>> where Linux will be too once people realize it's faster). lguest is
>> meant as an example hypervisor, not an attempt to take over the world.
>>
> So pick any other hypervisor, and the situation is often similar.
>

The situation is often pci.

>
>> An right now you can have a guest using pci to access a mix of
>> userspace-emulated devices, userspace-emulated-but-kernel-accelerated
>> virtio devices, and real host devices. All on one dead-end bus. Try
>> that with vbus.
>>
> vbus is not interested in userspace devices. The charter is to provide
> facilities for utilizing the host linux kernel's IO capabilities in the
> most efficient, yet safe, manner possible. Those devices that fit
> outside that charter can ride on legacy mechanisms if that suits them best.
>

vbus isn't, but I am. I would prefer not to have to expose
implementation decisions (kernel vs userspace) to the guest (vbus vs pci).

>>> That won't cut it. For one, creating an eventfd is only part of the
>>> equation. I.e. you need to have originate/terminate somewhere
>>> interesting (and in-kernel, otherwise use tuntap).
>>>
>>>
>> vbus needs the same thing so it cancels out.
>>
> No, it does not. vbus just needs a relatively simple single message
> pipe between the guest and host (think "hypercall tunnel", if you will).
>

That's ioeventfd. So far so similar.

> Per queue/device addressing is handled by the same conceptual namespace
> as the one that would trigger eventfds in the model you mention. And
> that namespace is built in to the vbus stack, and objects are registered
> automatically as they are created.
>
> Contrast that to vhost, which requires some other kernel interface to
> exist, and to be managed manually for each object that is created. Your
> libvhostconfig would need to somehow know how to perform this
> registration operation, and there would have to be something in the
> kernel to receive it, presumably on a per platform basis. Solving this
> problem generally would probably end up looking eerily like vbus,
> because thats what vbus does.
>

vbus devices aren't magically instantiated. Userspace needs to
instantiate them too. Sure, there's less work on the host side since
you're using vbus instead of the native interface, but more work on the
guest side since you're using vbus instead of the native interface.



>> Well, let's see. Can vbus today:
>>
>> - let userspace know which features are available (so it can decide if
>> live migration is possible)
>>
> yes, it's in sysfs.
>
>
>> - let userspace limit which features are exposed to the guest (so it can
>> make live migration possible among hosts of different capabilities)
>>
> yes, it's in sysfs.
>

Per-device? non-privileged-user capable?

>> - let userspace know which features were negotiated (so it can transfer
>> them to the other host during live migration)
>>
> no, but we can easily add ->save()/->restore() to the model going
> forward, and the negotiated features are just a subcomponent of its
> serialized stream.
>
>
>> - let userspace tell the kernel which features were negotiated (when
>> live migration completes, to avoid requiring the guest to re-negotiate)
>>
> that would be the function of the ->restore() deserializer.
>
>
>> - do all that from an unprivileged process
>>
> yes, in the upcoming alacrityvm v0.3 with the ioctl based control plane.
>

Ah, so you have two control planes.

> Bottom line: vbus isn't done, especially w.r.t. live-migration, but that
> is not a valid argument against the idea if you believe in
> release-early/release-often. kvm wasn't (isn't) done either when it was
> proposed/merged.
>
>

kvm didn't have an existing counterpart in Linux when it was
proposed/merged.

--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

From: Gregory Haskins on
Avi Kivity wrote:
> On 09/26/2009 12:32 AM, Gregory Haskins wrote:
>>>>
>>>> I realize in retrospect that my choice of words above implies vbus _is_
>>>> complete, but this is not what I was saying. What I was trying to
>>>> convey is that vbus is _more_ complete. Yes, in either case some kind
>>>> of glue needs to be written. The difference is that vbus implements
>>>> more of the glue generally, and leaves less required to be customized
>>>> for each iteration.
>>>>
>>>>
>>>
>>> No argument there. Since you care about non-virt scenarios and virtio
>>> doesn't, naturally vbus is a better fit for them as the code stands.
>>>
>> Thanks for finally starting to acknowledge there's a benefit, at least.
>>
>
> I think I've mentioned vbus' finer grained layers as helpful here,
> though I doubt the value of this. Hypervisors are added rarely, while
> devices and drivers are added (and modified) much more often. I don't
> buy the anything-to-anything promise.

The ease with which a new hypervisor should be able to integrate into the
stack is only one of vbus's many benefits.

>
>> To be more precise, IMO virtio is designed to be a performance oriented
>> ring-based driver interface that supports all types of hypervisors (e.g.
>> shmem based kvm, and non-shmem based Xen). vbus is designed to be a
>> high-performance generic shared-memory interconnect (for rings or
>> otherwise) framework for environments where linux is the underpinning
>> "host" (physical or virtual). They are distinctly different, but
>> complementary (the former addresses the part of the front-end, and
>> latter addresses the back-end, and a different part of the front-end).
>>
>
> They're not truly complementary since they're incompatible.

No, that is incorrect. Not to be rude, but for clarity:

Complementary \Com`ple*men"ta*ry\, a.
Serving to fill out or to complete; as, complementary
numbers.
[1913 Webster]

Citation: www.dict.org

IOW: Something being complementary has nothing to do with guest/host
binary compatibility. virtio-pci and virtio-vbus are both equally
complementary to virtio since they fill in the bottom layer of the
virtio stack.

So yes, vbus is truly complementary to virtio afaict.

> A 2.6.27 guest, or Windows guest with the existing virtio drivers, won't work
> over vbus.

Binary compatibility with existing virtio drivers, while nice to have,
is not a specific requirement or goal. We will simply load an updated
KMP/MSI into those guests and they will work again. As previously
discussed, this is how more or less any system works today. It's like
we are removing an old adapter card and adding a new one to "uprev the
silicon".

> Further, non-shmem virtio can't work over vbus.

Actually I misspoke earlier when I said virtio works over non-shmem.
Thinking about it some more, both virtio and vbus fundamentally require
shared-memory, since sharing their metadata concurrently on both sides
is their raison d'être.

The difference is that virtio utilizes a pre-translation/mapping (via
->add_buf) from the guest side. OTOH, vbus uses a post-translation
scheme (via memctx) from the host side. If anything, vbus is actually
more flexible because it doesn't assume the entire guest address space
is directly mappable.

In summary, your statement is incorrect (though it is my fault for
putting that idea in your head).
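
To make the distinction concrete, here is a small self-contained
userspace sketch of the post-translation half (the region values and
names are made up for illustration, and this is not the real memctx
API; it simply mirrors what vhost's find_region()/translate_desc() do
on the host side). The pre-translation half is what ->add_buf does in
the guest: buffers are resolved to guest-physical addresses before the
descriptor is ever published.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical region table, same shape as vhost_memory_region:
 * guest-physical ranges backed by host userspace addresses. */
struct region {
        uint64_t guest_phys_addr;
        uint64_t memory_size;
        uint64_t userspace_addr;
};

static const struct region regions[] = {
        { 0x00000000ULL, 0x000a0000ULL, 0x7f0000000000ULL },
        { 0x00100000ULL, 0x3ff00000ULL, 0x7f00000a0000ULL },
};

/* Post-translation: the descriptor carries a guest-physical address,
 * and the host resolves it against its table only when it actually
 * consumes the descriptor.  Ranges that are never referenced never
 * need to be mapped. */
static uint64_t post_translate(uint64_t gpa)
{
        size_t i;

        for (i = 0; i < sizeof(regions) / sizeof(regions[0]); i++) {
                const struct region *r = &regions[i];

                if (gpa >= r->guest_phys_addr &&
                    gpa - r->guest_phys_addr < r->memory_size)
                        return r->userspace_addr +
                               (gpa - r->guest_phys_addr);
        }
        return 0;       /* not mapped */
}

int main(void)
{
        printf("gpa 0x100040 -> hva 0x%llx\n",
               (unsigned long long)post_translate(0x100040ULL));
        return 0;
}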

> Since
> virtio is guest-oriented and host-agnostic, it can't ignore
> non-shared-memory hosts (even though it's unlikely virtio will be
> adopted there)

Well, to be fair, no one said it has to ignore them. Either the virtio-vbus
transport is present and available to the virtio stack, or it isn't. If
it's present, it may or may not publish objects for consumption.
Providing a virtio-vbus transport in no way limits or degrades the
existing capabilities of the virtio stack. It only enhances them.

I digress. The whole point is moot since I realized that the non-shmem
distinction isn't accurate anyway. They both require shared-memory for
the metadata, and IIUC virtio requires the entire address space to be
mappable whereas vbus only assumes the metadata is.

>
>> In addition, the kvm-connector used in AlacrityVM's design strives to
>> add value and improve performance via other mechanisms, such as dynamic
>> allocation, interrupt coalescing (thus reducing exit-ratio, which is a
>> serious issue in KVM)
>
> Do you have measurements of inter-interrupt coalescing rates (excluding
> intra-interrupt coalescing)?

I actually do not have a rig set up to explicitly test inter-interrupt
rates at the moment. Once things stabilize for me, I will try to
re-gather some numbers here. Last time I looked, however, there were
some decent savings for inter as well.

Inter rates are interesting because they are what tends to ramp up with
IO load more than intra, since guest interrupt mitigation techniques like
NAPI often quell intra-rates naturally. This is especially true for
data-center, cloud, hpc-grid, etc., kinds of workloads (vs vanilla
desktops, etc.) that tend to have multiple IO ports (multi-homed nics,
disk-io, etc.). Those various ports tend to be workload-related to one
another (e.g. a 3-tier web stack may use multi-homed network and disk-io
at the same time, triggered by one IO event).

An interesting thing here is that you don't even need a fancy
multi-homed setup to see the effects of my exit-ratio reduction work:
even single-port configurations suffer from the phenomenon, since many
devices have multiple signal-flows (e.g. network adapters tend to have
at least 3 flows: rx-ready, tx-complete, and control-events (link-state,
etc)). What's worse is that the flows are often indirectly related (for
instance, many host adapters will free tx skbs during rx operations, so
you tend to get bursts of tx-completes at the same time as rx-ready). If
the flows map 1:1 with the IDT, they will suffer the same problem.

In any case, here is an example run of a simple single-homed guest over
standard GigE. What's interesting here is the .qnotify to .notify
ratio, as this is the interrupt-to-signal ratio. In this case it is
151918/170047, which comes out to about 11% savings in interrupt injections:

vbus-guest:/home/ghaskins # netperf -H dev
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
dev.laurelwood.net (192.168.1.10) port 0 AF_INET
Recv Send Send
Socket Socket Message Elapsed
Size Size Size Time Throughput
bytes bytes bytes secs. 10^6bits/sec

1048576 16384 16384 10.01 940.77
vbus-guest:/home/ghaskins # cat /sys/kernel/debug/pci-to-vbus-bridge
.events : 170048
.qnotify : 151918
.qinject : 0
.notify : 170047
.inject : 18238
.bridgecalls : 18
.buscalls : 12
vbus-guest:/home/ghaskins # cat /proc/interrupts
CPU0
0: 87 IO-APIC-edge timer
1: 6 IO-APIC-edge i8042
4: 733 IO-APIC-edge serial
6: 2 IO-APIC-edge floppy
7: 0 IO-APIC-edge parport0
8: 0 IO-APIC-edge rtc0
9: 0 IO-APIC-fasteoi acpi
10: 0 IO-APIC-fasteoi virtio1
12: 90 IO-APIC-edge i8042
14: 3041 IO-APIC-edge ata_piix
15: 1008 IO-APIC-edge ata_piix
24: 151933 PCI-MSI-edge vbus
25: 0 PCI-MSI-edge virtio0-config
26: 190 PCI-MSI-edge virtio0-input
27: 28 PCI-MSI-edge virtio0-output
NMI: 0 Non-maskable interrupts
LOC: 9854 Local timer interrupts
SPU: 0 Spurious interrupts
CNT: 0 Performance counter interrupts
PND: 0 Performance pending work
RES: 0 Rescheduling interrupts
CAL: 0 Function call interrupts
TLB: 0 TLB shootdowns
TRM: 0 Thermal event interrupts
THR: 0 Threshold APIC interrupts
MCE: 0 Machine check exceptions
MCP: 1 Machine check polls
ERR: 0
MIS: 0

It's important to note here that we are actually looking at the interrupt
rate, not the exit rate (which is usually a multiple of the interrupt
rate, since you have to factor in as many as three exits per interrupt
(IPI, window, EOI). Therefore we saved about 18k interrupts in this 10
second burst, but we may have actually saved up to 54k exits in the
process. This is only over a 10 second window at GigE rates, so YMMV.
These numbers get even more dramatic on higher end hardware, but I
haven't had a chance to generate new numbers yet.

Looking at some external stats paints an even bleaker picture: "exits"
as reported by kvm_stat for virtio-pci based virtio-net tip the scales
at 65k/s vs 36k/s for vbus based venet. And virtio is consuming ~30% of
my quad-core's cpu, vs 19% for venet during the test. It's hard to know
which innovation or innovations may be responsible for the entire
reduction, but certainly the interrupt-to-signal ratio mentioned above
is probably helping.

The even worse news for 1:1 models is that the ratio of
exits-per-interrupt climbs with load (exactly when it hurts the most)
since that is when the probability that the vcpu will need all three
exits is the highest.

>
>> and priortizable/nestable signals.
>>
>
> That doesn't belong in a bus.

Everyone is of course entitled to an opinion, but the industry as a
whole would disagree with you. Signal path routing (1:1, aggregated,
etc) is at the discretion of the bus designer. Most buses actually do
_not_ support 1:1 with IDT (think USB, SCSI, IDE, etc).

PCI is somewhat of an outlier in that regard afaict. It's actually a
nice feature of PCI when it's used within its design spec (HW). For
SW/PV, 1:1 suffers from, among other issues, that "triple-exit scaling"
issue in the signal path I mentioned above. This is one of the many
reasons I think PCI is not the best choice for PV.

>
>> Today there is a large performance disparity between what a KVM guest
>> sees and what a native linux application sees on that same host. Just
>> take a look at some of my graphs between "virtio", and "native", for
>> example:
>>
>> http://developer.novell.com/wiki/images/b/b7/31-rc4_throughput.png
>>
>
> That's a red herring. The problem is not with virtio as an ABI, but
> with its implementation in userspace. vhost-net should offer equivalent
> performance to vbus.

That's pure speculation. I would advise you to reserve such statements
until after a proper bakeoff can be completed. This is not to mention
that vhost-net does nothing to address our other goals, like scheduler
coordination and non-802.x fabrics.

>
>> A dominant vbus design principle is to try to achieve the same IO
>> performance for all "linux applications" whether they be literally
>> userspace applications, or things like KVM vcpus or Ira's physical
>> boards. It also aims to solve problems not previously expressible with
>> current technologies (even virtio), like nested real-time.
>>
>> And even though you repeatedly insist otherwise, the neat thing here is
>> that the two technologies mesh (at least under certain circumstances,
>> like when virtio is deployed on a shared-memory friendly linux backend
>> like KVM). I hope that my stack diagram below depicts that clearly.
>>
>
> Right, when you ignore the points where they don't fit, it's a perfect
> mesh.

Where doesn't it fit?

>
>>> But that's not a strong argument for vbus; instead of adding vbus you
>>> could make virtio more friendly to non-virt
>>>
>> Actually, it _is_ a strong argument then because adding vbus is what
>> helps makes virtio friendly to non-virt, at least for when performance
>> matters.
>>
>
> As vhost-net shows, you can do that without vbus

Citation please. Afaict, the one use case that we looked at for vhost
outside of KVM failed to adapt properly, so I do not see how this is true.

> and without breaking compatibility.

Compatibility with what? vhost hasn't even been officially deployed in
KVM environments afaict, nevermind non-virt. Therefore, how could it
possibly have compatibility constraints with something non-virt already?
Citation please.

>
>
>
>>> Right. virtio assumes that it's in a virt scenario and that the guest
>>> architecture already has enumeration and hotplug mechanisms which it
>>> would prefer to use. That happens to be the case for kvm/x86.
>>>
>> No, virtio doesn't assume that. Its stack provides the "virtio-bus"
>> abstraction and what it does assume is that it will be wired up to
>> something underneath. Kvm/x86 conveniently has pci, so the virtio-pci
>> adapter was created to reuse much of that facility. For other things
>> like lguest and s390, something new had to be created underneath to make
>> up for the lack of pci-like support.
>>
>
> Right, I was wrong there. But it does allow you to have a 1:1 mapping
> between native devices and virtio devices.

vbus allows you to have 1:1 if that is what you want, but we strive to
do better.

>
>
>>>> So to answer your question, the difference is that the part that has to
>>>> be customized in vbus should be a fraction of what needs to be
>>>> customized with vhost because it defines more of the stack.
>>>>
>>> But if you want to use the native mechanisms, vbus doesn't have any
>>> added value.
>>>
>> First of all, thats incorrect. If you want to use the "native"
>> mechanisms (via the way the vbus-connector is implemented, for instance)
>> you at least still have the benefit that the backend design is more
>> broadly re-useable in more environments (like non-virt, for instance),
>> because vbus does a proper job of defining the requisite
>> layers/abstractions compared to vhost. So it adds value even in that
>> situation.
>>
>
> Maybe. If vhost-net isn't sufficient I'm sure there will be patches sent.

It isn't, and I've already done that.

>
>> Second of all, with PV there is no such thing as "native". It's
>> software so it can be whatever we want. Sure, you could argue that the
>> guest may have built-in support for something like PCI protocol.

[1]

>> However, PCI protocol itself isn't suitable for high-performance PV out
>> of the can. So you will therefore invariably require new software
>> layers on top anyway, even if part of the support is already included.
>>
>
> Of course there is such a thing as native, a pci-ready guest has tons of
> support built into it

I specifically mentioned that already ([1]).

You are also overstating its role, since the basic OS is what implements
the native support for bus-objects, hotswap, etc, _not_ PCI. PCI just
rides underneath and feeds trivial events up, as do other bus-types
(usb, scsi, vbus, etc). And once those events are fed, you still need a
PV layer to actually handle the bus interface in a high-performance
manner, so it's not like you really have a "native" stack in either case.

> that doesn't need to be retrofitted.

No, that is incorrect. You have to heavily modify the pci model with
layers on top to get any kind of performance out of it. Otherwise, we
would just use realtek emulation, which is technically the native PCI
you are apparently so enamored with.

Not to mention there are things you just plain can't do in PCI today,
like dynamically assigning signal-paths, priorities, and coalescing.

> Since
> practically everyone (including Xen) does their paravirt drivers atop
> pci, the claim that pci isn't suitable for high performance is incorrect.

Actually IIUC, I think Xen bridges to their own bus as well (and only
where they have to), just like vbus. They don't use PCI natively. PCI
is perfectly suited as a bridge transport for PV, as I think the Xen and
vbus examples have demonstrated. It's the 1:1 device-model where PCI has
the most problems.

>
>
>> And lastly, why would you _need_ to use the so called "native"
>> mechanism? The short answer is, "you don't". Any given system (guest
>> or bare-metal) already have a wide-range of buses (try running "tree
>> /sys/bus" in Linux). More importantly, the concept of adding new buses
>> is widely supported in both the Windows and Linux driver model (and
>> probably any other guest-type that matters). Therefore, despite claims
>> to the contrary, its not hard or even unusual to add a new bus to the
>> mix.
>>
>
> The short answer is "compatibility".

There was a point in time where the same could be said for virtio-pci
based drivers vs realtek and e1000, so that argument is demonstrably
silly. No one tried to make virtio work in a binary compatible way with
realtek emulation, yet we all survived the requirement for loading a
virtio driver to my knowledge.

The bottom line is: Binary device compatibility is not required in any
other system (as long as you follow sensible versioning/id rules), so
why is KVM considered special?

The fact is, it isn't special (at least not in this regard). What _is_
required is "support" and we fully intend to support these proposed
components. I assure you that at least the users that care about
maximum performance will not generally mind loading a driver. Most of
them would have to anyway if they want to get beyond realtek emulation.

>
>
>> In summary, vbus is simply one more bus of many, purpose built to
>> support high-end IO in a virt-like model, giving controlled access to
>> the linux-host underneath it. You can write a high-performance layer
>> below the OS bus-model (vbus), or above it (virtio-pci) but either way
>> you are modifying the stack to add these capabilities, so we might as
>> well try to get this right.
>>
>> With all due respect, you are making a big deal out of a minor issue.
>>
>
> It's not minor to me.

I am certainly in no position to tell you how to feel, but this
declaration would seem from my perspective to be more of a means to an
end than a legitimate concern. Otherwise we would never have had virtio
support in the first place, since it was not "compatible" with previous
releases.

>
>>>> And, as
>>>> eluded to in my diagram, both virtio-net and vhost (with some
>>>> modifications to fit into the vbus framework) are potentially
>>>> complementary, not competitors.
>>>>
>>>>
>>> Only theoretically. The existing installed base would have to be thrown
>>> away
>>>
>> "Thrown away" is pure hyperbole. The installed base, worse case, needs
>> to load a new driver for a missing device.
>
> Yes, we all know how fun this is.

Making systems perform 5x faster _is_ fun, yes. I love what I do for a
living.

> Especially if the device being changed is your boot disk.

If and when that becomes a priority concern, that would be a function
transparently supported in the BIOS shipped with the hypervisor, and
would thus be invisible to the user.

> You may not care about the pain caused to users, but I do, so I will
> continue to insist on compatibility.

No, you are incorrect on two counts.

1) Of course I care about pain to users or I wouldn't be funded. Right
now the pain from my perspective is caused to users in the
high-performance community who want to deploy KVM based solutions. They
are unable to do so due to its performance disparity compared to
bare-metal, outside of pass-through hardware which is not widely
available in a lot of existing deployments. I aim to fix that disparity
while reusing the existing hardware investment by writing smarter
software, and I assure you that these users won't mind loading a driver
in the guest to take advantage of it.

For the users that don't care about maximum performance, there is no
change (and thus zero pain) required. They can use realtek or virtio if
they really want to. Neither is going away to my knowledge, and lets
face it: 2.6Gb/s out of virtio to userspace isn't *that* bad. But "good
enough" isn't good enough, and I won't rest till we get to native
performance. Additionally, I want to support previously unavailable
modes of operations (e.g. real-time) and advanced fabrics (e.g. IB).

2) True pain to users is not caused by lack of binary compatibility.
It's caused by lack of support. And it's a good thing or we would all be
emulating 8086 architecture forever...

..oh wait, I guess we kind of do that already ;). But at least we can
slip in something more advanced once in a while (APIC vs PIC, USB vs
uart, iso9660 vs floppy, for instance) and update the guest stack
instead of insisting it must look like ISA forever for compatibility's sake.

>
>>> or we'd need to support both.
>>>
>>>
>>>
>> No matter what model we talk about, there's always going to be a "both"
>> since the userspace virtio models are probably not going to go away (nor
>> should they).
>>
>
> virtio allows you to have userspace-only, kernel-only, or
> start-with-userspace-and-move-to-kernel-later, all transparent to the
> guest. In many cases we'll stick with userspace-only.

The user will not care where the model lives, per se. Only that it is
supported, and it works well.

Likewise, I know from experience that the developer will not like
writing the same code twice, so the "runs in both" model is not
necessarily a great design trait either.

>
>>> All this is after kvm has decoded that vbus is addresses. It can't work
>>> without someone outside vbus deciding that.
>>>
>> How the connector message is delivered is really not relevant. Some
>> architectures will simply deliver the message point-to-point (like the
>> original hypercall design for KVM, or something like Ira's rig), and
>> some will need additional demuxing (like pci-bridge/pio based KVM).
>> It's an implementation detail of the connector.
>>
>> However, the real point here is that something needs to establish a
>> scoped namespace mechanism, add items to that namespace, and advertise
>> the presence of the items to the guest. vbus has this facility built in
>> to its stack. vhost doesn't, so it must come from elsewhere.
>>
>
> So we have: vbus needs a connector, vhost needs a connector. vbus
> doesn't need userspace to program the addresses (but does need userspace
> to instantiate the devices and to program the bus address decode)

First of all, bus-decode is substantially easier than per-device decode
(you have to track all those per-device/per-signal fds somewhere,
integrate with hotswap, etc), and it's only done once per guest at
startup and left alone. So it's already not apples to apples.

Second, while its true that the general kvm-connector bus-decode needs
to be programmed, that is a function of adapting to the environment
that _you_ created for me. The original kvm-connector was discovered
via cpuid and hypercalls, and didn't need userspace at all to set it up.
Therefore it would be entirely unfair of you to turn around and somehow
try to use that trait of the design against me since you yourself
imposed it.

As an additional data point, our other connectors have no such
bus-decode programming requirement. Therefore, this is clearly
just a property of the KVM environment, not a function of the overall
vbus design.

> vhost needs userspace to instantiate the devices and program the addresses.
>

Right. And among other shortcomings it also requires a KVM-esque memory
model (which is not always going to work as we recently discussed), and
a redundant device-model to back it up in userspace, which is a
development and maintenance burden, and an external bus-model (filled by
pio-bus in KVM today).

>>>> In fact, it's actually a simpler design to unify things this way
>>>> because
>>>> you avoid splitting the device model up. Consider how painful the vhost
>>>> implementation would be if it didn't already have the userspace
>>>> virtio-net to fall-back on. This is effectively what we face for new
>>>> devices going forward if that model is to persist.
>>>>
>>>>
>>>
>>> It doesn't have just virtio-net, it has userspace-based hotplug
>>>
>> vbus has hotplug too: mkdir and rmdir
>>
>
> Does that work from nonprivileged processes?

It will with the ioctl based control interface that I'll merge shortly.

> Does it work on Windows?

This question doesn't make sense. Hotswap control occurs on the host,
which is always Linux.

If you were asking about whether a windows guest will support hotswap:
the answer is "yes". Our windows driver presents a unique PDO/FDO pair
for each logical device instance that is pushed out (just like the built
in usb, pci, scsi bus drivers that windows supports natively).

>
>> As an added bonus, its device-model is modular. A developer can write a
>> new device model, compile it, insmod it to the host kernel, hotplug it
>> to the running guest with mkdir/ln, and then come back out again
>> (hotunplug with rmdir, rmmod, etc). They may do this all without taking
>> the guest down, and while eating QEMU based IO solutions for breakfast
>> performance wise.
>>
>> Afaict, qemu can't do either of those things.
>>
>
> We've seen that herring before,

Citation?

> and it's redder than ever.

This is more hyperbole. I doubt that there would be many that would
argue that a modular architecture (that we get for free with LKM
support) is not desirable, even if it's never used dynamically with a
running guest. OTOH, I actually use this dynamic feature all the time
as I test my components, so it's at least useful to me.

>
>
>
>>> Refactor instead of duplicating.
>>>
>> There is no duplicating. vbus has no equivalent today as virtio doesn't
>> define these layers.
>>
>
> So define them if they're missing.

I just did.

>
>
>>>>
>>>>
>>>>> Use libraries (virtio-shmem.ko, libvhost.so).
>>>>>
>>>>>
>>>> What do you suppose vbus is? vbus-proxy.ko = virtio-shmem.ko, and you
>>>> don't need libvhost.so per se since you can just use standard kernel
>>>> interfaces (like configfs/sysfs). I could create an .so going forward
>>>> for the new ioctl-based interface, I suppose.
>>>>
>>>>
>>> Refactor instead of rewriting.
>>>
>> There is no rewriting. vbus has no equivalent today as virtio doesn't
>> define these layers.
>>
>> By your own admission, you said if you wanted that capability, use a
>> library. What I think you are not understanding is vbus _is_ that
>> library. So what is the problem, exactly?
>>
>
> It's not compatible.

No, that is incorrect. What you are apparently not understanding is
that not only is vbus that library, but it's extensible. So even if
compatibility is your goal (it doesn't need to be IMO) it can be
accommodated by how you interface to the library.

> If you were truly worried about code duplication
> in virtio, you'd refactor it to remove the duplication,

My primary objective is creating an extensible, high-performance,
shared-memory interconnect for systems that utilize a Linux host as
their IO-hub. It just so happens that virtio can sit nicely on top of
such a model because shmem-rings are a subclass of shmem. As a result
of its design, vbus also helps to reduce code duplication in the stack
for new environments due to its extensible nature.

However, vbus also has goals beyond what virtio is providing today that
are of more concern, and part of that is designing a connector/bus that
eliminates the shortcomings in the current pci-based design.

> without affecting existing guests.

Already covered above.

>
>>>>> For kvm/x86 pci definitely remains king.
>>>>>
>>>>>
>>>> For full virtualization, sure. I agree. However, we are talking about
>>>> PV here. For PV, PCI is not a requirement and is a technical dead-end
>>>> IMO.
>>>>
>>>> KVM seems to be the only virt solution that thinks otherwise (*), but I
>>>> believe that is primarily a condition of its maturity. I aim to help
>>>> advance things here.
>>>>
>>>> (*) citation: xen has xenbus, lguest has lguest-bus, vmware has some
>>>> vmi-esque thing (I forget what it's called) to name a few. Love 'em or
>>>> hate 'em, most other hypervisors do something along these lines. I'd
>>>> like to try to create one for KVM, but to unify them all (at least for
>>>> the Linux-based host designs).
>>>>
>>>>
>>> VMware are throwing VMI away (won't be supported in their new product,
>>> and they've sent a patch to rip it off from Linux);
>>>
>> vmware only cares about x86 iiuc, so probably not a good example.
>>
>
> Well, you brought it up. Between you and me, I only care about x86 too.

Fair enough.

>
>>> Xen has to tunnel
>>> xenbus in pci for full virtualization (which is where Windows is, and
>>> where Linux will be too once people realize it's faster). lguest is
>>> meant as an example hypervisor, not an attempt to take over the world.
>>>
>> So pick any other hypervisor, and the situation is often similar.
>>
>
> The situation is often pci.

Even if that were true, which is debatable, do not confuse "convenient"
with "optimal". If you don't care about maximum performance and
advanced features like QOS, sure go ahead and use PCI. Why not.

>
>>
>>> And right now you can have a guest using pci to access a mix of
>>> userspace-emulated devices, userspace-emulated-but-kernel-accelerated
>>> virtio devices, and real host devices. All on one dead-end bus. Try
>>> that with vbus.
>>>
>> vbus is not interested in userspace devices. The charter is to provide
>> facilities for utilizing the host linux kernel's IO capabilities in the
>> most efficient, yet safe, manner possible. Those devices that fit
>> outside that charter can ride on legacy mechanisms if that suits them
>> best.
>>
>
> vbus isn't, but I am. I would prefer not to have to expose
> implementation decisions (kernel vs userspace) to the guest (vbus vs pci).
>
>>>> That won't cut it. For one, creating an eventfd is only part of the
>>>> equation. I.e. you need to have originate/terminate somewhere
>>>> interesting (and in-kernel, otherwise use tuntap).
>>>>
>>>>
>>> vbus needs the same thing so it cancels out.
>>>
>> No, it does not. vbus just needs a relatively simple single message
>> pipe between the guest and host (think "hypercall tunnel", if you will).
>>
>
> That's ioeventfd. So far so similar.

No, that is incorrect. For one, vhost uses them on a per-signal path
basis, whereas vbus only has one channel for the entire guest->host.
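
To illustrate the shape of that (with completely made-up structures --
this is *not* the actual vbus wire format, just a sketch of a single
multiplexed channel): the guest pushes small descriptors into one shared
ring, and a host-side demux fans them out to per-device handlers:

#include <linux/types.h>

#define RING_SIZE 256                   /* arbitrary, for illustration */

/* purely illustrative event descriptor */
struct event_desc {
        u32 dev_id;                     /* which device raised the event */
        u32 signal;                     /* which signal within that device */
};

struct demo_device {
        void (*signal)(struct demo_device *dev, u32 signal);
};

/* hypothetical lookup into the bus's device namespace */
extern struct demo_device *demo_lookup(u32 dev_id);

/* host side: drain the single guest->host channel and dispatch */
static void channel_demux(struct event_desc *ring, u32 head, u32 tail)
{
        while (head != tail) {
                struct event_desc *ev = &ring[head++ % RING_SIZE];
                struct demo_device *dev = demo_lookup(ev->dev_id);

                if (dev)
                        dev->signal(dev, ev->signal);
        }
}

The point being that per-device/per-signal fds are replaced by one pipe
plus an in-band namespace.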

Second, I do not use ioeventfd anymore because it has too many problems
with the surrounding technology. However, that is a topic for a
different thread.


>
>> Per queue/device addressing is handled by the same conceptual namespace
>> as the one that would trigger eventfds in the model you mention. And
>> that namespace is built in to the vbus stack, and objects are registered
>> automatically as they are created.
>>
>> Contrast that to vhost, which requires some other kernel interface to
>> exist, and to be managed manually for each object that is created. Your
>> libvhostconfig would need to somehow know how to perform this
>> registration operation, and there would have to be something in the
>> kernel to receive it, presumably on a per platform basis. Solving this
>> problem generally would probably end up looking eerily like vbus,
>> because that's what vbus does.
>>
>
> vbus devices aren't magically instantiated. Userspace needs to
> instantiate them too. Sure, there's less work on the host side since
> you're using vbus instead of the native interface, but more work on the
> guest side since you're using vbus instead of the native interface.


No, that is incorrect. The amount of "work" that a guest does is
actually the same in both cases, since the guest OS performs the hotswap
handling natively for all bus types (at least for Linux and Windows).
You still need to have a PV layer to interface with those objects in
both cases, as well, so there is no such thing as "native interface" for
PV. It's only a matter of where it occurs in the stack.

>
>
>
>>> Well, let's see. Can vbus today:
>>>
>>> - let userspace know which features are available (so it can decide if
>>> live migration is possible)
>>>
>> yes, it's in sysfs.
>>
>>
>>> - let userspace limit which features are exposed to the guest (so it can
>>> make live migration possible among hosts of different capabilities)
>>>
>> yes, it's in sysfs.
>>
>
> Per-device?

Yes, see /sys/vbus/devices/$dev/ to get per-instance attributes

> non-privileged-user capable?

The short answer is "not yet (I think)". I need to write a patch to
properly set the mode attribute in sysfs, but I think this will be trivial.

>
>>> - let userspace know which features were negotiated (so it can transfer
>>> them to the other host during live migration)
>>>
>> no, but we can easily add ->save()/->restore() to the model going
>> forward, and the negotiated features are just a subcomponent of its
>> serialized stream.
>>
>>
>>> - let userspace tell the kernel which features were negotiated (when
>>> live migration completes, to avoid requiring the guest to re-negotiate)
>>>
>> that would be the function of the ->restore() deserializer.
>>
>>
>>> - do all that from an unprivileged process
>>>
>> yes, in the upcoming alacrityvm v0.3 with the ioctl based control plane.
>>
>
> Ah, so you have two control planes.

So what? If anything, it goes to show how extensible the framework is
that a new plane could be added in 119 lines of code:

~/git/linux-2.6> stg show vbus-add-admin-ioctls.patch | diffstat
Makefile | 3 -
config-ioctl.c | 117
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 119 insertions(+), 1 deletion(-)

If and when having two control planes outlives its utility, I will submit
a simple patch that removes the useless one.

>
>> Bottom line: vbus isn't done, especially w.r.t. live-migration, but that
>> is not a valid argument against the idea if you believe in
>> release-early/release-often. kvm wasn't (isn't) done either when it was
>> proposed/merged.
>>
>>
>
> kvm didn't have an existing counterpart in Linux when it was
> proposed/merged.
>

And likewise, neither does vbus.

Kind Regards,
-Greg

From: Avi Kivity on
On 09/30/2009 10:04 PM, Gregory Haskins wrote:


>> A 2.6.27 guest, or Windows guest with the existing virtio drivers, won't work
>> over vbus.
>>
> Binary compatibility with existing virtio drivers, while nice to have,
> is not a specific requirement nor goal. We will simply load an updated
> KMP/MSI into those guests and they will work again. As previously
> discussed, this is how more or less any system works today. It's like
> we are removing an old adapter card and adding a new one to "uprev the
> silicon".
>

Virtualization is about not doing that. Sometimes it's necessary (when
you have made unfixable design mistakes), but doing it just to replace a
bus, with no advantage to the guest that has to be changed (other
hypervisors or hypervisorless deployment scenarios don't benefit that
guest), is not.

>> Further, non-shmem virtio can't work over vbus.
>>
> Actually I misspoke earlier when I said virtio works over non-shmem.
> Thinking about it some more, both virtio and vbus fundamentally require
> shared-memory, since sharing their metadata concurrently on both sides
> is their raison d'être.
>
> The difference is that virtio utilizes a pre-translation/mapping (via
> ->add_buf) from the guest side. OTOH, vbus uses a post translation
> scheme (via memctx) from the host-side. If anything, vbus is actually
> more flexible because it doesn't assume the entire guest address space
> is directly mappable.
>
> In summary, your statement is incorrect (though it is my fault for
> putting that idea in your head).
>

Well, Xen requires pre-translation (since the guest has to give the host
(which is just another guest) permissions to access the data). So
neither is a superset of the other, they're just different.

It doesn't really matter since Xen is unlikely to adopt virtio.

> An interesting thing here is that you don't even need a fancy
> multi-homed setup to see the effects of my exit-ratio reduction work:
> even single port configurations suffer from the phenomenon since many
> devices have multiple signal-flows (e.g. network adapters tend to have
> at least 3 flows: rx-ready, tx-complete, and control-events (link-state,
> etc)). What's worse is that the flows often are indirectly related (for
> instance, many host adapters will free tx skbs during rx operations, so
> you tend to get bursts of tx-completes at the same time as rx-ready). If
> the flows map 1:1 with IDT, they will suffer the same problem.
>

You can simply use the same vector for both rx and tx and poll both at
every interrupt.
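
For illustration, a combined handler looks roughly like this (the
structure and ring-cleaning helpers are made-up names, not taken from
any real driver):

#include <linux/interrupt.h>
#include <linux/netdevice.h>

struct demo_nic {
        struct napi_struct napi;
        /* rx/tx ring state, register mappings, etc. */
};

/* hypothetical helpers standing in for real ring processing */
extern void demo_clean_tx_ring(struct demo_nic *nic);
extern int demo_clean_rx_ring(struct demo_nic *nic, int budget);
extern void demo_irq_enable(struct demo_nic *nic);

/* one vector for everything: just schedule the poll routine */
static irqreturn_t demo_intr(int irq, void *data)
{
        struct demo_nic *nic = data;

        napi_schedule(&nic->napi);
        return IRQ_HANDLED;
}

/* the poll routine services both rings from the single interrupt */
static int demo_poll(struct napi_struct *napi, int budget)
{
        struct demo_nic *nic = container_of(napi, struct demo_nic, napi);
        int done;

        demo_clean_tx_ring(nic);                /* reap tx completions */
        done = demo_clean_rx_ring(nic, budget); /* deliver rx packets */

        if (done < budget) {
                napi_complete(napi);
                demo_irq_enable(nic);           /* device-specific unmask */
        }
        return done;
}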

> In any case, here is an example run of a simple single-homed guest over
> standard GigE. What's interesting here is the .qnotify to .notify
> ratio, as this is the interrupt-to-signal ratio. In this case, it's
> 170047/151918, which comes out to about 11% savings in interrupt injections:
>
> vbus-guest:/home/ghaskins # netperf -H dev
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> dev.laurelwood.net (192.168.1.10) port 0 AF_INET
> Recv Send Send
> Socket Socket Message Elapsed
> Size Size Size Time Throughput
> bytes bytes bytes secs. 10^6bits/sec
>
> 1048576 16384 16384 10.01 940.77
> vbus-guest:/home/ghaskins # cat /sys/kernel/debug/pci-to-vbus-bridge
> .events : 170048
> .qnotify : 151918
> .qinject : 0
> .notify : 170047
> .inject : 18238
> .bridgecalls : 18
> .buscalls : 12
> vbus-guest:/home/ghaskins # cat /proc/interrupts
> CPU0
> 0: 87 IO-APIC-edge timer
> 1: 6 IO-APIC-edge i8042
> 4: 733 IO-APIC-edge serial
> 6: 2 IO-APIC-edge floppy
> 7: 0 IO-APIC-edge parport0
> 8: 0 IO-APIC-edge rtc0
> 9: 0 IO-APIC-fasteoi acpi
> 10: 0 IO-APIC-fasteoi virtio1
> 12: 90 IO-APIC-edge i8042
> 14: 3041 IO-APIC-edge ata_piix
> 15: 1008 IO-APIC-edge ata_piix
> 24: 151933 PCI-MSI-edge vbus
> 25: 0 PCI-MSI-edge virtio0-config
> 26: 190 PCI-MSI-edge virtio0-input
> 27: 28 PCI-MSI-edge virtio0-output
> NMI: 0 Non-maskable interrupts
> LOC: 9854 Local timer interrupts
> SPU: 0 Spurious interrupts
> CNT: 0 Performance counter interrupts
> PND: 0 Performance pending work
> RES: 0 Rescheduling interrupts
> CAL: 0 Function call interrupts
> TLB: 0 TLB shootdowns
> TRM: 0 Thermal event interrupts
> THR: 0 Threshold APIC interrupts
> MCE: 0 Machine check exceptions
> MCP: 1 Machine check polls
> ERR: 0
> MIS: 0
>
> It's important to note here that we are actually looking at the interrupt
> rate, not the exit rate (which is usually a multiple of the interrupt
> rate, since you have to factor in as many as three exits per interrupt
> (IPI, window, EOI)). Therefore we saved about 18k interrupts in this 10
> second burst, but we may have actually saved up to 54k exits in the
> process. This is only over a 10 second window at GigE rates, so YMMV.
> These numbers get even more dramatic on higher end hardware, but I
> haven't had a chance to generate new numbers yet.
>

(irq window exits should only be required on a small percentage of
interrupt injections, since the guest will try to disable interrupts for
short periods only)

> Looking at some external stats paints an even bleaker picture: "exits"
> as reported by kvm_stat for virtio-pci based virtio-net tip the scales
> at 65k/s vs 36k/s for vbus based venet. And virtio is consuming ~30% of
> my quad-core's cpu, vs 19% for venet during the test. It's hard to know
> which innovation or innovations may be responsible for the entire
> reduction, but certainly the interrupt-to-signal ratio mentioned above
> is probably helping.
>

Can you please stop comparing userspace-based virtio hosts to
kernel-based venet hosts? We know the userspace implementation sucks.

> The even worse news for 1:1 models is that the ratio of
> exits-per-interrupt climbs with load (exactly when it hurts the most)
> since that is when the probability that the vcpu will need all three
> exits is the highest.
>

Requiring all three exits means the guest is spending most of its time
with interrupts disabled; that's unlikely.

Thanks for the numbers. Are those 11% attributable to rx/tx
piggybacking from the same interface?

Also, 170K interrupts -> 17K interrupts/sec -> 55kbit/interrupt ->
6.8kB/interrupt. Ignoring interrupt merging and assuming equal rx/tx
distribution, that's about 13kB/interrupt. Seems rather low for a
saturated link.

>>
>>> and prioritizable/nestable signals.
>>>
>>>
>> That doesn't belong in a bus.
>>
> Everyone is of course entitled to an opinion, but the industry as a
> whole would disagree with you. Signal path routing (1:1, aggregated,
> etc) is at the discretion of the bus designer. Most buses actually do
> _not_ support 1:1 with IDT (think USB, SCSI, IDE, etc).
>

With standard PCI, they do not. But all modern host adapters support
MSI and they will happily give you one interrupt per queue.
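
For example, a multiqueue driver typically does something along these
lines (sketch only; the queue count and handler name are invented):

#include <linux/pci.h>
#include <linux/interrupt.h>

#define DEMO_NR_QUEUES 4                /* arbitrary, for illustration */

/* hypothetical per-queue interrupt handler */
extern irqreturn_t demo_queue_intr(int irq, void *data);

static struct msix_entry demo_msix[DEMO_NR_QUEUES];

static int demo_setup_msix(struct pci_dev *pdev, void *queue[])
{
        int i, err;

        for (i = 0; i < DEMO_NR_QUEUES; i++)
                demo_msix[i].entry = i;

        /* ask for one MSI-X vector per queue */
        err = pci_enable_msix(pdev, demo_msix, DEMO_NR_QUEUES);
        if (err)
                return err;

        for (i = 0; i < DEMO_NR_QUEUES; i++) {
                err = request_irq(demo_msix[i].vector, demo_queue_intr,
                                  0, "demo-queue", queue[i]);
                if (err)
                        goto unwind;
        }
        return 0;

unwind:
        while (--i >= 0)
                free_irq(demo_msix[i].vector, queue[i]);
        pci_disable_msix(pdev);
        return err;
}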

> PCI is somewhat of an outlier in that regard afaict. It's actually a
> nice feature of PCI when it's used within its design spec (HW). For
> SW/PV, 1:1 suffers from, among other issues, that "triple-exit scaling"
> issue in the signal path I mentioned above. This is one of the many
> reasons I think PCI is not the best choice for PV.
>

Look at the vmxnet3 submission (recently posted on virtualization@).
It's a perfectly ordinary PCI NIC driver, apart from having so many 'V's
in the code. 16 rx queues, 8 tx queues, 25 MSIs, BARs for the
registers. So while the industry as a whole might disagree with me, it
seems VMware does not.


>>> http://developer.novell.com/wiki/images/b/b7/31-rc4_throughput.png
>>>
>>>
>> That's a red herring. The problem is not with virtio as an ABI, but
>> with its implementation in userspace. vhost-net should offer equivalent
>> performance to vbus.
>>
> That's pure speculation. I would advise you to reserve such statements
> until after a proper bakeoff can be completed.

Let's do that then. Please reserve the corresponding comparisons from
your side as well.

> This is not to mention
> that vhost-net does nothing to address our other goals, like scheduler
> coordination and non-802.x fabrics.
>

What are scheduler coordination and non-802.x fabrics?

>> Right, when you ignore the points where they don't fit, it's a perfect
>> mesh.
>>
> Where doesn't it fit?
>

(avoiding infinite loop)

>>>> But that's not a strong argument for vbus; instead of adding vbus you
>>>> could make virtio more friendly to non-virt
>>>>
>>>>
>>> Actually, it _is_ a strong argument then because adding vbus is what
>>> helps makes virtio friendly to non-virt, at least for when performance
>>> matters.
>>>
>>>
>> As vhost-net shows, you can do that without vbus
>>
> Citation please. Afaict, the one use case that we looked at for vhost
> outside of KVM failed to adapt properly, so I do not see how this is true.
>

I think Ira said he can make vhost work?

>> and without breaking compatibility.
>>
> Compatibility with what? vhost hasn't even been officially deployed in
> KVM environments afaict, nevermind non-virt. Therefore, how could it
> possibly have compatibility constraints with something non-virt already?
> Citation please.
>

virtio-net over pci is deployed. Replacing the backend with vhost-net
will require no guest modifications. Replacing the frontend with venet
or virt-net/vbus-pci will require guest modifications.

Obviously virtio-net isn't deployed in non-virt. But if we adopt vbus,
we have to migrate guests.



>> Of course there is such a thing as native, a pci-ready guest has tons of
>> support built into it
>>
> I specifically mentioned that already ([1]).
>
> You are also overstating its role, since the basic OS is what implements
> the native support for bus-objects, hotswap, etc, _not_ PCI. PCI just
> rides underneath and feeds trivial events up, as do other bus-types
> (usb, scsi, vbus, etc).

But we have to implement vbus for each guest we want to support. That
includes Windows and older Linux which has a different internal API, so
we have to port the code multiple times, to get existing functionality.

> And once those events are fed, you still need a
> PV layer to actually handle the bus interface in a high-performance
> manner so it's not like you really have a "native" stack in either case.
>

virtio-net doesn't use any pv layer.

>> that doesn't need to be retrofitted.
>>
> No, that is incorrect. You have to heavily modify the pci model with
> layers on top to get any kind of performance out of it. Otherwise, we
> would just use realtek emulation, which is technically the native PCI
> you are apparently so enamored with.
>

virtio-net doesn't modify the PCI model. And if you look at vmxnet3,
they mention that it conforms to something called UPT, which allows
hardware vendors to implement parts of their NIC model. So vmxnet3 is
apparently suitable for both hardware and software implementations.

> Not to mention there are things you just plain can't do in PCI today,
> like dynamically assign signal-paths,

You can have dynamic MSI/queue routing with virtio, and each MSI can be
routed to a vcpu at will.

> priority, and coalescing, etc.
>

Do you mean interrupt priority? Well, apic allows interrupt priorities
and Windows uses them; Linux doesn't. I don't see a reason to provide
more than native hardware.

>> Since
>> practically everyone (including Xen) does their paravirt drivers atop
>> pci, the claim that pci isn't suitable for high performance is incorrect.
>>
> Actually IIUC, I think Xen bridges to their own bus as well (and only
> where they have to), just like vbus. They don't use PCI natively. PCI
> is perfectly suited as a bridge transport for PV, as I think the Xen and
> vbus examples have demonstrated. It's the 1:1 device-model where PCI has
> the most problems.
>

N:1 breaks down on large guests since one vcpu will have to process all
events. You could do N:M, with commands to change routings, but where's
your userspace interface? You can't tell from /proc/interrupts which
vbus interrupts are active, and irqbalance can't steer them towards less
busy cpus since they're invisible to the interrupt controller.


>>> And lastly, why would you _need_ to use the so called "native"
>>> mechanism? The short answer is, "you don't". Any given system (guest
>>> or bare-metal) already have a wide-range of buses (try running "tree
>>> /sys/bus" in Linux). More importantly, the concept of adding new buses
>>> is widely supported in both the Windows and Linux driver model (and
>>> probably any other guest-type that matters). Therefore, despite claims
>>> to the contrary, it's not hard or even unusual to add a new bus to the
>>> mix.
>>>
>>>
>> The short answer is "compatibility".
>>
> There was a point in time where the same could be said for virtio-pci
> based drivers vs realtek and e1000, so that argument is demonstrably
> silly. No one tried to make virtio work in a binary compatible way with
> realtek emulation, yet we all survived the requirement for loading a
> virtio driver to my knowledge.
>

The larger your installed base, the more difficult it is. Of course
it's doable, but I prefer not doing it and instead improving things in a
binary backwards compatible manner. If there is no choice we will bow
to the inevitable and make our users upgrade. But at this point there
is a choice, and I prefer to stick with vhost-net until it is proven
that it won't work.

> The bottom line is: Binary device compatibility is not required in any
> other system (as long as you follow sensible versioning/id rules), so
> why is KVM considered special?
>

One of the benefits of virtualization is that the guest model is
stable. You can live-migrate guests and upgrade the hardware
underneath. You can have a single guest image that you clone to
provision new guests. If you switch to a new model, you give up those
benefits, or you support both models indefinitely.

Note that even hardware nowadays is binary compatible. One e1000 driver
supports a ton of different cards, and I think (not sure) newer cards
will work with older drivers, just without all their features.

> The fact is, it isn't special (at least not in this regard). What _is_
> required is "support" and we fully intend to support these proposed
> components. I assure you that at least the users that care about
> maximum performance will not generally mind loading a driver. Most of
> them would have to anyway if they want to get beyond realtek emulation.
>

For a new install, sure. I'm talking about existing deployments (and
those that will exist by the time vbus is ready for roll out).

> I am certainly in no position to tell you how to feel, but this
> declaration would seem from my perspective to be more of a means to an
> end than a legitimate concern. Otherwise we would never have had virtio
> support in the first place, since it was not "compatible" with previous
> releases.
>

virtio was certainly not pain free, needing Windows drivers, updates to
management tools (you can't enable it by default, so you have to offer
it as a choice), mkinitrd, etc. I'd rather not have to go through that
again.

>> Especially if the device changed is your boot disk.
>>
> If and when that becomes a priority concern, that would be a function
> transparently supported in the BIOS shipped with the hypervisor, and
> would thus be invisible to the user.
>

No, you have to update the driver in your initrd (for Linux) or properly
install the new driver (for Windows). It's especially difficult for
Windows.

>> You may not care about the pain caused to users, but I do, so I will
>> continue to insist on compatibility.
>>
> For the users that don't care about maximum performance, there is no
> change (and thus zero pain) required. They can use realtek or virtio if
> they really want to. Neither is going away to my knowledge, and let's
> face it: 2.6Gb/s out of virtio to userspace isn't *that* bad. But "good
> enough" isn't good enough, and I won't rest till we get to native
> performance.

I don't want to support both virtio and vbus in parallel. There's
enough work already. If we adopt vbus, we'll have to deprecate and
eventually kill off virtio.

> 2) True pain to users is not caused by lack of binary compatibility.
> It's caused by lack of support. And it's a good thing or we would all be
> emulating 8086 architecture forever...
>
> ..oh wait, I guess we kind of do that already ;). But at least we can
> slip in something more advanced once in a while (APIC vs PIC, USB vs
> uart, iso9660 vs floppy, for instance) and update the guest stack
> instead of insisting it must look like ISA forever for compatibility's sake.
>

PCI is continuously updated, with MSI, MSI-X, and IOMMU support being
some recent updates. I'd like to ride on top of that instead of having
to clone it for every guest I support.

>> So we have: vbus needs a connector, vhost needs a connector. vbus
>> doesn't need userspace to program the addresses (but does need userspace
>> to instantiate the devices and to program the bus address decode)
>>
> First of all, bus-decode is substantially easier than per-device decode
> (you have to track all those per-device/per-signal fds somewhere,
> integrate with hotswap, etc), and it's only done once per guest at
> startup and left alone. So it's already not apples to apples.
>

Right, it means you can hand off those eventfds to other qemus or other
pure userspace servers. It's more flexible.
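
For instance, a standalone userspace server that has been handed a kick
eventfd just blocks on read(); each read returns the number of kicks
coalesced since the last one (plain eventfd semantics, nothing
vhost-specific about it):

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/* 'kickfd' is an eventfd received from the management process, e.g.
 * over an SCM_RIGHTS message; how it got here is out of scope. */
static void serve_ring(int kickfd)
{
        uint64_t kicks;

        for (;;) {
                if (read(kickfd, &kicks, sizeof(kicks)) != sizeof(kicks))
                        break;
                /* process the ring; 'kicks' notifications were coalesced */
                printf("servicing ring after %llu kick(s)\n",
                       (unsigned long long)kicks);
        }
}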

> Second, while its true that the general kvm-connector bus-decode needs
> to be programmed, that is a function of adapting to the environment
> that _you_ created for me. The original kvm-connector was discovered
> via cpuid and hypercalls, and didn't need userspace at all to set it up.
> Therefore it would be entirely unfair of you to turn around and somehow
> try to use that trait of the design against me since you yourself
> imposed it.
>

No kvm feature will ever be exposed to a guest without userspace
intervention. It's a basic requirement. If it causes complexity (and
it does) we have to live with it.

>> Does it work on Windows?
>>
> This question doesn't make sense. Hotswap control occurs on the host,
> which is always Linux.
>
> If you were asking about whether a windows guest will support hotswap:
> the answer is "yes". Our windows driver presents a unique PDO/FDO pair
> for each logical device instance that is pushed out (just like the built
> in usb, pci, scsi bus drivers that windows supports natively).
>

Ah, you have a Windows venet driver?


>>> As an added bonus, its device-model is modular. A developer can write a
>>> new device model, compile it, insmod it to the host kernel, hotplug it
>>> to the running guest with mkdir/ln, and then come back out again
>>> (hotunplug with rmdir, rmmod, etc). They may do this all without taking
>>> the guest down, and while eating QEMU based IO solutions for breakfast
>>> performance wise.
>>>
>>> Afaict, qemu can't do either of those things.
>>>
>>>
>> We've seen that herring before,
>>
> Citation?
>

It's the compare venet-in-kernel to virtio-in-userspace thing again.
Let's defer that until mst completes vhost-net mergeable buffers, at which
time we can compare vhost-net to venet and see how much vbus contributes
to performance and how much of it comes from being in-kernel.

>>>> Refactor instead of duplicating.
>>>>
>>>>
>>> There is no duplicating. vbus has no equivalent today as virtio doesn't
>>> define these layers.
>>>
>>>
>> So define them if they're missing.
>>
> I just did.
>

Since this is getting confusing to me, I'll start from scratch looking
at the vbus layers, top to bottom:

Guest side:
1. venet guest kernel driver - AFAICT, duplicates the virtio-net guest
driver functionality
2. vbus guest driver (config and hotplug) - duplicates pci, or if you
need non-pci support, virtio config and its pci bindings; needs
reimplementation for all supported guests
3. vbus guest driver (interrupt coalescing, priority) - if needed,
should be implemented as an irqchip (and be totally orthogonal to the
driver); needs reimplementation for all supported guests
4. vbus guest driver (shm/ioq) - finer-grained layering than virtio
(which only supports the combination, due to the need for Xen support);
can be retrofitted to virtio at some cost

Host side:
1. venet host kernel driver - is duplicated by vhost-net; doesn't
support live migration, unprivileged users, or slirp
2. vbus host driver (config and hotplug) - duplicates pci support in
userspace (which will need to be kept in any case); already has two
userspace interfaces
3. vbus host driver (interrupt coalescing, priority) - if we think we
need it (and I don't), should be part of kvm core, not a bus
4. vbus host driver (shm) - partially duplicated by vhost memory slots
5. vbus host driver (ioq) - duplicates userspace virtio, duplicated by vhost

>>> There is no rewriting. vbus has no equivalent today as virtio doesn't
>>> define these layers.
>>>
>>> By your own admission, you said if you wanted that capability, use a
>>> library. What I think you are not understanding is vbus _is_ that
>>> library. So what is the problem, exactly?
>>>
>>>
>> It's not compatible.
>>
> No, that is incorrect. What you are apparently not understanding is
> that not only is vbus that library, but it's extensible. So even if
> compatibility is your goal (it doesn't need to be IMO) it can be
> accommodated by how you interface to the library.
>

To me, compatible means I can live migrate an image to a new system
without the user knowing about the change. You'll be able to do that
with vhost-net.

>>>>
>>>>
>>> No, it does not. vbus just needs a relatively simple single message
>>> pipe between the guest and host (think "hypercall tunnel", if you will).
>>>
>>>
>> That's ioeventfd. So far so similar.
>>
> No, that is incorrect. For one, vhost uses them on a per-signal path
> basis, whereas vbus only has one channel for the entire guest->host.
>

You'll probably need to change that as you start running smp guests.

> Second, I do not use ioeventfd anymore because it has too many problems
> with the surrounding technology. However, that is a topic for a
> different thread.
>

Please post your issues. I see ioeventfd/irqfd as critical kvm interfaces.
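
(For reference, the userspace half of irqfd really is tiny -- roughly
the following, give or take the exact structure layout in the 2.6.32-rc
headers; the gsi value is whatever the device model assigned:)

#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Wire an eventfd to guest interrupt injection: whoever signals the
 * returned fd (kernel or userspace) causes KVM to inject 'gsi' into
 * the guest, with no extra round trip through the VMM. */
static int wire_irqfd(int vmfd, unsigned int gsi)
{
        struct kvm_irqfd req;
        int fd = eventfd(0, 0);

        if (fd < 0)
                return -1;

        memset(&req, 0, sizeof(req));
        req.fd = fd;
        req.gsi = gsi;

        if (ioctl(vmfd, KVM_IRQFD, &req) < 0) {
                close(fd);
                return -1;
        }
        return fd;
}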

>> vbus devices aren't magically instantiated. Userspace needs to
>> instantiate them too. Sure, there's less work on the host side since
>> you're using vbus instead of the native interface, but more work on the
>> guest side since you're using vbus instead of the native interface.
>>
>
> No, that is incorrect. The amount of "work" that a guest does is
> actually the same in both cases, since the guest OS performs the hotswap
> handling natively for all bus types (at least for Linux and Windows).
> You still need to have a PV layer to interface with those objects in
> both cases, as well, so there is no such thing as "native interface" for
> PV. It's only a matter of where it occurs in the stack.
>

I'm missing something. Where's the pv layer for virtio-net?

Linux drivers have an abstraction layer to deal with non-pci. But the
Windows drivers are ordinary pci drivers with nothing that looks
pv-ish. You could implement virtio-net hardware if you wanted to.

>> non-privileged-user capable?
>>
> The short answer is "not yet (I think)". I need to write a patch to
> properly set the mode attribute in sysfs, but I think this will be trivial.
>
>

(and selinux label)

>> Ah, so you have two control planes.
>>
> So what? If anything, it goes to show how extensible the framework is
> that a new plane could be added in 119 lines of code:
>
> ~/git/linux-2.6> stg show vbus-add-admin-ioctls.patch | diffstat
> Makefile | 3 -
> config-ioctl.c | 117
> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 119 insertions(+), 1 deletion(-)
>
> If and when having two control planes outlives its utility, I will submit
> a simple patch that removes the useless one.
>

It always begins with a 119-line patch and then grows, that's life.

>> kvm didn't have an existing counterpart in Linux when it was
>> proposed/merged.
>>
> And likewise, neither does vbus.
>
>

For virt uses, I don't see the need. For non-virt, I have no opinion.


--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

From: Michael S. Tsirkin on
On Thu, Oct 01, 2009 at 10:34:17AM +0200, Avi Kivity wrote:
>> Second, I do not use ioeventfd anymore because it has too many problems
>> with the surrounding technology. However, that is a topic for a
>> different thread.
>>
>
> Please post your issues. I see ioeventfd/irqfd as critical kvm interfaces.

I second that. AFAIK ioeventfd/irqfd got exposed to userspace in 2.6.32-rc1,
if there are issues we'd better nail them before 2.6.32 is out.
And yes, please start a different thread.

--
MST