From: Michael S. Tsirkin on
> virtio: put last_used and last_avail index into ring itself.
>
> Generally, the other end of the virtio ring doesn't need to see where
> you're up to in consuming the ring. However, to completely understand
> what's going on from the outside, this information must be exposed.
> For example, if you want to save and restore a virtio_ring, but you're
> not the consumer because the kernel is using it directly.
>
> Fortunately, we have room to expand: the ring is always a whole number
> of pages and there's hundreds of bytes of padding after the avail ring
> and the used ring, whatever the number of descriptors (which must be a
> power of 2).
>
> We add a feature bit so the guest can tell the host that it's writing
> out the current value there, if it wants to use that.
>
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

I've been looking at this patch some more (more on why
later), and I wonder: would it be better to add some
alignment to the last used index address, so that
if we later add more stuff at the tail, it all
fits in a single cache line?

We use a new feature bit anyway, so layout change should not be
a problem.

Since I raised the question of caches: the used ring's entries are not
aligned to 64 bits, so on CPUs with cache lines of 64 bits or larger,
used entries will often cross cache line boundaries. Am I right, and
might it have been better to align ring entries to cache line boundaries?

What do you think?
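
For concreteness, a rough worked example (my own arithmetic, assuming
num = 256 and a 4096-byte ring alignment, not taken from the patch) of
where the published index lands:

	desc table:  offset 0,    16 * 256 = 4096 bytes
	avail ring:  offset 4096, 2 + 2 + 2 * 256 = 516 bytes
	last_used:   offset 4612, the spare __u16 after available[num]
	used ring:   offset 8192, the next align boundary

4612 mod 64 == 4612 mod 128 == 4, so the published index shares a cache
line with the tail of the avail ring, and anything added after it stays
in that line only up to the next 64- or 128-byte boundary.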

> ---
> drivers/virtio/virtio_ring.c | 23 +++++++++++++++--------
> include/linux/virtio_ring.h | 12 +++++++++++-
> 2 files changed, 26 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -71,9 +71,6 @@ struct vring_virtqueue
> /* Number we've added since last sync. */
> unsigned int num_added;
>
> - /* Last used index we've seen. */
> - u16 last_used_idx;
> -
> /* How to notify other side. FIXME: commonalize hcalls! */
> void (*notify)(struct virtqueue *vq);
>
> @@ -278,12 +275,13 @@ static void detach_buf(struct vring_virt
>
> static inline bool more_used(const struct vring_virtqueue *vq)
> {
> - return vq->last_used_idx != vq->vring.used->idx;
> + return vring_last_used(&vq->vring) != vq->vring.used->idx;
> }
>
> static void *vring_get_buf(struct virtqueue *_vq, unsigned int *len)
> {
> struct vring_virtqueue *vq = to_vvq(_vq);
> + struct vring_used_elem *u;
> void *ret;
> unsigned int i;
>
> @@ -300,8 +298,11 @@ static void *vring_get_buf(struct virtqu
> return NULL;
> }
>
> - i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id;
> - *len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len;
> + u = &vq->vring.used->ring[vring_last_used(&vq->vring) % vq->vring.num];
> + i = u->id;
> + *len = u->len;
> + /* Make sure we don't reload i after doing checks. */
> + rmb();
>
> if (unlikely(i >= vq->vring.num)) {
> BAD_RING(vq, "id %u out of range\n", i);
> @@ -315,7 +316,8 @@ static void *vring_get_buf(struct virtqu
> /* detach_buf clears data, so grab it now. */
> ret = vq->data[i];
> detach_buf(vq, i);
> - vq->last_used_idx++;
> + vring_last_used(&vq->vring)++;
> +
> END_USE(vq);
> return ret;
> }
> @@ -402,7 +404,6 @@ struct virtqueue *vring_new_virtqueue(un
> vq->vq.name = name;
> vq->notify = notify;
> vq->broken = false;
> - vq->last_used_idx = 0;
> vq->num_added = 0;
> list_add_tail(&vq->vq.list, &vdev->vqs);
> #ifdef DEBUG
> @@ -413,6 +414,10 @@ struct virtqueue *vring_new_virtqueue(un
>
> vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC);
>
> + /* We publish indices whether they offer it or not: if not, it's junk
> + * space anyway. But calling this acknowledges the feature. */
> + virtio_has_feature(vdev, VIRTIO_RING_F_PUBLISH_INDICES);
> +
> /* No callback? Tell other side not to bother us. */
> if (!callback)
> vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
> @@ -443,6 +448,8 @@ void vring_transport_features(struct vir
> switch (i) {
> case VIRTIO_RING_F_INDIRECT_DESC:
> break;
> + case VIRTIO_RING_F_PUBLISH_INDICES:
> + break;
> default:
> /* We don't understand this bit. */
> clear_bit(i, vdev->features);
> diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
> --- a/include/linux/virtio_ring.h
> +++ b/include/linux/virtio_ring.h
> @@ -29,6 +29,9 @@
> /* We support indirect buffer descriptors */
> #define VIRTIO_RING_F_INDIRECT_DESC 28
>
> +/* We publish our last-seen used index at the end of the avail ring. */
> +#define VIRTIO_RING_F_PUBLISH_INDICES 29
> +
> /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */
> struct vring_desc
> {
> @@ -87,6 +90,7 @@ struct vring {
> * __u16 avail_flags;
> * __u16 avail_idx;
> * __u16 available[num];
> + * __u16 last_used_idx;
> *
> * // Padding to the next align boundary.
> * char pad[];
> @@ -95,6 +99,7 @@ struct vring {
> * __u16 used_flags;
> * __u16 used_idx;
> * struct vring_used_elem used[num];
> + * __u16 last_avail_idx;
> * };
> */
> static inline void vring_init(struct vring *vr, unsigned int num, void *p,
> @@ -111,9 +116,14 @@ static inline unsigned vring_size(unsign
> {
> return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (2 + num)
> + align - 1) & ~(align - 1))
> - + sizeof(__u16) * 2 + sizeof(struct vring_used_elem) * num;
> + + sizeof(__u16) * 2 + sizeof(struct vring_used_elem) * num + 2;
> }
>
> +/* We publish the last-seen used index at the end of the available ring, and
> + * vice-versa. These are at the end for backwards compatibility. */
> +#define vring_last_used(vr) ((vr)->avail->ring[(vr)->num])
> +#define vring_last_avail(vr) (*(__u16 *)&(vr)->used->ring[(vr)->num])
> +
> #ifdef __KERNEL__
> #include <linux/irqreturn.h>
> struct virtio_device;
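
As an aside, here is a minimal sketch of the save/restore use the
changelog mentions, assuming the macros above and a negotiated
VIRTIO_RING_F_PUBLISH_INDICES; the vring_state structure and helper
name are made up for illustration, not part of the patch:

#include <linux/virtio_ring.h>

struct vring_state {
	__u16 avail_idx;	/* guest's producer index */
	__u16 used_idx;		/* host's producer index */
	__u16 last_avail;	/* host's consumption point, now visible */
	__u16 last_used;	/* guest's consumption point, now visible */
};

/* Without the feature bit the two tail slots are just junk space, so
 * the saved values would be meaningless. */
static void vring_save_state(const struct vring *vr, struct vring_state *s)
{
	s->avail_idx  = vr->avail->idx;
	s->used_idx   = vr->used->idx;
	s->last_avail = vring_last_avail(vr);
	s->last_used  = vring_last_used(vr);
}
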
From: Rusty Russell on
On Wed, 5 May 2010 03:52:36 am Michael S. Tsirkin wrote:
> > virtio: put last_used and last_avail index into ring itself.
> >
> > Generally, the other end of the virtio ring doesn't need to see where
> > you're up to in consuming the ring. However, to completely understand
> > what's going on from the outside, this information must be exposed.
> > For example, if you want to save and restore a virtio_ring, but you're
> > not the consumer because the kernel is using it directly.
> >
> > Fortunately, we have room to expand: the ring is always a whole number
> > of pages and there's hundreds of bytes of padding after the avail ring
> > and the used ring, whatever the number of descriptors (which must be a
> > power of 2).
> >
> > We add a feature bit so the guest can tell the host that it's writing
> > out the current value there, if it wants to use that.
> >
> > Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
>
> I've been looking at this patch some more (more on why
> later), and I wonder: would it be better to add some
> alignment to the last used index address, so that
> if we later add more stuff at the tail, it all
> fits in a single cache line?

In theory, but not in practice. We don't have many rings, so the
difference between 1 and 2 cache lines is not very much.

> We use a new feature bit anyway, so layout change should not be
> a problem.
>
> Since I raised the question of caches: the used ring's entries are not
> aligned to 64 bits, so on CPUs with cache lines of 64 bits or larger,
> used entries will often cross cache line boundaries. Am I right, and
> might it have been better to align ring entries to cache line boundaries?
>
> What do you think?

I think everyone is settled on 128 byte cache lines for the foreseeable
future, so it's not really an issue.

Cheers,
Rusty.
From: Michael S. Tsirkin on
On Thu, May 06, 2010 at 10:22:12AM +0930, Rusty Russell wrote:
> On Wed, 5 May 2010 03:52:36 am Michael S. Tsirkin wrote:
> > > virtio: put last_used and last_avail index into ring itself.
> > >
> > > Generally, the other end of the virtio ring doesn't need to see where
> > > you're up to in consuming the ring. However, to completely understand
> > > what's going on from the outside, this information must be exposed.
> > > For example, if you want to save and restore a virtio_ring, but you're
> > > not the consumer because the kernel is using it directly.
> > >
> > > Fortunately, we have room to expand: the ring is always a whole number
> > > of pages and there's hundreds of bytes of padding after the avail ring
> > > and the used ring, whatever the number of descriptors (which must be a
> > > power of 2).
> > >
> > > We add a feature bit so the guest can tell the host that it's writing
> > > out the current value there, if it wants to use that.
> > >
> > > Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
> >
> > I've been looking at this patch some more (more on why
> > later), and I wonder: would it be better to add some
> > alignment to the last used index address, so that
> > if we later add more stuff at the tail, it all
> > fits in a single cache line?
>
> In theory, but not in practice. We don't have many rings, so the
> difference between 1 and 2 cache lines is not very much.

Fair enough.

> > We use a new feature bit anyway, so layout change should not be
> > a problem.
> >
> > Since I raised the question of caches: the used ring's entries are not
> > aligned to 64 bits, so on CPUs with cache lines of 64 bits or larger,
> > used entries will often cross cache line boundaries. Am I right, and
> > might it have been better to align ring entries to cache line boundaries?
> >
> > What do you think?
>
> I think everyone is settled on 128 byte cache lines for the foreseeable
> future, so it's not really an issue.
>
> Cheers,
> Rusty.

You mean with 64 bit descriptors we will be bouncing a cache line
between host and guest, anyway?

--
MST
From: Rusty Russell on
On Thu, 6 May 2010 03:57:55 pm Michael S. Tsirkin wrote:
> On Thu, May 06, 2010 at 10:22:12AM +0930, Rusty Russell wrote:
> > On Wed, 5 May 2010 03:52:36 am Michael S. Tsirkin wrote:
> > > What do you think?
> >
> > I think everyone is settled on 128 byte cache lines for the foreseeable
> > future, so it's not really an issue.
>
> You mean with 64 bit descriptors we will be bouncing a cache line
> between host and guest, anyway?

I'm confused by this entire thread.

Descriptors are 16 bytes. They are at the start, so presumably aligned to
cache boundaries.

Available ring follows that at 2 bytes per entry, so it's also packed nicely
into cachelines.

Then there's padding to page boundary. That puts us on a cacheline again
for the used ring; also 2 bytes per entry.

I don't see how any change in layout could be more cache friendly?
Rusty.
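
A small self-check of those offsets (an illustrative userspace sketch,
assuming num is a power of 2 and the 4096-byte alignment virtio-pci
uses; check_layout is not an existing function):

#include <assert.h>
#include <stdint.h>
#include <linux/virtio_ring.h>

static void check_layout(void *p, unsigned int num)
{
	struct vring vr;

	vring_init(&vr, num, p, 4096);
	/* 16-byte descriptors sit at the very start of the ring. */
	assert((void *)vr.desc == p);
	/* The avail ring follows the descriptor table immediately. */
	assert((char *)vr.avail == (char *)p + 16 * num);
	/* Padding rounds the used ring up to a page (and cache line) boundary. */
	assert(((uintptr_t)vr.used & 4095) == 0);
}
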
From: Michael S. Tsirkin on
On Fri, May 07, 2010 at 12:35:39PM +0930, Rusty Russell wrote:
> On Thu, 6 May 2010 03:57:55 pm Michael S. Tsirkin wrote:
> > On Thu, May 06, 2010 at 10:22:12AM +0930, Rusty Russell wrote:
> > > On Wed, 5 May 2010 03:52:36 am Michael S. Tsirkin wrote:
> > > > What do you think?
> > >
> > > I think everyone is settled on 128 byte cache lines for the foreseeable
> > > future, so it's not really an issue.
> >
> > You mean with 64 bit descriptors we will be bouncing a cache line
> > between host and guest, anyway?
>
> I'm confused by this entire thread.
>
> Descriptors are 16 bytes. They are at the start, so presumably aligned to
> cache boundaries.
>
> Available ring follows that at 2 bytes per entry, so it's also packed nicely
> into cachelines.
>
> Then there's padding to page boundary. That puts us on a cacheline again
> for the used ring; also 2 bytes per entry.
>

Hmm, is used ring really 2 bytes per entry?


/* u32 is used here for ids for padding reasons. */
struct vring_used_elem {
	/* Index of start of used descriptor chain. */
	__u32 id;
	/* Total length of the descriptor chain which was used (written to) */
	__u32 len;
};

struct vring_used {
	__u16 flags;
	__u16 idx;
	struct vring_used_elem ring[];
};

> I don't see how any change in layout could be more cache friendly?
> Rusty.

I thought that the used ring has 8 bytes per entry, and that struct
vring_used is aligned at a page boundary, which would mean that the
ring elements start at an offset of 4 bytes from the page boundary.
Thus with a cacheline size of 128 bytes, every 16th element crosses
a cacheline boundary. If we had a 4 byte padding after idx, each
used element would always be completely within a single cacheline.

What am I missing?
--
MST
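
A quick back-of-the-envelope check of that (my own sketch, not part of
the thread), using the sizes above (a 4-byte flags/idx header followed
by 8-byte entries) and an assumed 128-byte cache line:

#include <stdio.h>

int main(void)
{
	const unsigned line = 128;		/* assumed cache line size */
	unsigned i;

	for (i = 0; i < 64; i++) {
		unsigned start = 4 + 8 * i;	/* ring[] begins 4 bytes into the page */
		unsigned end = start + 8;	/* sizeof(struct vring_used_elem) */
		if (start / line != (end - 1) / line)
			printf("used[%u] straddles a cache line\n", i);
	}
	return 0;
}

This prints used[15], used[31], used[47] and used[63]: one entry in 16.
With 4 bytes of padding after idx, start becomes 8 + 8 * i and no entry
straddles a boundary for any cache line size that is a multiple of 8.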