From: Michael S. Tsirkin
On Tue, Nov 24, 2009 at 08:54:23AM +1030, Rusty Russell wrote:
> On Tue, 24 Nov 2009 02:37:01 am Shirley Ma wrote:
> > > > + skb = (struct sk_buff *)buf;
> > > This cast is unnecessary, but a comment would be nice:
> >
> > Without this cast there is a compile warning.
>
> Hi Shirley,
>
> Looks like buf is a void *, so no cast should be necessary. But I could
> be reading the patch wrong.
>
> > > However, I question whether making it 16 byte is the right thing: the
> > > ethernet header is 14 bytes long, so don't we want 8 bytes of padding?
> >
> > Because QEMU requires the 10-byte virtio_net_hdr in a separate element,
> > one page is shared between the 10-byte virtio_net_hdr and the rest of
> > the data. So I put a 6-byte offset here between the two buffers. I
> > didn't look into why a separate buf is used for virtio_net_hdr in QEMU.
>
> It's a qemu bug. It insists the header be an element in the scatterlist by
> itself. Unfortunately we have to accommodate it.
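
(To make the quoted padding numbers concrete, here is a tiny standalone
sketch; the 10-byte virtio_net_hdr and 14-byte Ethernet header come from
the discussion above, and the 16-byte alignment check is only illustrative.)

#include <stdio.h>

/* Illustration only: where does the IP header land for 6 vs. 8 bytes of
 * padding after a 10-byte virtio_net_hdr, given a 14-byte Ethernet header? */
#define VNET_HDR_LEN 10     /* sizeof(struct virtio_net_hdr) */
#define ETH_HLEN     14     /* Ethernet header */

int main(void)
{
    int pad;

    for (pad = 6; pad <= 8; pad += 2) {
        int data_off = VNET_HDR_LEN + pad;  /* start of Ethernet frame */
        int ip_off   = data_off + ETH_HLEN; /* start of IP header */

        printf("pad=%d: frame at offset %d, IP header at offset %d (%s16-byte aligned)\n",
               pad, data_off, ip_off, ip_off % 16 ? "not " : "");
    }
    return 0;
}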

We do? Let's just fix this?
All we have to do is replace the memcpy with a proper iovec walk, correct?
Something like the following (untested) patch? It's probably not too
late to put this in the next qemu release...

Signed-off-by: Michael S. Tsirkin <mst(a)redhat.com>


diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index 2f147e5..06c5148 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -434,26 +434,59 @@ static int iov_fill(struct iovec *iov, int iovcnt, const void *buf, int count)
     return offset;
 }
 
+static int iov_skip(struct iovec *iov, int iovcnt, int count)
+{
+    int offset, i;
+
+    offset = i = 0;
+    while (offset < count && i < iovcnt) {
+        int len = MIN(iov[i].iov_len, count - offset);
+        iov[i].iov_base += len;
+        iov[i].iov_len -= len;
+        offset += len;
+        i++;
+    }
+
+    return offset;
+}
+
+static int iov_copy(struct iovec *to, struct iovec *from, int iovcnt, int count)
+{
+    int offset, i;
+
+    offset = i = 0;
+    while (offset < count && i < iovcnt) {
+        int len = MIN(from[i].iov_len, count - offset);
+        to[i].iov_base = from[i].iov_base;
+        to[i].iov_len = from[i].iov_len;
+        offset += len;
+        i++;
+    }
+
+    return i;
+}
+
 static int receive_header(VirtIONet *n, struct iovec *iov, int iovcnt,
                           const void *buf, size_t size, size_t hdr_len)
 {
-    struct virtio_net_hdr *hdr = (struct virtio_net_hdr *)iov[0].iov_base;
+    struct virtio_net_hdr hdr = {};
     int offset = 0;
 
-    hdr->flags = 0;
-    hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
+    hdr.flags = 0;
+    hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
 
     if (n->has_vnet_hdr) {
-        memcpy(hdr, buf, sizeof(*hdr));
-        offset = sizeof(*hdr);
-        work_around_broken_dhclient(hdr, buf + offset, size - offset);
+        memcpy(&hdr, buf, sizeof hdr);
+        offset = sizeof hdr;
+        work_around_broken_dhclient(&hdr, buf + offset, size - offset);
     }
 
+    iov_fill(iov, iovcnt, &hdr, sizeof hdr);
+
     /* We only ever receive a struct virtio_net_hdr from the tapfd,
      * but we may be passing along a larger header to the guest.
      */
-    iov[0].iov_base += hdr_len;
-    iov[0].iov_len -= hdr_len;
+    iov_skip(iov, iovcnt, hdr_len);
 
     return offset;
 }
@@ -514,7 +547,8 @@ static int receive_filter(VirtIONet *n, const uint8_t *buf, int size)
 static ssize_t virtio_net_receive(VLANClientState *vc, const uint8_t *buf, size_t size)
 {
     VirtIONet *n = vc->opaque;
-    struct virtio_net_hdr_mrg_rxbuf *mhdr = NULL;
+    struct iovec mhdr[VIRTQUEUE_MAX_SIZE];
+    int mhdrcnt = 0;
     size_t hdr_len, offset, i;
 
     if (!virtio_net_can_receive(n->vc))
@@ -552,16 +586,13 @@ static ssize_t virtio_net_receive(VLANClientState *vc, const uint8_t *buf, size_t size)
             exit(1);
         }
 
-        if (!n->mergeable_rx_bufs && elem.in_sg[0].iov_len != hdr_len) {
-            fprintf(stderr, "virtio-net header not in first element\n");
-            exit(1);
-        }
-
         memcpy(&sg, &elem.in_sg[0], sizeof(sg[0]) * elem.in_num);
 
         if (i == 0) {
-            if (n->mergeable_rx_bufs)
-                mhdr = (struct virtio_net_hdr_mrg_rxbuf *)sg[0].iov_base;
+            if (n->mergeable_rx_bufs) {
+                mhdrcnt = iov_copy(mhdr, sg, elem.in_num,
+                                   sizeof(struct virtio_net_hdr_mrg_rxbuf));
+            }
 
             offset += receive_header(n, sg, elem.in_num,
                                      buf + offset, size - offset, hdr_len);
@@ -579,8 +610,12 @@ static ssize_t virtio_net_receive(VLANClientState *vc, const uint8_t *buf, size_t size)
         offset += len;
     }
 
-    if (mhdr)
-        mhdr->num_buffers = i;
+    if (mhdrcnt) {
+        uint16_t num = i;
+        iov_skip(mhdr, mhdrcnt,
+                 offsetof(struct virtio_net_hdr_mrg_rxbuf, num_buffers));
+        iov_fill(mhdr, mhdrcnt, &num, sizeof num);
+    }
 
     virtqueue_flush(n->rx_vq, i);
     virtio_notify(&n->vdev, n->rx_vq);
@@ -627,20 +662,19 @@ static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
             sizeof(struct virtio_net_hdr_mrg_rxbuf) :
             sizeof(struct virtio_net_hdr);
 
-        if (out_num < 1 || out_sg->iov_len != hdr_len) {
-            fprintf(stderr, "virtio-net header not in first element\n");
+        if (out_num < 1) {
+            fprintf(stderr, "virtio-net: no output element\n");
             exit(1);
         }
 
         /* ignore the header if GSO is not supported */
         if (!n->has_vnet_hdr) {
-            out_num--;
-            out_sg++;
+            iov_skip(out_sg, out_num, hdr_len);
             len += hdr_len;
         } else if (n->mergeable_rx_bufs) {
             /* tapfd expects a struct virtio_net_hdr */
             hdr_len -= sizeof(struct virtio_net_hdr);
-            out_sg->iov_len -= hdr_len;
+            iov_skip(out_sg, out_num, hdr_len);
             len += hdr_len;
         }
 
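For what it's worth, here is a small standalone sketch (not part of the
patch) of what the helpers above buy us: with iov_skip()/iov_fill(),
num_buffers may straddle an element boundary instead of having to live in
the first element. iov_fill() below is assumed to mirror the existing qemu
helper the patch builds on, the mhdr array stands in for what iov_copy()
would return, and the header structs mirror the virtio layouts; build with
gcc (the helpers use void-pointer arithmetic, as in the patch).

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <sys/uio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

struct virtio_net_hdr {
    uint8_t  flags;
    uint8_t  gso_type;
    uint16_t hdr_len;
    uint16_t gso_size;
    uint16_t csum_start;
    uint16_t csum_offset;
};

struct virtio_net_hdr_mrg_rxbuf {
    struct virtio_net_hdr hdr;
    uint16_t num_buffers;
};

/* Copied from the patch above: advance the iovec past count bytes. */
static int iov_skip(struct iovec *iov, int iovcnt, int count)
{
    int offset = 0, i = 0;

    while (offset < count && i < iovcnt) {
        int len = MIN(iov[i].iov_len, count - offset);
        iov[i].iov_base += len;
        iov[i].iov_len -= len;
        offset += len;
        i++;
    }
    return offset;
}

/* Assumed to match qemu's existing iov_fill(): scatter buf into iov. */
static int iov_fill(struct iovec *iov, int iovcnt, const void *buf, int count)
{
    int offset = 0, i = 0;

    while (offset < count && i < iovcnt) {
        int len = MIN(iov[i].iov_len, count - offset);
        memcpy(iov[i].iov_base, buf + offset, len);
        offset += len;
        i++;
    }
    return offset;
}

int main(void)
{
    /* Two guest buffers of 11 and 8 bytes: the 12-byte mergeable header
     * spans both, so num_buffers (bytes 10..11) straddles the boundary. */
    unsigned char a[11] = { 0 }, b[8] = { 0 };
    struct iovec mhdr[2] = { { a, sizeof a }, { b, sizeof b } };
    uint16_t num = 3;

    iov_skip(mhdr, 2, offsetof(struct virtio_net_hdr_mrg_rxbuf, num_buffers));
    iov_fill(mhdr, 2, &num, sizeof num);

    /* On little-endian this prints a[10]=3 b[0]=0: the field was written
     * across the two elements, which the old code could not do. */
    printf("num_buffers bytes: a[10]=%d b[0]=%d\n", a[10], b[0]);
    return 0;
}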
From: Michael S. Tsirkin
On Tue, Nov 24, 2009 at 08:36:32AM -0600, Anthony Liguori wrote:
> Michael S. Tsirkin wrote:
>> On Tue, Nov 24, 2009 at 08:54:23AM +1030, Rusty Russell wrote:
>>
>>> On Tue, 24 Nov 2009 02:37:01 am Shirley Ma wrote:
>>>
>>>>>> + skb = (struct sk_buff *)buf;
>>>>>>
>>>>> This cast is unnecessary, but a comment would be nice:
>>>>>
>>>> Without this cast there is a compile warning.
>>> Hi Shirley,
>>>
>>> Looks like buf is a void *, so no cast should be necessary. But I could
>>> be reading the patch wrong.
>>>
>>>
>>>>> However, I question whether making it 16 byte is the right thing: the
>>>>> ethernet header is 14 bytes long, so don't we want 8 bytes of padding?
>>>>>
>>>> Because QEMU requires the 10-byte virtio_net_hdr in a separate element,
>>>> one page is shared between the 10-byte virtio_net_hdr and the rest of
>>>> the data. So I put a 6-byte offset here between the two buffers. I
>>>> didn't look into why a separate buf is used for virtio_net_hdr in QEMU.
>>>>
>>> It's a qemu bug. It insists the header be an element in the scatterlist by
>>> itself. Unfortunately we have to accommodate it.
>>>
>>
>> We do? Let's just fix this?
>>
>
> So does lguest.

It does? All I see it doing is writev/readv,
and this passes things to tap which handles
this correctly.


> It's been that way since the beginning. Fixing this
> would result in breaking older guests.

If you look at my patch, it handles old guests just fine :).

> We really need to introduce a feature bit if we want to change this.

I am not sure I agree: we can't add feature bits
for all bugs, can we?

--
MST
From: Michael S. Tsirkin
On Tue, Nov 24, 2009 at 08:36:32AM -0600, Anthony Liguori wrote:
> Michael S. Tsirkin wrote:
>> On Tue, Nov 24, 2009 at 08:54:23AM +1030, Rusty Russell wrote:
>>
>>> On Tue, 24 Nov 2009 02:37:01 am Shirley Ma wrote:
>>>
>>>>>> + skb = (struct sk_buff *)buf;
>>>>>>
>>>>> This cast is unnecessary, but a comment would be nice:
>>>>>
>>>> Without this cast there is a compile warning.
>>> Hi Shirley,
>>>
>>> Looks like buf is a void *, so no cast should be necessary. But I could
>>> be reading the patch wrong.
>>>
>>>
>>>>> However, I question whether making it 16 byte is the right thing: the
>>>>> ethernet header is 14 bytes long, so don't we want 8 bytes of padding?
>>>>>
>>>> Because QEMU requires the 10-byte virtio_net_hdr in a separate element,
>>>> one page is shared between the 10-byte virtio_net_hdr and the rest of
>>>> the data. So I put a 6-byte offset here between the two buffers. I
>>>> didn't look into why a separate buf is used for virtio_net_hdr in QEMU.
>>>>
>>> It's a qemu bug. It insists the header be an element in the scatterlist by
>>> itself. Unfortunately we have to accommodate it.
>>>
>>
>> We do? Let's just fix this?
>>
>
> So does lguest. It's been that way since the beginning. Fixing this
> would result in breaking older guests.

The patch you are replying to fixes this in a way that does not break older guests.

> We really need to introduce a feature bit if we want to change this.

--
MST
From: Michael S. Tsirkin
On Wed, Nov 25, 2009 at 10:42:06AM +1030, Rusty Russell wrote:
> On Tue, 24 Nov 2009 10:07:54 pm Michael S. Tsirkin wrote:
> > On Tue, Nov 24, 2009 at 08:54:23AM +1030, Rusty Russell wrote:
> > > On Tue, 24 Nov 2009 02:37:01 am Shirley Ma wrote:
> > > > > > + skb = (struct sk_buff *)buf;
> > > > > This cast is unnecessary, but a comment would be nice:
> > > >
> > > > Without this cast there is a compile warning.
> > >
> > > Hi Shirley,
> > >
> > > Looks like buf is a void *, so no cast should be necessary. But I could
> > > be reading the patch wrong.
> > >
> > > > > However, I question whether making it 16 byte is the right thing: the
> > > > > ethernet header is 14 bytes long, so don't we want 8 bytes of padding?
> > > >
> > > > Because QEMU requires the 10-byte virtio_net_hdr in a separate element,
> > > > one page is shared between the 10-byte virtio_net_hdr and the rest of
> > > > the data. So I put a 6-byte offset here between the two buffers. I
> > > > didn't look into why a separate buf is used for virtio_net_hdr in QEMU.
> > >
> > > It's a qemu bug. It insists the header be an element in the scatterlist by
> > > itself. Unfortunately we have to accommodate it.
> >
> > We do? Let's just fix this?
> > All we have to do is replace the memcpy with a proper iovec walk, correct?
> > Something like the following (untested) patch? It's probably not too
> > late to put this in the next qemu release...
>
> You might want to implement a more generic helper which does:
>
> /* Return pointer into iovec if we can, otherwise copy into buf */
> void *pull_iovec(struct iovec *iov, int iovcnt, void *buf, size_t len)
> {
>     int i;
>     void *p;
>
>     if (likely(iovcnt && iov[0].iov_len >= len)) {
>         /* Nice contiguous chunk. */
>         p = iov[0].iov_base;
>         iov[0].iov_base += len;
>         iov[0].iov_len -= len;
>         return p;
>     }
>
>     p = buf;
>     for (i = 0; i < iovcnt; i++) {
>         size_t this_len = MIN(len, iov[i].iov_len);
>         memcpy(p, iov[i].iov_base, this_len);
>         p += this_len;
>         iov[i].iov_base += this_len;
>         iov[i].iov_len -= this_len;
>         len -= this_len;
>         if (len == 0)
>             return buf;
>     }
>     /* BTW, we screwed your iovec. */
>     return NULL;
> }
>
> Then use it in all the virtio drivers...

Hmm, is it really worth it to save a header copy if it's linear? We are
going to access it anyway, and it fits into one cacheline nicely. On the
other hand, we have more code, making life harder for the compiler and
the processor.

> Thanks!
> Rusty.
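
As a purely hypothetical illustration of the "use it in all the virtio
drivers" idea, here is a self-contained sketch of a virtio-blk-style request
header being pulled out of a scatterlist with the helper above, whether or
not the header sits alone in the first element. The struct layout mirrors
linux/virtio_blk.h; the rest (names, element sizes) is invented for the
example, and the code assumes gcc for the void-pointer arithmetic.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/uio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define likely(x) (x)

struct virtio_blk_outhdr {
    uint32_t type;
    uint32_t ioprio;
    uint64_t sector;
};

/* Rusty's helper from above: return a pointer into the iovec when the
 * first element is big enough, otherwise gather into buf.  Either way
 * the iovec is advanced past len bytes. */
static void *pull_iovec(struct iovec *iov, int iovcnt, void *buf, size_t len)
{
    int i;
    void *p;

    if (likely(iovcnt && iov[0].iov_len >= len)) {
        p = iov[0].iov_base;
        iov[0].iov_base += len;
        iov[0].iov_len -= len;
        return p;
    }

    p = buf;
    for (i = 0; i < iovcnt; i++) {
        size_t this_len = MIN(len, iov[i].iov_len);
        memcpy(p, iov[i].iov_base, this_len);
        p += this_len;
        iov[i].iov_base += this_len;
        iov[i].iov_len -= this_len;
        len -= this_len;
        if (len == 0)
            return buf;
    }
    return NULL;    /* ran out of iovec */
}

int main(void)
{
    struct virtio_blk_outhdr req = { .type = 1, .ioprio = 0, .sector = 1234 };
    struct virtio_blk_outhdr hdr_buf, *hdr;
    /* Guest splits the 16-byte header across two elements (6 + 10 bytes),
     * which a "header must be the first element by itself" check rejects. */
    struct iovec out_sg[2] = {
        { (unsigned char *)&req,     6 },
        { (unsigned char *)&req + 6, 10 },
    };

    hdr = pull_iovec(out_sg, 2, &hdr_buf, sizeof *hdr);
    if (!hdr) {
        fprintf(stderr, "request header too short\n");
        return 1;
    }
    printf("type=%u sector=%llu copied=%d\n", hdr->type,
           (unsigned long long)hdr->sector, hdr == &hdr_buf);
    return 0;
}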
From: Michael S. Tsirkin
On Wed, Nov 25, 2009 at 08:50:21PM +1030, Rusty Russell wrote:
> On Wed, 25 Nov 2009 07:45:30 pm Michael S. Tsirkin wrote:
> > Hmm, is it really worth it to save a header copy if it's linear? We are
> > going to access it anyway, and it fits into one cacheline nicely. On the
> > other hand, we have more code, making life harder for the compiler and
> > the processor.
>
> Not sure: I think there would be many places where it would be useful.
>
> We do a similar thing in the kernel to inspect non-linear packets, and
> it's served us well.

You mean this gives a measurable speedup? Okay ...

> Cheers,
> Rusty.