Discussion:
[dpdk-dev] [PATCH] optimize vhost enqueue
(too old to reply)
Zhihong Wang
2016-08-16 03:50:02 UTC
Permalink
This patch optimizes the vhost enqueue function: rte_vhost_enqueue_burst.

Currently there're 2 callbacks for vhost enqueue:
* virtio_dev_merge_rx for mrg_rxbuf turned on cases.
* virtio_dev_rx for mrg_rxbuf turned off cases.

The virtio_dev_merge_rx doesn't provide optimal performance, also it is
reported having compatibility issue working with Windows VMs.

Besides, having 2 separated functions increases maintenance efforts.

This patch uses a single function logic to replace the current 2 for
better maintainability, and provides better performance by optimizing
caching behavior especially for mrg_rxbuf turned on cases.

It also fixes the issue working with Windows VMs.

Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/vhost-net.h | 6 +-
lib/librte_vhost/vhost_rxtx.c | 582 ++++++++++++++----------------------------
lib/librte_vhost/virtio-net.c | 15 +-
3 files changed, 208 insertions(+), 395 deletions(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 38593a2..a15182c 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -71,7 +71,7 @@ struct vhost_virtqueue {
uint32_t size;

/* Last index used on the available ring */
- volatile uint16_t last_used_idx;
+ uint16_t last_used_idx;
#define VIRTIO_INVALID_EVENTFD (-1)
#define VIRTIO_UNINITIALIZED_EVENTFD (-2)

@@ -85,6 +85,10 @@ struct vhost_virtqueue {

/* Physical address of used ring, for logging */
uint64_t log_guest_addr;
+
+ /* Shadow used ring for performance */
+ struct vring_used_elem *shadow_used_ring;
+ uint32_t shadow_used_idx;
} __rte_cache_aligned;

/* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 08a73fd..1263168 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
}

-static void
+static inline void __attribute__((always_inline))
virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
{
if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -125,427 +125,227 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
}
}

-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
- struct virtio_net_hdr_mrg_rxbuf hdr)
-{
- if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
- *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
- else
- *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
-}
-
-static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
- struct rte_mbuf *m, uint16_t desc_idx)
+uint16_t
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+ struct rte_mbuf **pkts, uint16_t count)
{
- uint32_t desc_avail, desc_offset;
- uint32_t mbuf_avail, mbuf_offset;
- uint32_t cpy_len;
+ struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
+ struct vhost_virtqueue *vq;
struct vring_desc *desc;
- uint64_t desc_addr;
- struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-
- desc = &vq->desc[desc_idx];
- desc_addr = gpa_to_vva(dev, desc->addr);
- /*
- * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
- * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
- * otherwise stores offset on the stack instead of in a register.
- */
- if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
- return -1;
-
- rte_prefetch0((void *)(uintptr_t)desc_addr);
-
- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
- vhost_log_write(dev, desc->addr, dev->vhost_hlen);
- PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
- desc_offset = dev->vhost_hlen;
- desc_avail = desc->len - dev->vhost_hlen;
-
- mbuf_avail = rte_pktmbuf_data_len(m);
- mbuf_offset = 0;
- while (mbuf_avail != 0 || m->next != NULL) {
- /* done with current mbuf, fetch next */
- if (mbuf_avail == 0) {
- m = m->next;
-
- mbuf_offset = 0;
- mbuf_avail = rte_pktmbuf_data_len(m);
- }
-
- /* done with current desc buf, fetch next */
- if (desc_avail == 0) {
- if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
- /* Room in vring buffer is not enough */
- return -1;
- }
- if (unlikely(desc->next >= vq->size))
- return -1;
-
- desc = &vq->desc[desc->next];
- desc_addr = gpa_to_vva(dev, desc->addr);
- if (unlikely(!desc_addr))
- return -1;
-
- desc_offset = 0;
- desc_avail = desc->len;
- }
-
- cpy_len = RTE_MIN(desc_avail, mbuf_avail);
- rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
- rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
- cpy_len);
- vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
- PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
- cpy_len, 0);
-
- mbuf_avail -= cpy_len;
- mbuf_offset += cpy_len;
- desc_avail -= cpy_len;
- desc_offset += cpy_len;
- }
-
- return 0;
-}
+ struct virtio_net *dev;
+ struct rte_mbuf *mbuf;
+ uint64_t desc_host_write_addr = 0;
+ uint32_t desc_chain_head = 0;
+ uint32_t desc_chain_len = 0;
+ uint32_t desc_current = 0;
+ uint32_t desc_write_offset = 0;
+ uint32_t used_idx_static = 0;
+ uint32_t pkt_idx = 0;
+ uint32_t pkt_left = 0;
+ uint32_t pkt_sent = 0;
+ uint32_t mbuf_len = 0;
+ uint32_t mbuf_len_left = 0;
+ uint32_t copy_len = 0;
+ uint32_t copy_virtio_hdr = 0;
+ uint32_t is_mrg_rxbuf = 0;
+ uint32_t is_virtio_1 = 0;
+
+ if (unlikely(count == 0))
+ return 0;

-/**
- * This function adds buffers to the virtio devices RX virtqueue. Buffers can
- * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that are succesfully
- * added to the RX queue. This function works when the mbuf is scattered, but
- * it doesn't support the mergeable feature.
- */
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
- struct rte_mbuf **pkts, uint32_t count)
-{
- struct vhost_virtqueue *vq;
- uint16_t avail_idx, free_entries, start_idx;
- uint16_t desc_indexes[MAX_PKT_BURST];
- uint16_t used_idx;
- uint32_t i;
+ count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);

- LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
- if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
- RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
- dev->vid, __func__, queue_id);
+ dev = get_device(vid);
+ if (unlikely(!dev))
return 0;
- }

- vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
+ if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb)))
return 0;

- avail_idx = *((volatile uint16_t *)&vq->avail->idx);
- start_idx = vq->last_used_idx;
- free_entries = avail_idx - start_idx;
- count = RTE_MIN(count, free_entries);
- count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
- if (count == 0)
+ vq = dev->virtqueue[queue_id];
+ if (unlikely(!vq->enabled))
return 0;

- LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
- dev->vid, start_idx, start_idx + count);
+ if (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))
+ is_mrg_rxbuf = 1;
+
+ if (dev->features & (1ULL << VIRTIO_F_VERSION_1))
+ is_virtio_1 = 1;
+
+ pkt_idx = 0;
+ pkt_left = count;
+ used_idx_static = vq->last_used_idx & (vq->size - 1);
+ vq->shadow_used_idx = 0;
+
+ while (pkt_left > 0) {
+ if (unlikely(vq->avail->idx == vq->last_used_idx))
+ goto done;
+
+ if (pkt_left > 1 && vq->avail->idx != vq->last_used_idx + 1)
+ rte_prefetch0(&vq->desc[
+ vq->avail->ring[
+ (vq->last_used_idx + 1) &
+ (vq->size - 1)]]);
+
+ mbuf = pkts[pkt_idx];
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_len_left = mbuf_len;
+ pkt_idx++;
+ pkt_left--;
+
+ desc_chain_head = vq->avail->ring[(vq->last_used_idx) &
+ (vq->size - 1)];
+ desc_current = desc_chain_head;
+ desc = &vq->desc[desc_current];
+ desc_host_write_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_host_write_addr))
+ goto done;
+
+ virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)
+ (uintptr_t)desc_host_write_addr;
+ copy_virtio_hdr = 1;
+
+ vhost_log_write(dev, desc->addr, dev->vhost_hlen);
+ desc_write_offset = dev->vhost_hlen;
+ desc_chain_len = desc_write_offset;
+ desc_host_write_addr += desc_write_offset;
+
+ while (1) {
+ if (!mbuf_len_left) {
+ if (mbuf->next) {
+ mbuf = mbuf->next;
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_len_left = mbuf_len;
+ } else
+ break;
+ }

- /* Retrieve all of the desc indexes first to avoid caching issues. */
- rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
- for (i = 0; i < count; i++) {
- used_idx = (start_idx + i) & (vq->size - 1);
- desc_indexes[i] = vq->avail->ring[used_idx];
- vq->used->ring[used_idx].id = desc_indexes[i];
- vq->used->ring[used_idx].len = pkts[i]->pkt_len +
- dev->vhost_hlen;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
+ if (desc->len <= desc_write_offset) {
+ if (desc->flags & VRING_DESC_F_NEXT) {
+ desc_write_offset = 0;
+ desc_current = desc->next;
+ desc = &vq->desc[desc_current];
+ desc_host_write_addr =
+ gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_host_write_addr))
+ goto rollback;
+ } else if (is_mrg_rxbuf) {
+ vq->shadow_used_ring[
+ vq->shadow_used_idx].id =
+ desc_chain_head;
+ vq->shadow_used_ring[
+ vq->shadow_used_idx].len =
+ desc_chain_len;
+ vq->shadow_used_idx++;
+ vq->last_used_idx++;
+ virtio_hdr->num_buffers++;
+ if (unlikely(vq->avail->idx ==
+ vq->last_used_idx))
+ goto rollback;
+
+ desc_chain_head = vq->avail->ring[
+ (vq->last_used_idx) &
+ (vq->size - 1)];
+ desc_current = desc_chain_head;
+ desc = &vq->desc[desc_current];
+ desc_host_write_addr =
+ gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_host_write_addr))
+ goto rollback;
+
+ desc_chain_len = 0;
+ desc_write_offset = 0;
+ } else
+ goto rollback;
+ }

- rte_prefetch0(&vq->desc[desc_indexes[0]]);
- for (i = 0; i < count; i++) {
- uint16_t desc_idx = desc_indexes[i];
- int err;
+ copy_len = RTE_MIN(desc->len - desc_write_offset,
+ mbuf_len_left);
+ if (copy_virtio_hdr) {
+ copy_virtio_hdr = 0;
+ memset((void *)(uintptr_t)&(virtio_hdr->hdr),
+ 0, dev->vhost_hlen);
+ virtio_enqueue_offload(mbuf,
+ &(virtio_hdr->hdr));
+ if (is_mrg_rxbuf || is_virtio_1)
+ virtio_hdr->num_buffers = 1;
+ }

- err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
- if (unlikely(err)) {
- used_idx = (start_idx + i) & (vq->size - 1);
- vq->used->ring[used_idx].len = dev->vhost_hlen;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
+ rte_memcpy((void *)(uintptr_t)desc_host_write_addr,
+ rte_pktmbuf_mtod_offset(mbuf, void *,
+ mbuf_len - mbuf_len_left),
+ copy_len);
+ vhost_log_write(dev, desc->addr + desc_write_offset,
+ copy_len);
+ mbuf_len_left -= copy_len;
+ desc_write_offset += copy_len;
+ desc_host_write_addr += copy_len;
+ desc_chain_len += copy_len;
}

- if (i + 1 < count)
- rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
+ vq->shadow_used_ring[vq->shadow_used_idx].id = desc_chain_head;
+ vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
+ vq->shadow_used_idx++;
+ vq->last_used_idx++;
+ pkt_sent++;
}

- rte_smp_wmb();
-
- *(volatile uint16_t *)&vq->used->idx += count;
- vq->last_used_idx += count;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
-
- /* flush used->idx update before we read avail->flags. */
- rte_mb();
-
- /* Kick the guest if necessary. */
- if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
- && (vq->callfd >= 0))
- eventfd_write(vq->callfd, (eventfd_t)1);
- return count;
-}
-
-static inline int
-fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
- uint32_t *allocated, uint32_t *vec_idx,
- struct buf_vector *buf_vec)
-{
- uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
- uint32_t vec_id = *vec_idx;
- uint32_t len = *allocated;
-
- while (1) {
- if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
- return -1;
-
- len += vq->desc[idx].len;
- buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
- buf_vec[vec_id].buf_len = vq->desc[idx].len;
- buf_vec[vec_id].desc_idx = idx;
- vec_id++;
-
- if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
- break;
-
- idx = vq->desc[idx].next;
- }
-
- *allocated = len;
- *vec_idx = vec_id;
-
- return 0;
-}
-
-/*
- * Returns -1 on fail, 0 on success
- */
-static inline int
-reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
- uint16_t *end, struct buf_vector *buf_vec)
-{
- uint16_t cur_idx;
- uint16_t avail_idx;
- uint32_t allocated = 0;
- uint32_t vec_idx = 0;
- uint16_t tries = 0;
-
- cur_idx = vq->last_used_idx;
-
- while (1) {
- avail_idx = *((volatile uint16_t *)&vq->avail->idx);
- if (unlikely(cur_idx == avail_idx))
- return -1;
-
- if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
- &vec_idx, buf_vec) < 0))
- return -1;
-
- cur_idx++;
- tries++;
-
- if (allocated >= size)
- break;
-
- /*
- * if we tried all available ring items, and still
- * can't get enough buf, it means something abnormal
- * happened.
- */
- if (unlikely(tries >= vq->size))
- return -1;
- }
-
- *end = cur_idx;
- return 0;
-}
-
-static inline uint32_t __attribute__((always_inline))
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
- uint16_t end_idx, struct rte_mbuf *m,
- struct buf_vector *buf_vec)
-{
- struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
- uint32_t vec_idx = 0;
- uint16_t start_idx = vq->last_used_idx;
- uint16_t cur_idx = start_idx;
- uint64_t desc_addr;
- uint32_t mbuf_offset, mbuf_avail;
- uint32_t desc_offset, desc_avail;
- uint32_t cpy_len;
- uint16_t desc_idx, used_idx;
-
- if (unlikely(m == NULL))
- return 0;
-
- LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
- dev->vid, cur_idx, end_idx);
-
- desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
- if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
- return 0;
-
- rte_prefetch0((void *)(uintptr_t)desc_addr);
-
- virtio_hdr.num_buffers = end_idx - start_idx;
- LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
- dev->vid, virtio_hdr.num_buffers);
-
- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
- vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
- PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
- desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
- desc_offset = dev->vhost_hlen;
-
- mbuf_avail = rte_pktmbuf_data_len(m);
- mbuf_offset = 0;
- while (mbuf_avail != 0 || m->next != NULL) {
- /* done with current desc buf, get the next one */
- if (desc_avail == 0) {
- desc_idx = buf_vec[vec_idx].desc_idx;
-
- if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
- /* Update used ring with desc information */
- used_idx = cur_idx++ & (vq->size - 1);
- vq->used->ring[used_idx].id = desc_idx;
- vq->used->ring[used_idx].len = desc_offset;
- vhost_log_used_vring(dev, vq,
+done:
+ if (likely(vq->shadow_used_idx > 0)) {
+ if (used_idx_static + vq->shadow_used_idx < vq->size) {
+ rte_memcpy(&vq->used->ring[used_idx_static],
+ &vq->shadow_used_ring[0],
+ vq->shadow_used_idx *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
offsetof(struct vring_used,
- ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
-
- vec_idx++;
- desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
- if (unlikely(!desc_addr))
- return 0;
-
- /* Prefetch buffer address. */
- rte_prefetch0((void *)(uintptr_t)desc_addr);
- desc_offset = 0;
- desc_avail = buf_vec[vec_idx].buf_len;
- }
-
- /* done with current mbuf, get the next one */
- if (mbuf_avail == 0) {
- m = m->next;
+ ring[used_idx_static]),
+ vq->shadow_used_idx *
+ sizeof(struct vring_used_elem));
+ } else {
+ uint32_t part_1 = vq->size - used_idx_static;
+ uint32_t part_2 = vq->shadow_used_idx - part_1;

- mbuf_offset = 0;
- mbuf_avail = rte_pktmbuf_data_len(m);
+ rte_memcpy(&vq->used->ring[used_idx_static],
+ &vq->shadow_used_ring[0],
+ part_1 *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_static]),
+ part_1 *
+ sizeof(struct vring_used_elem));
+ rte_memcpy(&vq->used->ring[0],
+ &vq->shadow_used_ring[part_1],
+ part_2 *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[0]),
+ part_2 *
+ sizeof(struct vring_used_elem));
}
-
- cpy_len = RTE_MIN(desc_avail, mbuf_avail);
- rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
- rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
- cpy_len);
- vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
- cpy_len);
- PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
- cpy_len, 0);
-
- mbuf_avail -= cpy_len;
- mbuf_offset += cpy_len;
- desc_avail -= cpy_len;
- desc_offset += cpy_len;
}

- used_idx = cur_idx & (vq->size - 1);
- vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
- vq->used->ring[used_idx].len = desc_offset;
+ rte_smp_wmb();
+ vq->used->idx = vq->last_used_idx;
vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
-
- return end_idx - start_idx;
-}
-
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
- struct rte_mbuf **pkts, uint32_t count)
-{
- struct vhost_virtqueue *vq;
- uint32_t pkt_idx = 0, nr_used = 0;
- uint16_t end;
- struct buf_vector buf_vec[BUF_VECTOR_MAX];
-
- LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
- if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
- RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
- dev->vid, __func__, queue_id);
- return 0;
- }
-
- vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
- return 0;
-
- count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
- if (count == 0)
- return 0;
-
- for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
- uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
-
- if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
- &end, buf_vec) < 0)) {
- LOG_DEBUG(VHOST_DATA,
- "(%d) failed to get enough desc from vring\n",
- dev->vid);
- break;
- }
-
- nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
- pkts[pkt_idx], buf_vec);
- rte_smp_wmb();
-
- *(volatile uint16_t *)&vq->used->idx += nr_used;
- vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+ offsetof(struct vring_used, idx),
sizeof(vq->used->idx));
- vq->last_used_idx += nr_used;
- }
-
- if (likely(pkt_idx)) {
- /* flush used->idx update before we read avail->flags. */
- rte_mb();
-
- /* Kick the guest if necessary. */
+ rte_mb();
+ if (likely(pkt_sent)) {
if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
&& (vq->callfd >= 0))
eventfd_write(vq->callfd, (eventfd_t)1);
}

- return pkt_idx;
-}
-
-uint16_t
-rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
- struct rte_mbuf **pkts, uint16_t count)
-{
- struct virtio_net *dev = get_device(vid);
+ return pkt_sent;

- if (!dev)
- return 0;
+rollback:
+ if (is_mrg_rxbuf || is_virtio_1)
+ vq->last_used_idx -= virtio_hdr->num_buffers - 1;

- if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
- return virtio_dev_merge_rx(dev, queue_id, pkts, count);
- else
- return virtio_dev_rx(dev, queue_id, pkts, count);
+ goto done;
}

static void
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
index 1785695..87d09fa 100644
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -152,10 +152,14 @@ cleanup_device(struct virtio_net *dev, int destroy)
static void
free_device(struct virtio_net *dev)
{
+ struct vhost_virtqueue *vq;
uint32_t i;

- for (i = 0; i < dev->virt_qp_nb; i++)
- rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+ for (i = 0; i < dev->virt_qp_nb; i++) {
+ vq = dev->virtqueue[i * VIRTIO_QNUM];
+ rte_free(vq->shadow_used_ring);
+ rte_free(vq);
+ }

rte_free(dev);
}
@@ -418,13 +422,18 @@ int
vhost_set_vring_num(int vid, struct vhost_vring_state *state)
{
struct virtio_net *dev;
+ struct vhost_virtqueue *vq;

dev = get_device(vid);
if (dev == NULL)
return -1;

/* State->index refers to the queue index. The txq is 1, rxq is 0. */
- dev->virtqueue[state->index]->size = state->num;
+ vq = dev->virtqueue[state->index];
+ vq->size = state->num;
+ vq->shadow_used_ring = rte_malloc("",
+ vq->size * sizeof(struct vring_used_elem),
+ RTE_CACHE_LINE_SIZE);

return 0;
}
--
2.7.4
Maxime Coquelin
2016-08-16 13:59:52 UTC
Permalink
Hi Zhihong,
Post by Zhihong Wang
This patch optimizes the vhost enqueue function: rte_vhost_enqueue_burst.
* virtio_dev_merge_rx for mrg_rxbuf turned on cases.
* virtio_dev_rx for mrg_rxbuf turned off cases.
The virtio_dev_merge_rx doesn't provide optimal performance, also it is
reported having compatibility issue working with Windows VMs.
Could you tell us more please about this compatibility issue?
Post by Zhihong Wang
Besides, having 2 separated functions increases maintenance efforts.
This patch uses a single function logic to replace the current 2 for
better maintainability, and provides better performance by optimizing
caching behavior especially for mrg_rxbuf turned on cases.
Do you have some benchmark comparison before and after your change?

Also, for maintainability, I would suggest the that the enqueue
function be split. Because vhost_enqueue_burst becomes very long (220
LoC), and max level of indentation is too high (6).

It makes the code hard to understand, and prone to miss bugs during
review and maintenance.
Post by Zhihong Wang
It also fixes the issue working with Windows VMs.
Ideally, the fix should be sent separately, before the rework.
Indeed, we might want to have the fix in the stable branch, without
picking the optimization.
Post by Zhihong Wang
---
lib/librte_vhost/vhost-net.h | 6 +-
lib/librte_vhost/vhost_rxtx.c | 582 ++++++++++++++----------------------------
lib/librte_vhost/virtio-net.c | 15 +-
3 files changed, 208 insertions(+), 395 deletions(-)
582 lines changed is a huge patch.
If possible, it would be better splitting it in incremental changes,
making the review process easier.

Also, for v2, please prefix the commit title with "vhost:".

Thanks for your contribution, I'm looking forward for the v2.
- Maxime
Wang, Zhihong
2016-08-17 01:45:26 UTC
Permalink
-----Original Message-----
Sent: Tuesday, August 16, 2016 10:00 PM
Subject: Re: [dpdk-dev] [PATCH] optimize vhost enqueue
Hi Zhihong,
Post by Zhihong Wang
This patch optimizes the vhost enqueue function: rte_vhost_enqueue_burst.
* virtio_dev_merge_rx for mrg_rxbuf turned on cases.
* virtio_dev_rx for mrg_rxbuf turned off cases.
The virtio_dev_merge_rx doesn't provide optimal performance, also it is
reported having compatibility issue working with Windows VMs.
Could you tell us more please about this compatibility issue?
For example, when you have testpmd in the host and Window VM as the guest,
with mrg_rxbuf turned on, the guest will hang once there's packets enqueued
by virtio_dev_merge_rx.

Let me know if you see the same issue.
Post by Zhihong Wang
Besides, having 2 separated functions increases maintenance efforts.
This patch uses a single function logic to replace the current 2 for
better maintainability, and provides better performance by optimizing
caching behavior especially for mrg_rxbuf turned on cases.
Do you have some benchmark comparison before and after your change?
Also, for maintainability, I would suggest the that the enqueue
function be split. Because vhost_enqueue_burst becomes very long (220
LoC), and max level of indentation is too high (6).
It makes the code hard to understand, and prone to miss bugs during
review and maintenance.
This is something I've thought about while writing the code, the reason I
keep it as one function body is that:

1. This function is very performance sensitive, and we need full control of
code ordering (You can compare with the current performance with the
mrg_rxbuf feature turned on to see the difference).

2. I somehow find that a single function logic makes it easier to understand,
surely I can add comments to make it easier to read.

Please let me know if you still insist, we can discuss more on it.
Post by Zhihong Wang
It also fixes the issue working with Windows VMs.
Ideally, the fix should be sent separately, before the rework.
Indeed, we might want to have the fix in the stable branch, without
picking the optimization.
Post by Zhihong Wang
---
lib/librte_vhost/vhost-net.h | 6 +-
lib/librte_vhost/vhost_rxtx.c | 582 ++++++++++++++----------------------------
lib/librte_vhost/virtio-net.c | 15 +-
3 files changed, 208 insertions(+), 395 deletions(-)
582 lines changed is a huge patch.
If possible, it would be better splitting it in incremental changes,
making the review process easier.
It looks like a huge patch, but it simply deletes the current implementation
and add the new code. I think perhaps split it into 2, 1st one to replace
just the rte_vhost_enqueue_burst, 2nd one to delete all the obsolete functions.
It should make the patch clear, how do you think? :)
Also, for v2, please prefix the commit title with "vhost:".
Thanks for the hint! Will do.
Thanks for your contribution, I'm looking forward for the v2.
- Maxime
Yuanhan Liu
2016-08-17 02:38:25 UTC
Permalink
Post by Wang, Zhihong
-----Original Message-----
Sent: Tuesday, August 16, 2016 10:00 PM
Subject: Re: [dpdk-dev] [PATCH] optimize vhost enqueue
Hi Zhihong,
Post by Zhihong Wang
This patch optimizes the vhost enqueue function: rte_vhost_enqueue_burst.
* virtio_dev_merge_rx for mrg_rxbuf turned on cases.
* virtio_dev_rx for mrg_rxbuf turned off cases.
The virtio_dev_merge_rx doesn't provide optimal performance, also it is
reported having compatibility issue working with Windows VMs.
Could you tell us more please about this compatibility issue?
For example, when you have testpmd in the host and Window VM as the guest,
with mrg_rxbuf turned on, the guest will hang once there's packets enqueued
by virtio_dev_merge_rx.
You should put it into commit log.
Post by Wang, Zhihong
Let me know if you see the same issue.
Post by Zhihong Wang
Besides, having 2 separated functions increases maintenance efforts.
This patch uses a single function logic to replace the current 2 for
better maintainability, and provides better performance by optimizing
caching behavior especially for mrg_rxbuf turned on cases.
Here, this sounds like two parts to me:

- one to unite mergeable and non-mergeable Rx

- another one to optimize the mergeable path

That means you should do it in two patches, with that we can have clear
understanding what changes the performance boost. It also helps review.
Post by Wang, Zhihong
Do you have some benchmark comparison before and after your change?
Also, for maintainability, I would suggest the that the enqueue
function be split. Because vhost_enqueue_burst becomes very long (220
LoC), and max level of indentation is too high (6).
It makes the code hard to understand, and prone to miss bugs during
review and maintenance.
Agreed.
Post by Wang, Zhihong
This is something I've thought about while writing the code, the reason I
1. This function is very performance sensitive, and we need full control of
code ordering (You can compare with the current performance with the
mrg_rxbuf feature turned on to see the difference).
Will inline functions help?
Post by Wang, Zhihong
2. I somehow find that a single function logic makes it easier to understand,
surely I can add comments to make it easiler to read for .
Please let me know if you still insist, we can discuss more on it.
I am personally not a fan of huge function; I would try hard to avoid
too many levels of indentation as well.
Post by Wang, Zhihong
Post by Zhihong Wang
It also fixes the issue working with Windows VMs.
Ideally, the fix should be sent separately, before the rework.
Indeed, we might want to have the fix in the stable branch, without
picking the optimization.
Agreed.
Post by Wang, Zhihong
Post by Zhihong Wang
---
lib/librte_vhost/vhost-net.h | 6 +-
lib/librte_vhost/vhost_rxtx.c | 582 ++++++++++++++----------------------------
lib/librte_vhost/virtio-net.c | 15 +-
3 files changed, 208 insertions(+), 395 deletions(-)
582 lines changed is a huge patch.
If possible, it would be better splitting it in incremental changes,
making the review process easier.
It looks like a huge patch, but it simply deletes the current implementation
and add the new code. I think perhaps split it into 2, 1st one to replace
just the rte_vhost_enqueue_burst, 2nd one to delete all the obsolete functions.
It should make the patch clear, how do you think? :)
Nope, it's not working in that way. It should be:

- one patch to fix the hang issue for windows guest

Please cc it to ***@dpdk.org as well so that we could pick it for
v16.07 stable release.

- one patch to unite the two different Rx code path

- another patch to optimize mergeable code path

--yliu
Wang, Zhihong
2016-08-17 06:41:57 UTC
Permalink
-----Original Message-----
Sent: Wednesday, August 17, 2016 10:38 AM
Subject: Re: [dpdk-dev] [PATCH] optimize vhost enqueue
Post by Wang, Zhihong
-----Original Message-----
Sent: Tuesday, August 16, 2016 10:00 PM
Subject: Re: [dpdk-dev] [PATCH] optimize vhost enqueue
Hi Zhihong,
rte_vhost_enqueue_burst.
Post by Wang, Zhihong
Post by Zhihong Wang
* virtio_dev_merge_rx for mrg_rxbuf turned on cases.
* virtio_dev_rx for mrg_rxbuf turned off cases.
The virtio_dev_merge_rx doesn't provide optimal performance, also it is
reported having compatibility issue working with Windows VMs.
Could you tell us more please about this compatibility issue?
For example, when you have testpmd in the host and Window VM as the
guest,
Post by Wang, Zhihong
with mrg_rxbuf turned on, the guest will hang once there's packets enqueued
by virtio_dev_merge_rx.
You should put it into commit log.
Okay.
Post by Wang, Zhihong
Let me know if you see the same issue.
Post by Zhihong Wang
Besides, having 2 separated functions increases maintenance efforts.
This patch uses a single function logic to replace the current 2 for
better maintainability, and provides better performance by optimizing
caching behavior especially for mrg_rxbuf turned on cases.
- one to unite mergeable and non-mergeable Rx
- another one to optimize the mergeable path
That means you should do it in two patches, with that we can have clear
understanding what changes the performance boost. It also helps review.
Please see explanation below.
Post by Wang, Zhihong
Do you have some benchmark comparison before and after your change?
Also, for maintainability, I would suggest the that the enqueue
function be split. Because vhost_enqueue_burst becomes very long (220
LoC), and max level of indentation is too high (6).
It makes the code hard to understand, and prone to miss bugs during
review and maintenance.
Agreed.
Post by Wang, Zhihong
This is something I've thought about while writing the code, the reason I
1. This function is very performance sensitive, and we need full control of
code ordering (You can compare with the current performance with the
mrg_rxbuf feature turned on to see the difference).
Will inline functions help?
Optimization in this patch actually reorganizes the code from its logic,
so it's not suitable for making separated functions.

I'll explain this in v2.
Post by Wang, Zhihong
2. I somehow find that a single function logic makes it easier to understand,
surely I can add comments to make it easier to read.
Please let me know if you still insist, we can discuss more on it.
I am personally not a fan of huge function; I would try hard to avoid
too many levels of indentation as well.
Post by Wang, Zhihong
Post by Zhihong Wang
It also fixes the issue working with Windows VMs.
Ideally, the fix should be sent separately, before the rework.
Indeed, we might want to have the fix in the stable branch, without
picking the optimization.
Agreed.
The fact is that I don't have much time to debug with the current code
since it's messy and I don't have Windows virtio code and the debugging
environment.

This patch doesn't try to fix this issue, it rewrites the logic totally,
and somehow fixes this issue.

Do you think integrating this whole patch into the stable branch will work?
Personally I think it makes more sense.
Post by Wang, Zhihong
Post by Zhihong Wang
---
lib/librte_vhost/vhost-net.h | 6 +-
lib/librte_vhost/vhost_rxtx.c | 582
++++++++++++++----------------------------
Post by Wang, Zhihong
Post by Zhihong Wang
lib/librte_vhost/virtio-net.c | 15 +-
3 files changed, 208 insertions(+), 395 deletions(-)
582 lines changed is a huge patch.
If possible, it would be better splitting it in incremental changes,
making the review process easier.
It looks like a huge patch, but it simply deletes the current implementation
and add the new code. I think perhaps split it into 2, 1st one to replace
just the rte_vhost_enqueue_burst, 2nd one to delete all the obsolete
functions.
Post by Wang, Zhihong
It should make the patch clear, what do you think? :)
- one patch to fix the hang issue for windows guest
v16.07 stable release.
- one patch to unite the two different Rx code path
- another patch to optimize mergeable code path
I can separate optimization from the basic code in v2, however as I explained
this patch is built from scratch and doesn't take anything from the existing
code, so there's no way to transform from the existing code incrementally into
the new code.
--yliu
Maxime Coquelin
2016-08-17 09:17:46 UTC
Permalink
Post by Wang, Zhihong
-----Original Message-----
Sent: Wednesday, August 17, 2016 10:38 AM
Subject: Re: [dpdk-dev] [PATCH] optimize vhost enqueue
Post by Wang, Zhihong
-----Original Message-----
Sent: Tuesday, August 16, 2016 10:00 PM
Subject: Re: [dpdk-dev] [PATCH] optimize vhost enqueue
Hi Zhihong,
rte_vhost_enqueue_burst.
Post by Wang, Zhihong
Post by Zhihong Wang
* virtio_dev_merge_rx for mrg_rxbuf turned on cases.
* virtio_dev_rx for mrg_rxbuf turned off cases.
The virtio_dev_merge_rx doesn't provide optimal performance, also it is
reported having compatibility issue working with Windows VMs.
Could you tell us more please about this compatibility issue?
For example, when you have testpmd in the host and Window VM as the
guest,
Post by Wang, Zhihong
with mrg_rxbuf turned on, the guest will hang once there's packets enqueued
by virtio_dev_merge_rx.
You should put it into commit log.
Okay.
Post by Wang, Zhihong
Let me know if you see the same issue.
Post by Zhihong Wang
Besides, having 2 separated functions increases maintenance efforts.
This patch uses a single function logic to replace the current 2 for
better maintainability, and provides better performance by optimizing
caching behavior especially for mrg_rxbuf turned on cases.
- one to unite mergeable and non-mergeable Rx
- another one to optimize the mergeable path
That means you should do it in two patches, with that we can have clear
understanding what changes the performance boost. It also helps review.
Please see explanation below.
Post by Wang, Zhihong
Do you have some benchmark comparison before and after your change?
Also, for maintainability, I would suggest that the enqueue
function be split. Because vhost_enqueue_burst becomes very long (220
LoC), and max level of indentation is too high (6).
It makes the code hard to understand, and prone to miss bugs during
review and maintenance.
Agreed.
Post by Wang, Zhihong
This is something I've thought about while writing the code, the reason I
1. This function is very performance sensitive, and we need full control of
code ordering (You can compare with the current performance with the
mrg_rxbuf feature turned on to see the difference).
Will inline functions help?
Optimization in this patch actually reorganizes the code from its logic,
so it's not suitable for making separated functions.
I'll explain this in v2.
I agree with Yuanhan.
Inline functions should not break the optimizations.
IMHO, this is mandatory for the patch to be accepted.
Post by Wang, Zhihong
Post by Wang, Zhihong
2. I somehow find that a single function logic makes it easier to understand,
surely I can add comments to make it easier to read.
Please let me know if you still insist, we can discuss more on it.
I am personally not a fan of huge function; I would try hard to avoid
too many levels of indentation as well.
Post by Wang, Zhihong
Post by Zhihong Wang
It also fixes the issue working with Windows VMs.
Ideally, the fix should be sent separately, before the rework.
Indeed, we might want to have the fix in the stable branch, without
picking the optimization.
Agreed.
The fact is that I don't have much time to debug with the current code
since it's messy and I don't have Windows virtio code and the debugging
environment.
It seems you are not the only one facing the issue:
https://github.com/YanVugenfirer/kvm-guest-drivers-windows/issues/70

So a dedicated fix is really important.
Post by Wang, Zhihong
This patch doesn't try to fix this issue, it rewrites the logic totally,
and somehow fixes this issue.
Do you think integrating this whole patch into the stable branch will work?
Personally I think it makes more sense.
No.
We don't even know why/how it fixes the Windows issue, which would be
the first thing to understand before integrating a fix in stable branch.

And the stable branch is not meant for integrating such big reworks,
it is only meant to fix bugs.

The risk of regressions have to be avoided as much as possible.
Post by Wang, Zhihong
Post by Wang, Zhihong
Post by Zhihong Wang
---
lib/librte_vhost/vhost-net.h | 6 +-
lib/librte_vhost/vhost_rxtx.c | 582
++++++++++++++----------------------------
Post by Wang, Zhihong
Post by Zhihong Wang
lib/librte_vhost/virtio-net.c | 15 +-
3 files changed, 208 insertions(+), 395 deletions(-)
582 lines changed is a huge patch.
If possible, it would be better splitting it in incremental changes,
making the review process easier.
It looks like a huge patch, but it simply deletes the current implementation
and add the new code. I think perhaps split it into 2, 1st one to replace
just the rte_vhost_enqueue_burst, 2nd one to delete all the obsolete
functions.
Post by Wang, Zhihong
It should make the patch clear, what do you think? :)
- one patch to fix the hang issue for windows guest
v16.07 stable release.
- one patch to unite the two different Rx code path
- another patch to optimize mergeable code path
I can separate optimization from the basic code in v2, however as I explained
this patch is built from scratch and doesn't take anything from the existing
code, so there's no way to transform from the existing code incrementally into
the new code.
--yliu
Yuanhan Liu
2016-08-17 09:51:11 UTC
Permalink
Post by Maxime Coquelin
Post by Wang, Zhihong
Post by Yuanhan Liu
Post by Wang, Zhihong
This is something I've thought about while writing the code, the reason I
1. This function is very performance sensitive, and we need full control of
code ordering (You can compare with the current performance with the
mrg_rxbuf feature turned on to see the difference).
Will inline functions help?
Optimization in this patch actually reorganizes the code from its logic,
so it's not suitable for making separated functions.
I'll explain this in v2.
I agree with Yuanhan.
Inline functions should not break the optimizations.
IMHO, this is mandatory for the patch to be accepted.
Yes.
Post by Maxime Coquelin
https://github.com/YanVugenfirer/kvm-guest-drivers-windows/issues/70
So a dedicated fix is really important.
Yes.
Post by Maxime Coquelin
Post by Wang, Zhihong
This patch doesn't try to fix this issue, it rewrites the logic totally,
and somehow fixes this issue.
Do you think integrating this whole patch into the stable branch will work?
Personally I think it makes more sense.
No.
We don't even know why/how it fixes the Windows issue, which would be
the first thing to understand before integrating a fix in stable branch.
Yes.
Post by Maxime Coquelin
And the stable branch is not meant for integrating such big reworks,
it is only meant to fix bugs.
Yes.
Post by Maxime Coquelin
The risk of regressions have to be avoided as much as possible.
Yes.

--yliu
Wang, Zhihong
2016-08-18 13:44:21 UTC
Permalink
Thanks Maxime and Yuanhan for your review and suggestions!
Please help review the v2 of this patch.
-----Original Message-----
Sent: Wednesday, August 17, 2016 5:51 PM
Subject: Re: [dpdk-dev] [PATCH] optimize vhost enqueue
Post by Maxime Coquelin
Post by Wang, Zhihong
Post by Yuanhan Liu
Post by Wang, Zhihong
This is something I've thought about while writing the code, the reason I
1. This function is very performance sensitive, and we need full control of
code ordering (You can compare with the current performance with
the
Post by Maxime Coquelin
Post by Wang, Zhihong
Post by Yuanhan Liu
Post by Wang, Zhihong
mrg_rxbuf feature turned on to see the difference).
Will inline functions help?
Optimization in this patch actually reorganizes the code from its logic,
so it's not suitable for making separated functions.
I'll explain this in v2.
I agree with Yuanhan.
Inline functions should not break the optimizations.
IMHO, this is mandatory for the patch to be accepted.
Yes.
Post by Maxime Coquelin
https://github.com/YanVugenfirer/kvm-guest-drivers-windows/issues/70
So a dedicated fix is really important.
Yes.
Post by Maxime Coquelin
Post by Wang, Zhihong
This patch doesn't try to fix this issue, it rewrites the logic totally,
and somehow fixes this issue.
Do you think integrating this whole patch into the stable branch will work?
Personally I think it makes more sense.
No.
We don't even know why/how it fixes the Windows issue, which would be
the first thing to understand before integrating a fix in stable branch.
Yes.
Post by Maxime Coquelin
And the stable branch is not meant for integrating such big reworks,
it is only meant to fix bugs.
Yes.
Post by Maxime Coquelin
The risk of regressions have to be avoided as much as possible.
Yes.
--yliu
Wang, Zhihong
2016-08-17 10:07:00 UTC
Permalink
-----Original Message-----
Sent: Wednesday, August 17, 2016 5:18 PM
Subject: Re: [dpdk-dev] [PATCH] optimize vhost enqueue
Post by Wang, Zhihong
-----Original Message-----
Sent: Wednesday, August 17, 2016 10:38 AM
Subject: Re: [dpdk-dev] [PATCH] optimize vhost enqueue
Post by Wang, Zhihong
-----Original Message-----
Sent: Tuesday, August 16, 2016 10:00 PM
Subject: Re: [dpdk-dev] [PATCH] optimize vhost enqueue
Hi Zhihong,
rte_vhost_enqueue_burst.
Post by Wang, Zhihong
Post by Zhihong Wang
* virtio_dev_merge_rx for mrg_rxbuf turned on cases.
* virtio_dev_rx for mrg_rxbuf turned off cases.
The virtio_dev_merge_rx doesn't provide optimal performance, also it is
reported having compatibility issue working with Windows VMs.
Could you tell us more please about this compatibility issue?
For example, when you have testpmd in the host and Window VM as the
guest,
Post by Wang, Zhihong
with mrg_rxbuf turned on, the guest will hang once there's packets
enqueued
Post by Wang, Zhihong
Post by Wang, Zhihong
by virtio_dev_merge_rx.
You should put it into commit log.
Okay.
Post by Wang, Zhihong
Let me know if you see the same issue.
Post by Zhihong Wang
Besides, having 2 separated functions increases maintenance efforts.
This patch uses a single function logic to replace the current 2 for
better maintainability, and provides better performance by optimizing
caching behavior especially for mrg_rxbuf turned on cases.
- one to unite mergeable and non-mergeable Rx
- another one to optimize the mergeable path
That means you should do it in two patches, with that we can have clear
understanding what changes the performance boost. It also helps review.
Please see explanation below.
Post by Wang, Zhihong
Do you have some benchmark comparison before and after your change?
Also, for maintainability, I would suggest that the enqueue
function be split. Because vhost_enqueue_burst becomes very long (220
LoC), and max level of indentation is too high (6).
It makes the code hard to understand, and prone to miss bugs during
review and maintenance.
Agreed.
Post by Wang, Zhihong
This is something I've thought about while writing the code, the reason I
1. This function is very performance sensitive, and we need full control of
code ordering (You can compare with the current performance with the
mrg_rxbuf feature turned on to see the difference).
Will inline functions help?
Optimization in this patch actually reorganizes the code from its logic,
so it's not suitable for making separated functions.
I'll explain this in v2.
I agree with Yuanhan.
Inline functions should not break the optimizations.
IMHO, this is mandatory for the patch to be accepted.
Excellent!
Post by Wang, Zhihong
Post by Wang, Zhihong
2. I somehow find that a single function logic makes it easier to understand,
surely I can add comments to make it easier to read.
Please let me know if you still insist, we can discuss more on it.
I am personally not a fan of huge function; I would try hard to avoid
too many levels of indentation as well.
Post by Wang, Zhihong
Post by Zhihong Wang
It also fixes the issue working with Windows VMs.
Ideally, the fix should be sent separately, before the rework.
Indeed, we might want to have the fix in the stable branch, without
picking the optimization.
Agreed.
The fact is that I don't have much time to debug with the current code
since it's messy and I don't have Windows virtio code and the debugging
environment.
https://github.com/YanVugenfirer/kvm-guest-drivers-windows/issues/70
So a dedicated fix is really important.
Yeah that's me raising this issue there.

But I think it's another standalone task to identify the root cause and
find the fix for the existing code.
Post by Wang, Zhihong
This patch doesn't try to fix this issue, it rewrites the logic totally,
and somehow fixes this issue.
Do you think integrating this whole patch into the stable branch will work?
Personally I think it makes more sense.
No.
We don't even know why/how it fixes the Windows issue, which would be
the first thing to understand before integrating a fix in stable branch.
And the stable branch is not meant for integrating such big reworks,
it is only meant to fix bugs.
The risk of regressions have to be avoided as much as possible.
Post by Wang, Zhihong
Post by Wang, Zhihong
Post by Zhihong Wang
---
lib/librte_vhost/vhost-net.h | 6 +-
lib/librte_vhost/vhost_rxtx.c | 582
++++++++++++++----------------------------
Post by Wang, Zhihong
Post by Zhihong Wang
lib/librte_vhost/virtio-net.c | 15 +-
3 files changed, 208 insertions(+), 395 deletions(-)
582 lines changed is a huge patch.
If possible, it would be better splitting it in incremental changes,
making the review process easier.
It looks like a huge patch, but it simply deletes the current implementation
and add the new code. I think perhaps split it into 2, 1st one to replace
just the rte_vhost_enqueue_burst, 2nd one to delete all the obsolete
functions.
Post by Wang, Zhihong
It should make the patch clear, what do you think? :)
- one patch to fix the hang issue for windows guest
v16.07 stable release.
- one patch to unite the two different Rx code path
- another patch to optimize mergeable code path
I can separate optimization from the basic code in v2, however as I explained
this patch is built from scratch and doesn't take anything from the existing
code, so there's no way to transform from the existing code incrementally into
the new code.
--yliu
Zhihong Wang
2016-08-18 06:33:05 UTC
Permalink
This patch set optimizes the vhost enqueue function.

It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
significantly by optimizing cache access, which means:

* For fast frontends (eg. DPDK virtio pmd), higher performance (maximum
throughput) can be achieved.

* For slow frontends (eg. kernel virtio-net), better scalability can be
achieved, each vhost core can support more connections since it takes
less cycles to handle each single frontend.

The main optimization techniques are:

1. Reorder code to reduce CPU pipeline stall cycles.

2. Batch update the used ring for better efficiency.

3. Prefetch descriptor to hide cache latency.

4. Remove useless volatile attribute to allow compiler optimization.

In the existing code there're 2 callbacks for vhost enqueue:

* virtio_dev_merge_rx for mrg_rxbuf turned on cases.

* virtio_dev_rx for mrg_rxbuf turned off cases.

The performance of the existing code is not optimal, especially when the
mrg_rxbuf feature turned on. Also, having 2 separated functions increases
maintenance efforts.

---
Changes in v2:

1. Split the big function into several small ones

2. Use multiple patches to explain each optimization

3. Add comments

Zhihong Wang (6):
vhost: rewrite enqueue
vhost: remove obsolete
vhost: remove useless volatile
vhost: add desc prefetch
vhost: batch update used ring
vhost: optimize cache access

lib/librte_vhost/vhost-net.h | 6 +-
lib/librte_vhost/vhost_rxtx.c | 582 +++++++++++++++---------------------------
lib/librte_vhost/virtio-net.c | 15 +-
3 files changed, 228 insertions(+), 375 deletions(-)
--
2.7.4
Zhihong Wang
2016-08-18 06:33:06 UTC
Permalink
This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.

Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/vhost_rxtx.c | 212 ++++++++++++++++++++++++++++++++++++++++--
1 file changed, 205 insertions(+), 7 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 08a73fd..8e6d782 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
}

-static void
+static inline void __attribute__((always_inline))
virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
{
if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -533,19 +533,217 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
return pkt_idx;
}

+static inline uint32_t __attribute__((always_inline))
+loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, uint32_t pkt_left)
+{
+ if (pkt_left == 0 || avail_idx == vq->last_used_idx)
+ return 1;
+
+ return 0;
+}
+
+static inline uint32_t __attribute__((always_inline))
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint16_t avail_idx, struct rte_mbuf *mbuf,
+ uint32_t is_mrg_rxbuf)
+{
+ struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
+ struct vring_desc *desc;
+ uint64_t desc_host_write_addr = 0;
+ uint32_t desc_chain_head = 0;
+ uint32_t desc_chain_len = 0;
+ uint32_t desc_current = 0;
+ uint32_t desc_write_offset = 0;
+ uint32_t mbuf_len = 0;
+ uint32_t mbuf_len_left = 0;
+ uint32_t copy_len = 0;
+ uint32_t extra_buffers = 0;
+ uint32_t used_idx_round = 0;
+
+ /* start with the first mbuf of the packet */
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_len_left = mbuf_len;
+
+ /* get the current desc */
+ desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc = &vq->desc[desc_current];
+ desc_host_write_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_host_write_addr))
+ goto error;
+
+ /* handle virtio header */
+ virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)
+ (uintptr_t)desc_host_write_addr;
+ memset((void *)(uintptr_t)&(virtio_hdr->hdr),
+ 0, dev->vhost_hlen);
+ virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+ vhost_log_write(dev, desc->addr, dev->vhost_hlen);
+ desc_write_offset = dev->vhost_hlen;
+ desc_chain_len = desc_write_offset;
+ desc_host_write_addr += desc_write_offset;
+ if (is_mrg_rxbuf)
+ virtio_hdr->num_buffers = 1;
+
+ /* start copy from mbuf to desc */
+ while (1) {
+ /* get the next mbuf if the current done */
+ if (!mbuf_len_left) {
+ if (mbuf->next) {
+ mbuf = mbuf->next;
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_len_left = mbuf_len;
+ } else
+ break;
+ }
+
+ /* get the next desc if the current done */
+ if (desc->len <= desc_write_offset) {
+ if (desc->flags & VRING_DESC_F_NEXT) {
+ /* go on with the current desc chain */
+ desc_write_offset = 0;
+ desc_current = desc->next;
+ desc = &vq->desc[desc_current];
+ desc_host_write_addr =
+ gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_host_write_addr))
+ goto rollback;
+ } else if (is_mrg_rxbuf) {
+ /* start with the next desc chain */
+ used_idx_round = vq->last_used_idx
+ & (vq->size - 1);
+ vq->used->ring[used_idx_round].id =
+ desc_chain_head;
+ vq->used->ring[used_idx_round].len =
+ desc_chain_len;
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_round]),
+ sizeof(vq->used->ring[
+ used_idx_round]));
+ vq->last_used_idx++;
+ extra_buffers++;
+ virtio_hdr->num_buffers++;
+ if (avail_idx == vq->last_used_idx)
+ goto rollback;
+
+ desc_current =
+ vq->avail->ring[(vq->last_used_idx) &
+ (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc = &vq->desc[desc_current];
+ desc_host_write_addr =
+ gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_host_write_addr))
+ goto rollback;
+
+ desc_chain_len = 0;
+ desc_write_offset = 0;
+ } else
+ goto rollback;
+ }
+
+ /* copy mbuf data */
+ copy_len = RTE_MIN(desc->len - desc_write_offset,
+ mbuf_len_left);
+ rte_memcpy((void *)(uintptr_t)desc_host_write_addr,
+ rte_pktmbuf_mtod_offset(mbuf, void *,
+ mbuf_len - mbuf_len_left),
+ copy_len);
+ vhost_log_write(dev, desc->addr + desc_write_offset,
+ copy_len);
+ mbuf_len_left -= copy_len;
+ desc_write_offset += copy_len;
+ desc_host_write_addr += copy_len;
+ desc_chain_len += copy_len;
+ }
+
+ used_idx_round = vq->last_used_idx & (vq->size - 1);
+ vq->used->ring[used_idx_round].id = desc_chain_head;
+ vq->used->ring[used_idx_round].len = desc_chain_len;
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used, ring[used_idx_round]),
+ sizeof(vq->used->ring[used_idx_round]));
+ vq->last_used_idx++;
+
+ return 0;
+
+rollback:
+ /* rollback on any error if last_used_idx update on-the-fly */
+ if (is_mrg_rxbuf)
+ vq->last_used_idx -= extra_buffers;
+
+error:
+ return 1;
+}
+
+static inline void __attribute__((always_inline))
+notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+ rte_smp_wmb();
+ vq->used->idx = vq->last_used_idx;
+ vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+ sizeof(vq->used->idx));
+ rte_mb();
+ if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+ && (vq->callfd >= 0))
+ eventfd_write(vq->callfd, (eventfd_t)1);
+}
+
uint16_t
rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
struct rte_mbuf **pkts, uint16_t count)
{
- struct virtio_net *dev = get_device(vid);
+ struct vhost_virtqueue *vq;
+ struct virtio_net *dev;
+ uint32_t pkt_idx = 0;
+ uint32_t pkt_left = 0;
+ uint32_t pkt_sent = 0;
+ uint32_t is_mrg_rxbuf = 0;
+ uint16_t avail_idx = 0;
+
+ /* precheck */
+ if (unlikely(count == 0))
+ return 0;

- if (!dev)
+ count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
+
+ dev = get_device(vid);
+ if (unlikely(!dev))
return 0;

- if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
- return virtio_dev_merge_rx(dev, queue_id, pkts, count);
- else
- return virtio_dev_rx(dev, queue_id, pkts, count);
+ if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb)))
+ return 0;
+
+ vq = dev->virtqueue[queue_id];
+ if (unlikely(!vq->enabled))
+ return 0;
+
+ if (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))
+ is_mrg_rxbuf = 1;
+
+ /* start enqueuing packets 1 by 1 */
+ pkt_idx = 0;
+ pkt_left = count;
+ avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+ while (1) {
+ if (loop_check(vq, avail_idx, pkt_left))
+ break;
+
+ if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
+ is_mrg_rxbuf))
+ break;
+
+ pkt_idx++;
+ pkt_sent++;
+ pkt_left--;
+ }
+
+ /* update used idx and kick the guest if necessary */
+ if (pkt_sent)
+ notify_guest(dev, vq);
+
+ return pkt_sent;
}

static void
--
2.7.4
Yuanhan Liu
2016-08-19 02:39:24 UTC
Permalink
Post by Zhihong Wang
This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.
---
lib/librte_vhost/vhost_rxtx.c | 212 ++++++++++++++++++++++++++++++++++++++++--
1 file changed, 205 insertions(+), 7 deletions(-)
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 08a73fd..8e6d782 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
}
-static void
+static inline void __attribute__((always_inline))
virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
{
if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -533,19 +533,217 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
return pkt_idx;
}
+static inline uint32_t __attribute__((always_inline))
+loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, uint32_t pkt_left)
+{
+ if (pkt_left == 0 || avail_idx == vq->last_used_idx)
+ return 1;
+
+ return 0;
+}
Hmmm, I don't see any benefit from making such a simple check into a
function.
Post by Zhihong Wang
+static inline uint32_t __attribute__((always_inline))
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint16_t avail_idx, struct rte_mbuf *mbuf,
+ uint32_t is_mrg_rxbuf)
+{
+ struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
+ struct vring_desc *desc;
+ uint64_t desc_host_write_addr = 0;
+ uint32_t desc_chain_head = 0;
+ uint32_t desc_chain_len = 0;
+ uint32_t desc_current = 0;
+ uint32_t desc_write_offset = 0;
+ uint32_t mbuf_len = 0;
+ uint32_t mbuf_len_left = 0;
+ uint32_t copy_len = 0;
The dequeue function uses var like desc_addr, desc_avail, desc_offset,
mbuf_avail, ..., I see no reason to use something different here. This
breaks the code consistency. Besides that, var name like desc_host_write_addr
looks redundant; desc_addr is much cleaner.

--yliu
Wang, Zhihong
2016-08-19 07:07:42 UTC
Permalink
-----Original Message-----
Sent: Friday, August 19, 2016 10:39 AM
Subject: Re: [PATCH v2 1/6] vhost: rewrite enqueue
Post by Zhihong Wang
This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.
---
lib/librte_vhost/vhost_rxtx.c | 212
++++++++++++++++++++++++++++++++++++++++--
Post by Zhihong Wang
1 file changed, 205 insertions(+), 7 deletions(-)
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 08a73fd..8e6d782 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t
qp_nb)
Post by Zhihong Wang
return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
}
-static void
+static inline void __attribute__((always_inline))
virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr
*net_hdr)
Post by Zhihong Wang
{
if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -533,19 +533,217 @@ virtio_dev_merge_rx(struct virtio_net *dev,
uint16_t queue_id,
Post by Zhihong Wang
return pkt_idx;
}
+static inline uint32_t __attribute__((always_inline))
+loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, uint32_t pkt_left)
+{
+ if (pkt_left == 0 || avail_idx == vq->last_used_idx)
+ return 1;
+
+ return 0;
+}
Hmmm, I don't see any benefit from making such a simple check into a
function.
It's for prefetch code later to be merged into this function.
Post by Zhihong Wang
+static inline uint32_t __attribute__((always_inline))
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint16_t avail_idx, struct rte_mbuf *mbuf,
+ uint32_t is_mrg_rxbuf)
+{
+ struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
+ struct vring_desc *desc;
+ uint64_t desc_host_write_addr = 0;
+ uint32_t desc_chain_head = 0;
+ uint32_t desc_chain_len = 0;
+ uint32_t desc_current = 0;
+ uint32_t desc_write_offset = 0;
+ uint32_t mbuf_len = 0;
+ uint32_t mbuf_len_left = 0;
+ uint32_t copy_len = 0;
The dequeue function uses var like desc_addr, desc_avail, desc_offset,
mbuf_avail, ..., I see no reason to use something different here. This
breaks the code consistency. Besides that, var name like desc_host_write_addr
looks redundant; desc_addr is much cleaner.
Okay.
--yliu
Zhihong Wang
2016-08-18 06:33:07 UTC
Permalink
This patch removes obsolete functions.

Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/vhost_rxtx.c | 408 ------------------------------------------
1 file changed, 408 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 8e6d782..939957d 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -125,414 +125,6 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
}
}

-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
- struct virtio_net_hdr_mrg_rxbuf hdr)
-{
- if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
- *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
- else
- *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
-}
-
-static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
- struct rte_mbuf *m, uint16_t desc_idx)
-{
- uint32_t desc_avail, desc_offset;
- uint32_t mbuf_avail, mbuf_offset;
- uint32_t cpy_len;
- struct vring_desc *desc;
- uint64_t desc_addr;
- struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-
- desc = &vq->desc[desc_idx];
- desc_addr = gpa_to_vva(dev, desc->addr);
- /*
- * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
- * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
- * otherwise stores offset on the stack instead of in a register.
- */
- if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
- return -1;
-
- rte_prefetch0((void *)(uintptr_t)desc_addr);
-
- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
- vhost_log_write(dev, desc->addr, dev->vhost_hlen);
- PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
- desc_offset = dev->vhost_hlen;
- desc_avail = desc->len - dev->vhost_hlen;
-
- mbuf_avail = rte_pktmbuf_data_len(m);
- mbuf_offset = 0;
- while (mbuf_avail != 0 || m->next != NULL) {
- /* done with current mbuf, fetch next */
- if (mbuf_avail == 0) {
- m = m->next;
-
- mbuf_offset = 0;
- mbuf_avail = rte_pktmbuf_data_len(m);
- }
-
- /* done with current desc buf, fetch next */
- if (desc_avail == 0) {
- if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
- /* Room in vring buffer is not enough */
- return -1;
- }
- if (unlikely(desc->next >= vq->size))
- return -1;
-
- desc = &vq->desc[desc->next];
- desc_addr = gpa_to_vva(dev, desc->addr);
- if (unlikely(!desc_addr))
- return -1;
-
- desc_offset = 0;
- desc_avail = desc->len;
- }
-
- cpy_len = RTE_MIN(desc_avail, mbuf_avail);
- rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
- rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
- cpy_len);
- vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
- PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
- cpy_len, 0);
-
- mbuf_avail -= cpy_len;
- mbuf_offset += cpy_len;
- desc_avail -= cpy_len;
- desc_offset += cpy_len;
- }
-
- return 0;
-}
-
-/**
- * This function adds buffers to the virtio devices RX virtqueue. Buffers can
- * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that are succesfully
- * added to the RX queue. This function works when the mbuf is scattered, but
- * it doesn't support the mergeable feature.
- */
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
- struct rte_mbuf **pkts, uint32_t count)
-{
- struct vhost_virtqueue *vq;
- uint16_t avail_idx, free_entries, start_idx;
- uint16_t desc_indexes[MAX_PKT_BURST];
- uint16_t used_idx;
- uint32_t i;
-
- LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
- if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
- RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
- dev->vid, __func__, queue_id);
- return 0;
- }
-
- vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
- return 0;
-
- avail_idx = *((volatile uint16_t *)&vq->avail->idx);
- start_idx = vq->last_used_idx;
- free_entries = avail_idx - start_idx;
- count = RTE_MIN(count, free_entries);
- count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
- if (count == 0)
- return 0;
-
- LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
- dev->vid, start_idx, start_idx + count);
-
- /* Retrieve all of the desc indexes first to avoid caching issues. */
- rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
- for (i = 0; i < count; i++) {
- used_idx = (start_idx + i) & (vq->size - 1);
- desc_indexes[i] = vq->avail->ring[used_idx];
- vq->used->ring[used_idx].id = desc_indexes[i];
- vq->used->ring[used_idx].len = pkts[i]->pkt_len +
- dev->vhost_hlen;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
-
- rte_prefetch0(&vq->desc[desc_indexes[0]]);
- for (i = 0; i < count; i++) {
- uint16_t desc_idx = desc_indexes[i];
- int err;
-
- err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
- if (unlikely(err)) {
- used_idx = (start_idx + i) & (vq->size - 1);
- vq->used->ring[used_idx].len = dev->vhost_hlen;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
-
- if (i + 1 < count)
- rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
- }
-
- rte_smp_wmb();
-
- *(volatile uint16_t *)&vq->used->idx += count;
- vq->last_used_idx += count;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
-
- /* flush used->idx update before we read avail->flags. */
- rte_mb();
-
- /* Kick the guest if necessary. */
- if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
- && (vq->callfd >= 0))
- eventfd_write(vq->callfd, (eventfd_t)1);
- return count;
-}
-
-static inline int
-fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
- uint32_t *allocated, uint32_t *vec_idx,
- struct buf_vector *buf_vec)
-{
- uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
- uint32_t vec_id = *vec_idx;
- uint32_t len = *allocated;
-
- while (1) {
- if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
- return -1;
-
- len += vq->desc[idx].len;
- buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
- buf_vec[vec_id].buf_len = vq->desc[idx].len;
- buf_vec[vec_id].desc_idx = idx;
- vec_id++;
-
- if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
- break;
-
- idx = vq->desc[idx].next;
- }
-
- *allocated = len;
- *vec_idx = vec_id;
-
- return 0;
-}
-
-/*
- * Returns -1 on fail, 0 on success
- */
-static inline int
-reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
- uint16_t *end, struct buf_vector *buf_vec)
-{
- uint16_t cur_idx;
- uint16_t avail_idx;
- uint32_t allocated = 0;
- uint32_t vec_idx = 0;
- uint16_t tries = 0;
-
- cur_idx = vq->last_used_idx;
-
- while (1) {
- avail_idx = *((volatile uint16_t *)&vq->avail->idx);
- if (unlikely(cur_idx == avail_idx))
- return -1;
-
- if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
- &vec_idx, buf_vec) < 0))
- return -1;
-
- cur_idx++;
- tries++;
-
- if (allocated >= size)
- break;
-
- /*
- * if we tried all available ring items, and still
- * can't get enough buf, it means something abnormal
- * happened.
- */
- if (unlikely(tries >= vq->size))
- return -1;
- }
-
- *end = cur_idx;
- return 0;
-}
-
-static inline uint32_t __attribute__((always_inline))
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
- uint16_t end_idx, struct rte_mbuf *m,
- struct buf_vector *buf_vec)
-{
- struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
- uint32_t vec_idx = 0;
- uint16_t start_idx = vq->last_used_idx;
- uint16_t cur_idx = start_idx;
- uint64_t desc_addr;
- uint32_t mbuf_offset, mbuf_avail;
- uint32_t desc_offset, desc_avail;
- uint32_t cpy_len;
- uint16_t desc_idx, used_idx;
-
- if (unlikely(m == NULL))
- return 0;
-
- LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
- dev->vid, cur_idx, end_idx);
-
- desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
- if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
- return 0;
-
- rte_prefetch0((void *)(uintptr_t)desc_addr);
-
- virtio_hdr.num_buffers = end_idx - start_idx;
- LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
- dev->vid, virtio_hdr.num_buffers);
-
- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
- vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
- PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
- desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
- desc_offset = dev->vhost_hlen;
-
- mbuf_avail = rte_pktmbuf_data_len(m);
- mbuf_offset = 0;
- while (mbuf_avail != 0 || m->next != NULL) {
- /* done with current desc buf, get the next one */
- if (desc_avail == 0) {
- desc_idx = buf_vec[vec_idx].desc_idx;
-
- if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
- /* Update used ring with desc information */
- used_idx = cur_idx++ & (vq->size - 1);
- vq->used->ring[used_idx].id = desc_idx;
- vq->used->ring[used_idx].len = desc_offset;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used,
- ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
-
- vec_idx++;
- desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
- if (unlikely(!desc_addr))
- return 0;
-
- /* Prefetch buffer address. */
- rte_prefetch0((void *)(uintptr_t)desc_addr);
- desc_offset = 0;
- desc_avail = buf_vec[vec_idx].buf_len;
- }
-
- /* done with current mbuf, get the next one */
- if (mbuf_avail == 0) {
- m = m->next;
-
- mbuf_offset = 0;
- mbuf_avail = rte_pktmbuf_data_len(m);
- }
-
- cpy_len = RTE_MIN(desc_avail, mbuf_avail);
- rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
- rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
- cpy_len);
- vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
- cpy_len);
- PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
- cpy_len, 0);
-
- mbuf_avail -= cpy_len;
- mbuf_offset += cpy_len;
- desc_avail -= cpy_len;
- desc_offset += cpy_len;
- }
-
- used_idx = cur_idx & (vq->size - 1);
- vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
- vq->used->ring[used_idx].len = desc_offset;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
-
- return end_idx - start_idx;
-}
-
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
- struct rte_mbuf **pkts, uint32_t count)
-{
- struct vhost_virtqueue *vq;
- uint32_t pkt_idx = 0, nr_used = 0;
- uint16_t end;
- struct buf_vector buf_vec[BUF_VECTOR_MAX];
-
- LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
- if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
- RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
- dev->vid, __func__, queue_id);
- return 0;
- }
-
- vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
- return 0;
-
- count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
- if (count == 0)
- return 0;
-
- for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
- uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
-
- if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
- &end, buf_vec) < 0)) {
- LOG_DEBUG(VHOST_DATA,
- "(%d) failed to get enough desc from vring\n",
- dev->vid);
- break;
- }
-
- nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
- pkts[pkt_idx], buf_vec);
- rte_smp_wmb();
-
- *(volatile uint16_t *)&vq->used->idx += nr_used;
- vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
- vq->last_used_idx += nr_used;
- }
-
- if (likely(pkt_idx)) {
- /* flush used->idx update before we read avail->flags. */
- rte_mb();
-
- /* Kick the guest if necessary. */
- if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
- && (vq->callfd >= 0))
- eventfd_write(vq->callfd, (eventfd_t)1);
- }
-
- return pkt_idx;
-}
-
static inline uint32_t __attribute__((always_inline))
loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, uint32_t pkt_left)
{
--
2.7.4
Yuanhan Liu
2016-08-19 02:32:48 UTC
Permalink
Post by Zhihong Wang
This patch removes obsolete functions.
Splitting patches doesn't work in this way: this should be in the first
patch. Otherwise, the build breaks in the first patch, as some functions
are defined but not used.

--yliu
Wang, Zhihong
2016-08-19 07:08:44 UTC
Permalink
-----Original Message-----
Sent: Friday, August 19, 2016 10:33 AM
Subject: Re: [PATCH v2 2/6] vhost: remove obsolete
Post by Zhihong Wang
This patch removes obsolete functions.
Splitting patches doesn't work in this way: this should be in the first
patch. Otherwise, build breaks in the first patch, as some functions are
defined but not used.
Thanks. I'll send out v3 soon, also to fix a small glitch that occurs
while running on older platforms like SNB and IVB.
--yliu
Zhihong Wang
2016-08-18 06:33:08 UTC
Permalink
This patch removes useless volatile attribute to allow compiler
optimization.

Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/vhost-net.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 38593a2..51fdf3d 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -71,7 +71,7 @@ struct vhost_virtqueue {
uint32_t size;

/* Last index used on the available ring */
- volatile uint16_t last_used_idx;
+ uint16_t last_used_idx;
#define VIRTIO_INVALID_EVENTFD (-1)
#define VIRTIO_UNINITIALIZED_EVENTFD (-2)
--
2.7.4
Zhihong Wang
2016-08-18 06:33:09 UTC
Permalink
This patch adds descriptor prefetch to hide cache access latency.

Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/vhost_rxtx.c | 5 +++++
1 file changed, 5 insertions(+)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 939957d..7db83d0 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -131,6 +131,11 @@ loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, uint32_t pkt_left)
if (pkt_left == 0 || avail_idx == vq->last_used_idx)
return 1;

+ /* prefetch the next desc */
+ if (pkt_left > 1 && avail_idx != vq->last_used_idx + 1)
+ rte_prefetch0(&vq->desc[vq->avail->ring[
+ (vq->last_used_idx + 1) & (vq->size - 1)]]);
+
return 0;
}
--
2.7.4
Zhihong Wang
2016-08-18 06:33:10 UTC
Permalink
This patch enables batch update of the used ring for better efficiency.

Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/vhost-net.h | 4 +++
lib/librte_vhost/vhost_rxtx.c | 68 +++++++++++++++++++++++++++++++++----------
lib/librte_vhost/virtio-net.c | 15 ++++++++--
3 files changed, 68 insertions(+), 19 deletions(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 51fdf3d..a15182c 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -85,6 +85,10 @@ struct vhost_virtqueue {

/* Physical address of used ring, for logging */
uint64_t log_guest_addr;
+
+ /* Shadow used ring for performance */
+ struct vring_used_elem *shadow_used_ring;
+ uint32_t shadow_used_idx;
} __rte_cache_aligned;

/* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 7db83d0..60d63d3 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -155,7 +155,6 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint32_t mbuf_len_left = 0;
uint32_t copy_len = 0;
uint32_t extra_buffers = 0;
- uint32_t used_idx_round = 0;

/* start with the first mbuf of the packet */
mbuf_len = rte_pktmbuf_data_len(mbuf);
@@ -207,17 +206,11 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
goto rollback;
} else if (is_mrg_rxbuf) {
/* start with the next desc chain */
- used_idx_round = vq->last_used_idx
- & (vq->size - 1);
- vq->used->ring[used_idx_round].id =
+ vq->shadow_used_ring[vq->shadow_used_idx].id =
desc_chain_head;
- vq->used->ring[used_idx_round].len =
+ vq->shadow_used_ring[vq->shadow_used_idx].len =
desc_chain_len;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used,
- ring[used_idx_round]),
- sizeof(vq->used->ring[
- used_idx_round]));
+ vq->shadow_used_idx++;
vq->last_used_idx++;
extra_buffers++;
virtio_hdr->num_buffers++;
@@ -255,12 +248,9 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
desc_chain_len += copy_len;
}

- used_idx_round = vq->last_used_idx & (vq->size - 1);
- vq->used->ring[used_idx_round].id = desc_chain_head;
- vq->used->ring[used_idx_round].len = desc_chain_len;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx_round]),
- sizeof(vq->used->ring[used_idx_round]));
+ vq->shadow_used_ring[vq->shadow_used_idx].id = desc_chain_head;
+ vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
+ vq->shadow_used_idx++;
vq->last_used_idx++;

return 0;
@@ -275,6 +265,45 @@ error:
}

static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t used_idx_start)
+{
+ if (used_idx_start + vq->shadow_used_idx < vq->size) {
+ rte_memcpy(&vq->used->ring[used_idx_start],
+ &vq->shadow_used_ring[0],
+ vq->shadow_used_idx *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_start]),
+ vq->shadow_used_idx *
+ sizeof(struct vring_used_elem));
+ } else {
+ uint32_t part_1 = vq->size - used_idx_start;
+ uint32_t part_2 = vq->shadow_used_idx - part_1;
+
+ rte_memcpy(&vq->used->ring[used_idx_start],
+ &vq->shadow_used_ring[0],
+ part_1 *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_start]),
+ part_1 *
+ sizeof(struct vring_used_elem));
+ rte_memcpy(&vq->used->ring[0],
+ &vq->shadow_used_ring[part_1],
+ part_2 *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[0]),
+ part_2 *
+ sizeof(struct vring_used_elem));
+ }
+}
+
+static inline void __attribute__((always_inline))
notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
{
rte_smp_wmb();
@@ -293,6 +322,7 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
{
struct vhost_virtqueue *vq;
struct virtio_net *dev;
+ uint32_t used_idx_start = 0;
uint32_t pkt_idx = 0;
uint32_t pkt_left = 0;
uint32_t pkt_sent = 0;
@@ -322,6 +352,8 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
/* start enqueuing packets 1 by 1 */
pkt_idx = 0;
pkt_left = count;
+ vq->shadow_used_idx = 0;
+ used_idx_start = vq->last_used_idx & (vq->size - 1);
avail_idx = *((volatile uint16_t *)&vq->avail->idx);
while (1) {
if (loop_check(vq, avail_idx, pkt_left))
@@ -336,6 +368,10 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
pkt_left--;
}

+ /* batch update used ring for better performance */
+ if (likely(vq->shadow_used_idx > 0))
+ update_used_ring(dev, vq, used_idx_start);
+
/* update used idx and kick the guest if necessary */
if (pkt_sent)
notify_guest(dev, vq);
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
index 1785695..87d09fa 100644
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -152,10 +152,14 @@ cleanup_device(struct virtio_net *dev, int destroy)
static void
free_device(struct virtio_net *dev)
{
+ struct vhost_virtqueue *vq;
uint32_t i;

- for (i = 0; i < dev->virt_qp_nb; i++)
- rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+ for (i = 0; i < dev->virt_qp_nb; i++) {
+ vq = dev->virtqueue[i * VIRTIO_QNUM];
+ rte_free(vq->shadow_used_ring);
+ rte_free(vq);
+ }

rte_free(dev);
}
@@ -418,13 +422,18 @@ int
vhost_set_vring_num(int vid, struct vhost_vring_state *state)
{
struct virtio_net *dev;
+ struct vhost_virtqueue *vq;

dev = get_device(vid);
if (dev == NULL)
return -1;

/* State->index refers to the queue index. The txq is 1, rxq is 0. */
- dev->virtqueue[state->index]->size = state->num;
+ vq = dev->virtqueue[state->index];
+ vq->size = state->num;
+ vq->shadow_used_ring = rte_malloc("",
+ vq->size * sizeof(struct vring_used_elem),
+ RTE_CACHE_LINE_SIZE);

return 0;
}
--
2.7.4
Zhihong Wang
2016-08-18 06:33:11 UTC
Permalink
This patch reorders the code to delay virtio header write to optimize cache
access efficiency for cases where the mrg_rxbuf feature is turned on. It
reduces CPU pipeline stall cycles significantly.


Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/vhost_rxtx.c | 23 ++++++++++++++++-------
1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 60d63d3..15f7f9c 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -154,6 +154,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint32_t mbuf_len = 0;
uint32_t mbuf_len_left = 0;
uint32_t copy_len = 0;
+ uint32_t copy_virtio_hdr = 0;
uint32_t extra_buffers = 0;

/* start with the first mbuf of the packet */
@@ -168,18 +169,17 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
if (unlikely(!desc_host_write_addr))
goto error;

- /* handle virtio header */
+ /*
+ * handle virtio header, the actual write operation
+ * is delayed for cache optimization.
+ */
virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)
(uintptr_t)desc_host_write_addr;
- memset((void *)(uintptr_t)&(virtio_hdr->hdr),
- 0, dev->vhost_hlen);
- virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+ copy_virtio_hdr = 1;
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
desc_write_offset = dev->vhost_hlen;
desc_chain_len = desc_write_offset;
desc_host_write_addr += desc_write_offset;
- if (is_mrg_rxbuf)
- virtio_hdr->num_buffers = 1;

/* start copy from mbuf to desc */
while (1) {
@@ -233,9 +233,18 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
goto rollback;
}

- /* copy mbuf data */
+ /* copy virtio header and mbuf data */
copy_len = RTE_MIN(desc->len - desc_write_offset,
mbuf_len_left);
+ if (copy_virtio_hdr) {
+ copy_virtio_hdr = 0;
+ memset((void *)(uintptr_t)&(virtio_hdr->hdr),
+ 0, dev->vhost_hlen);
+ virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+ if (is_mrg_rxbuf)
+ virtio_hdr->num_buffers = extra_buffers + 1;
+ }
+
rte_memcpy((void *)(uintptr_t)desc_host_write_addr,
rte_pktmbuf_mtod_offset(mbuf, void *,
mbuf_len - mbuf_len_left),
--
2.7.4
Zhihong Wang
2016-08-19 05:43:45 UTC
Permalink
This patch set optimizes the vhost enqueue function.

It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
significantly by optimizing cache access, which means:

* For fast frontends (eg. DPDK virtio pmd), higher performance (maximum
throughput) can be achieved.

* For slow frontends (eg. kernel virtio-net), better scalability can be
achieved, each vhost core can support more connections since it takes
less cycles to handle each single frontend.

The main optimization techniques are:

1. Reorder code to reduce CPU pipeline stall cycles.

2. Batch update the used ring for better efficiency.

3. Prefetch descriptor to hide cache latency.

4. Remove useless volatile attribute to allow compiler optimization.

Code reordering and batch used ring update bring most of the performance
improvements.

In the existing code there're 2 callbacks for vhost enqueue:

* virtio_dev_merge_rx for mrg_rxbuf turned on cases.

* virtio_dev_rx for mrg_rxbuf turned off cases.

The performance of the existing code is not optimal, especially when the
mrg_rxbuf feature turned on. Also, having 2 separated functions increases
maintenance efforts.

---
Changes in v3:

1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

2. Rename variables to follow naming convention.

3. Rewrite enqueue and delete the obsolete in the same patch.

---
Changes in v2:

1. Split the big function into several small ones.

2. Use multiple patches to explain each optimization.

3. Add comments.

Zhihong Wang (5):
vhost: rewrite enqueue
vhost: remove useless volatile
vhost: add desc prefetch
vhost: batch update used ring
vhost: optimize cache access

lib/librte_vhost/vhost-net.h | 6 +-
lib/librte_vhost/vhost_rxtx.c | 573 +++++++++++++++---------------------------
lib/librte_vhost/virtio-net.c | 15 +-
3 files changed, 220 insertions(+), 374 deletions(-)
--
2.7.4
Zhihong Wang
2016-08-19 05:43:47 UTC
Permalink
This patch removes useless volatile attribute to allow compiler
optimization.

Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/vhost-net.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 38593a2..51fdf3d 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -71,7 +71,7 @@ struct vhost_virtqueue {
uint32_t size;

/* Last index used on the available ring */
- volatile uint16_t last_used_idx;
+ uint16_t last_used_idx;
#define VIRTIO_INVALID_EVENTFD (-1)
#define VIRTIO_UNINITIALIZED_EVENTFD (-2)
--
2.7.4
Zhihong Wang
2016-08-19 05:43:46 UTC
Permalink
This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.

---
Changes in v3:

1. Rewrite enqueue and delete the obsolete in the same patch.

Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/vhost_rxtx.c | 537 +++++++++++++-----------------------------
1 file changed, 160 insertions(+), 377 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 08a73fd..b09a9c3 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
}

-static void
+static inline void __attribute__((always_inline))
virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
{
if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -125,427 +125,210 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
}
}

-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
- struct virtio_net_hdr_mrg_rxbuf hdr)
+static inline uint32_t __attribute__((always_inline))
+loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, uint32_t pkt_left)
{
- if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
- *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
- else
- *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+ if (pkt_left == 0 || avail_idx == vq->last_used_idx)
+ return 1;
+
+ return 0;
}

-static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
- struct rte_mbuf *m, uint16_t desc_idx)
+static inline uint32_t __attribute__((always_inline))
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint16_t avail_idx, struct rte_mbuf *mbuf,
+ uint32_t is_mrg_rxbuf)
{
- uint32_t desc_avail, desc_offset;
- uint32_t mbuf_avail, mbuf_offset;
- uint32_t cpy_len;
+ struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
struct vring_desc *desc;
- uint64_t desc_addr;
- struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-
- desc = &vq->desc[desc_idx];
+ uint64_t desc_addr = 0;
+ uint32_t desc_chain_head = 0;
+ uint32_t desc_chain_len = 0;
+ uint32_t desc_current = 0;
+ uint32_t desc_offset = 0;
+ uint32_t mbuf_len = 0;
+ uint32_t mbuf_avail = 0;
+ uint32_t copy_len = 0;
+ uint32_t extra_buffers = 0;
+ uint32_t used_idx_round = 0;
+
+ /* start with the first mbuf of the packet */
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_avail = mbuf_len;
+
+ /* get the current desc */
+ desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc = &vq->desc[desc_current];
desc_addr = gpa_to_vva(dev, desc->addr);
- /*
- * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
- * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
- * otherwise stores offset on the stack instead of in a register.
- */
- if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
- return -1;
-
- rte_prefetch0((void *)(uintptr_t)desc_addr);
+ if (unlikely(!desc_addr))
+ goto error;

- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
+ /* handle virtio header */
+ virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
+ virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
- PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
desc_offset = dev->vhost_hlen;
- desc_avail = desc->len - dev->vhost_hlen;
+ desc_chain_len = desc_offset;
+ desc_addr += desc_offset;
+ if (is_mrg_rxbuf)
+ virtio_hdr->num_buffers = 1;

- mbuf_avail = rte_pktmbuf_data_len(m);
- mbuf_offset = 0;
- while (mbuf_avail != 0 || m->next != NULL) {
- /* done with current mbuf, fetch next */
- if (mbuf_avail == 0) {
- m = m->next;
-
- mbuf_offset = 0;
- mbuf_avail = rte_pktmbuf_data_len(m);
+ /* start copy from mbuf to desc */
+ while (1) {
+ /* get the next mbuf if the current done */
+ if (!mbuf_avail) {
+ if (mbuf->next) {
+ mbuf = mbuf->next;
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_avail = mbuf_len;
+ } else
+ break;
}

- /* done with current desc buf, fetch next */
- if (desc_avail == 0) {
- if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
- /* Room in vring buffer is not enough */
- return -1;
- }
- if (unlikely(desc->next >= vq->size))
- return -1;
-
- desc = &vq->desc[desc->next];
- desc_addr = gpa_to_vva(dev, desc->addr);
- if (unlikely(!desc_addr))
- return -1;
-
- desc_offset = 0;
- desc_avail = desc->len;
+ /* get the next desc if the current done */
+ if (desc->len <= desc_offset) {
+ if (desc->flags & VRING_DESC_F_NEXT) {
+ /* go on with the current desc chain */
+ desc_offset = 0;
+ desc_current = desc->next;
+ desc = &vq->desc[desc_current];
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ goto rollback;
+ } else if (is_mrg_rxbuf) {
+ /* start with the next desc chain */
+ used_idx_round = vq->last_used_idx
+ & (vq->size - 1);
+ vq->used->ring[used_idx_round].id =
+ desc_chain_head;
+ vq->used->ring[used_idx_round].len =
+ desc_chain_len;
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_round]),
+ sizeof(vq->used->ring[
+ used_idx_round]));
+ vq->last_used_idx++;
+ extra_buffers++;
+ virtio_hdr->num_buffers++;
+ if (avail_idx == vq->last_used_idx)
+ goto rollback;
+
+ desc_current =
+ vq->avail->ring[(vq->last_used_idx) &
+ (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc = &vq->desc[desc_current];
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ goto rollback;
+
+ desc_chain_len = 0;
+ desc_offset = 0;
+ } else
+ goto rollback;
}

- cpy_len = RTE_MIN(desc_avail, mbuf_avail);
- rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
- rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
- cpy_len);
- vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
- PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
- cpy_len, 0);
-
- mbuf_avail -= cpy_len;
- mbuf_offset += cpy_len;
- desc_avail -= cpy_len;
- desc_offset += cpy_len;
- }
-
- return 0;
-}
-
-/**
- * This function adds buffers to the virtio devices RX virtqueue. Buffers can
- * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that are succesfully
- * added to the RX queue. This function works when the mbuf is scattered, but
- * it doesn't support the mergeable feature.
- */
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
- struct rte_mbuf **pkts, uint32_t count)
-{
- struct vhost_virtqueue *vq;
- uint16_t avail_idx, free_entries, start_idx;
- uint16_t desc_indexes[MAX_PKT_BURST];
- uint16_t used_idx;
- uint32_t i;
-
- LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
- if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
- RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
- dev->vid, __func__, queue_id);
- return 0;
+ /* copy mbuf data */
+ copy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+ rte_memcpy((void *)(uintptr_t)desc_addr,
+ rte_pktmbuf_mtod_offset(mbuf, void *,
+ mbuf_len - mbuf_avail),
+ copy_len);
+ vhost_log_write(dev, desc->addr + desc_offset, copy_len);
+ mbuf_avail -= copy_len;
+ desc_offset += copy_len;
+ desc_addr += copy_len;
+ desc_chain_len += copy_len;
}

- vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
- return 0;
-
- avail_idx = *((volatile uint16_t *)&vq->avail->idx);
- start_idx = vq->last_used_idx;
- free_entries = avail_idx - start_idx;
- count = RTE_MIN(count, free_entries);
- count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
- if (count == 0)
- return 0;
-
- LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
- dev->vid, start_idx, start_idx + count);
-
- /* Retrieve all of the desc indexes first to avoid caching issues. */
- rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
- for (i = 0; i < count; i++) {
- used_idx = (start_idx + i) & (vq->size - 1);
- desc_indexes[i] = vq->avail->ring[used_idx];
- vq->used->ring[used_idx].id = desc_indexes[i];
- vq->used->ring[used_idx].len = pkts[i]->pkt_len +
- dev->vhost_hlen;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
+ used_idx_round = vq->last_used_idx & (vq->size - 1);
+ vq->used->ring[used_idx_round].id = desc_chain_head;
+ vq->used->ring[used_idx_round].len = desc_chain_len;
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used, ring[used_idx_round]),
+ sizeof(vq->used->ring[used_idx_round]));
+ vq->last_used_idx++;

- rte_prefetch0(&vq->desc[desc_indexes[0]]);
- for (i = 0; i < count; i++) {
- uint16_t desc_idx = desc_indexes[i];
- int err;
+ return 0;

- err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
- if (unlikely(err)) {
- used_idx = (start_idx + i) & (vq->size - 1);
- vq->used->ring[used_idx].len = dev->vhost_hlen;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
+rollback:
+ /* rollback on any error if last_used_idx update on-the-fly */
+ if (is_mrg_rxbuf)
+ vq->last_used_idx -= extra_buffers;

- if (i + 1 < count)
- rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
- }
+error:
+ return 1;
+}

+static inline void __attribute__((always_inline))
+notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
rte_smp_wmb();
-
- *(volatile uint16_t *)&vq->used->idx += count;
- vq->last_used_idx += count;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
-
- /* flush used->idx update before we read avail->flags. */
+ vq->used->idx = vq->last_used_idx;
+ vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+ sizeof(vq->used->idx));
rte_mb();
-
- /* Kick the guest if necessary. */
if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
&& (vq->callfd >= 0))
eventfd_write(vq->callfd, (eventfd_t)1);
- return count;
-}
-
-static inline int
-fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
- uint32_t *allocated, uint32_t *vec_idx,
- struct buf_vector *buf_vec)
-{
- uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
- uint32_t vec_id = *vec_idx;
- uint32_t len = *allocated;
-
- while (1) {
- if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
- return -1;
-
- len += vq->desc[idx].len;
- buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
- buf_vec[vec_id].buf_len = vq->desc[idx].len;
- buf_vec[vec_id].desc_idx = idx;
- vec_id++;
-
- if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
- break;
-
- idx = vq->desc[idx].next;
- }
-
- *allocated = len;
- *vec_idx = vec_id;
-
- return 0;
-}
-
-/*
- * Returns -1 on fail, 0 on success
- */
-static inline int
-reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
- uint16_t *end, struct buf_vector *buf_vec)
-{
- uint16_t cur_idx;
- uint16_t avail_idx;
- uint32_t allocated = 0;
- uint32_t vec_idx = 0;
- uint16_t tries = 0;
-
- cur_idx = vq->last_used_idx;
-
- while (1) {
- avail_idx = *((volatile uint16_t *)&vq->avail->idx);
- if (unlikely(cur_idx == avail_idx))
- return -1;
-
- if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
- &vec_idx, buf_vec) < 0))
- return -1;
-
- cur_idx++;
- tries++;
-
- if (allocated >= size)
- break;
-
- /*
- * if we tried all available ring items, and still
- * can't get enough buf, it means something abnormal
- * happened.
- */
- if (unlikely(tries >= vq->size))
- return -1;
- }
-
- *end = cur_idx;
- return 0;
}

-static inline uint32_t __attribute__((always_inline))
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
- uint16_t end_idx, struct rte_mbuf *m,
- struct buf_vector *buf_vec)
+uint16_t
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+ struct rte_mbuf **pkts, uint16_t count)
{
- struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
- uint32_t vec_idx = 0;
- uint16_t start_idx = vq->last_used_idx;
- uint16_t cur_idx = start_idx;
- uint64_t desc_addr;
- uint32_t mbuf_offset, mbuf_avail;
- uint32_t desc_offset, desc_avail;
- uint32_t cpy_len;
- uint16_t desc_idx, used_idx;
-
- if (unlikely(m == NULL))
+ struct vhost_virtqueue *vq;
+ struct virtio_net *dev;
+ uint32_t pkt_idx = 0;
+ uint32_t pkt_left = 0;
+ uint32_t pkt_sent = 0;
+ uint32_t is_mrg_rxbuf = 0;
+ uint16_t avail_idx = 0;
+
+ /* precheck */
+ if (unlikely(count == 0))
return 0;

- LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
- dev->vid, cur_idx, end_idx);
+ count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);

- desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
- if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
+ dev = get_device(vid);
+ if (unlikely(!dev))
return 0;

- rte_prefetch0((void *)(uintptr_t)desc_addr);
-
- virtio_hdr.num_buffers = end_idx - start_idx;
- LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
- dev->vid, virtio_hdr.num_buffers);
-
- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
- vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
- PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
- desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
- desc_offset = dev->vhost_hlen;
-
- mbuf_avail = rte_pktmbuf_data_len(m);
- mbuf_offset = 0;
- while (mbuf_avail != 0 || m->next != NULL) {
- /* done with current desc buf, get the next one */
- if (desc_avail == 0) {
- desc_idx = buf_vec[vec_idx].desc_idx;
-
- if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
- /* Update used ring with desc information */
- used_idx = cur_idx++ & (vq->size - 1);
- vq->used->ring[used_idx].id = desc_idx;
- vq->used->ring[used_idx].len = desc_offset;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used,
- ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
-
- vec_idx++;
- desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
- if (unlikely(!desc_addr))
- return 0;
-
- /* Prefetch buffer address. */
- rte_prefetch0((void *)(uintptr_t)desc_addr);
- desc_offset = 0;
- desc_avail = buf_vec[vec_idx].buf_len;
- }
-
- /* done with current mbuf, get the next one */
- if (mbuf_avail == 0) {
- m = m->next;
-
- mbuf_offset = 0;
- mbuf_avail = rte_pktmbuf_data_len(m);
- }
-
- cpy_len = RTE_MIN(desc_avail, mbuf_avail);
- rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
- rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
- cpy_len);
- vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
- cpy_len);
- PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
- cpy_len, 0);
-
- mbuf_avail -= cpy_len;
- mbuf_offset += cpy_len;
- desc_avail -= cpy_len;
- desc_offset += cpy_len;
- }
-
- used_idx = cur_idx & (vq->size - 1);
- vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
- vq->used->ring[used_idx].len = desc_offset;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
-
- return end_idx - start_idx;
-}
-
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
- struct rte_mbuf **pkts, uint32_t count)
-{
- struct vhost_virtqueue *vq;
- uint32_t pkt_idx = 0, nr_used = 0;
- uint16_t end;
- struct buf_vector buf_vec[BUF_VECTOR_MAX];
-
- LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
- if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
- RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
- dev->vid, __func__, queue_id);
+ if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb)))
return 0;
- }

vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
+ if (unlikely(!vq->enabled))
return 0;

- count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
- if (count == 0)
- return 0;
+ if (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))
+ is_mrg_rxbuf = 1;

- for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
- uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
-
- if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
- &end, buf_vec) < 0)) {
- LOG_DEBUG(VHOST_DATA,
- "(%d) failed to get enough desc from vring\n",
- dev->vid);
+ /* start enqueuing packets 1 by 1 */
+ pkt_idx = 0;
+ pkt_left = count;
+ avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+ while (1) {
+ if (loop_check(vq, avail_idx, pkt_left))
break;
- }
-
- nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
- pkts[pkt_idx], buf_vec);
- rte_smp_wmb();

- *(volatile uint16_t *)&vq->used->idx += nr_used;
- vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
- vq->last_used_idx += nr_used;
- }
-
- if (likely(pkt_idx)) {
- /* flush used->idx update before we read avail->flags. */
- rte_mb();
+ if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
+ is_mrg_rxbuf))
+ break;

- /* Kick the guest if necessary. */
- if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
- && (vq->callfd >= 0))
- eventfd_write(vq->callfd, (eventfd_t)1);
+ pkt_idx++;
+ pkt_sent++;
+ pkt_left--;
}

- return pkt_idx;
-}
-
-uint16_t
-rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
- struct rte_mbuf **pkts, uint16_t count)
-{
- struct virtio_net *dev = get_device(vid);
-
- if (!dev)
- return 0;
+ /* update used idx and kick the guest if necessary */
+ if (pkt_sent)
+ notify_guest(dev, vq);

- if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
- return virtio_dev_merge_rx(dev, queue_id, pkts, count);
- else
- return virtio_dev_rx(dev, queue_id, pkts, count);
+ return pkt_sent;
}

static void
--
2.7.4
Maxime Coquelin
2016-08-22 09:35:47 UTC
Permalink
Post by Zhihong Wang
This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.
---
1. Rewrite enqueue and delete the obsolete in the same patch.
---
lib/librte_vhost/vhost_rxtx.c | 537 +++++++++++++-----------------------------
1 file changed, 160 insertions(+), 377 deletions(-)
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 08a73fd..b09a9c3 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
}
-static void
+static inline void __attribute__((always_inline))
virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
{
if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -125,427 +125,210 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
}
}
-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
- struct virtio_net_hdr_mrg_rxbuf hdr)
+static inline uint32_t __attribute__((always_inline))
+loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, uint32_t pkt_left)
Creating a function just for doing this doesn't make much sense.
And the function name doesn't help.
I think you should just remove this function.
Post by Zhihong Wang
{
- if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
- *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
- else
- *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+ if (pkt_left == 0 || avail_idx == vq->last_used_idx)
+ return 1;
+
+ return 0;
}
-static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
- struct rte_mbuf *m, uint16_t desc_idx)
+static inline uint32_t __attribute__((always_inline))
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint16_t avail_idx, struct rte_mbuf *mbuf,
+ uint32_t is_mrg_rxbuf)
{
- uint32_t desc_avail, desc_offset;
- uint32_t mbuf_avail, mbuf_offset;
- uint32_t cpy_len;
+ struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
struct vring_desc *desc;
- uint64_t desc_addr;
- struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-
- desc = &vq->desc[desc_idx];
+ uint64_t desc_addr = 0;
+ uint32_t desc_chain_head = 0;
+ uint32_t desc_chain_len = 0;
+ uint32_t desc_current = 0;
+ uint32_t desc_offset = 0;
+ uint32_t mbuf_len = 0;
+ uint32_t mbuf_avail = 0;
+ uint32_t copy_len = 0;
+ uint32_t extra_buffers = 0;
+ uint32_t used_idx_round = 0;
Most of these variables don't need to be initialized.
Post by Zhihong Wang
+
+ /* start with the first mbuf of the packet */
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_avail = mbuf_len;
+
+ /* get the current desc */
+ desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc = &vq->desc[desc_current];
desc_addr = gpa_to_vva(dev, desc->addr);
- /*
- * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
- * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
- * otherwise stores offset on the stack instead of in a register.
- */
- if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
- return -1;
-
- rte_prefetch0((void *)(uintptr_t)desc_addr);
+ if (unlikely(!desc_addr))
+ goto error;
- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
+ /* handle virtio header */
+ virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
+ virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
Parenthesis around virtio_hdr->hdr shouldn't be needed.
Post by Zhihong Wang
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
- PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
Looks like you remove the PRINT_PACKET calls.
Does it impact performance?
In any case, it should be mentioned in the commit message.
Post by Zhihong Wang
-
desc_offset = dev->vhost_hlen;
- desc_avail = desc->len - dev->vhost_hlen;
+ desc_chain_len = desc_offset;
+ desc_addr += desc_offset;
+ if (is_mrg_rxbuf)
+ virtio_hdr->num_buffers = 1;
- mbuf_avail = rte_pktmbuf_data_len(m);
- mbuf_offset = 0;
- while (mbuf_avail != 0 || m->next != NULL) {
- /* done with current mbuf, fetch next */
- if (mbuf_avail == 0) {
- m = m->next;
-
- mbuf_offset = 0;
- mbuf_avail = rte_pktmbuf_data_len(m);
+ /* start copy from mbuf to desc */
+ while (1) {
Please avoid while(1) when you can check for a real condition:
while (mbuf_avail || mbuf->next) ?

Compiler should optimize this properly, no?
Post by Zhihong Wang
+ /* get the next mbuf if the current done */
+ if (!mbuf_avail) {
+ if (mbuf->next) {
+ mbuf = mbuf->next;
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_avail = mbuf_len;
+ } else
+ break;
}
- /* done with current desc buf, fetch next */
- if (desc_avail == 0) {
- if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
- /* Room in vring buffer is not enough */
- return -1;
- }
- if (unlikely(desc->next >= vq->size))
- return -1;
-
- desc = &vq->desc[desc->next];
- desc_addr = gpa_to_vva(dev, desc->addr);
- if (unlikely(!desc_addr))
- return -1;
-
- desc_offset = 0;
- desc_avail = desc->len;
+ /* get the next desc if the current done */
+ if (desc->len <= desc_offset) {
+ if (desc->flags & VRING_DESC_F_NEXT) {
+ /* go on with the current desc chain */
+ desc_offset = 0;
+ desc_current = desc->next;
+ desc = &vq->desc[desc_current];
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ goto rollback;
you could goto directly to error, and decrement last_used_idx directly
under "error"'s goto since extra_buffers will be zero otherwise.

Also, except desc_current affectation, all the above code is common
with mergeable case, so you should avoid duplication.
Post by Zhihong Wang
+ } else if (is_mrg_rxbuf) {
+ /* start with the next desc chain */
+ used_idx_round = vq->last_used_idx
+ & (vq->size - 1);
+ vq->used->ring[used_idx_round].id =
+ desc_chain_head;
+ vq->used->ring[used_idx_round].len =
+ desc_chain_len;
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_round]),
+ sizeof(vq->used->ring[
+ used_idx_round]));
+ vq->last_used_idx++;
+ extra_buffers++;
+ virtio_hdr->num_buffers++;
+ if (avail_idx == vq->last_used_idx)
+ goto rollback;
+
+ desc_current =
+ vq->avail->ring[(vq->last_used_idx) &
+ (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc = &vq->desc[desc_current];
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ goto rollback;
+
+ desc_chain_len = 0;
+ desc_offset = 0;
+ } else
+ goto rollback;
}
- cpy_len = RTE_MIN(desc_avail, mbuf_avail);
- rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
- rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
- cpy_len);
- vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
- PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
- cpy_len, 0);
-
- mbuf_avail -= cpy_len;
- mbuf_offset += cpy_len;
- desc_avail -= cpy_len;
- desc_offset += cpy_len;
- }
-
- return 0;
-}
-
-/**
- * This function adds buffers to the virtio devices RX virtqueue. Buffers can
- * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that are succesfully
- * added to the RX queue. This function works when the mbuf is scattered, but
- * it doesn't support the mergeable feature.
- */
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
- struct rte_mbuf **pkts, uint32_t count)
-{
- struct vhost_virtqueue *vq;
- uint16_t avail_idx, free_entries, start_idx;
- uint16_t desc_indexes[MAX_PKT_BURST];
- uint16_t used_idx;
- uint32_t i;
-
- LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
- if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
- RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
- dev->vid, __func__, queue_id);
- return 0;
+ /* copy mbuf data */
+ copy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+ rte_memcpy((void *)(uintptr_t)desc_addr,
+ rte_pktmbuf_mtod_offset(mbuf, void *,
+ mbuf_len - mbuf_avail),
+ copy_len);
+ vhost_log_write(dev, desc->addr + desc_offset, copy_len);
+ mbuf_avail -= copy_len;
+ desc_offset += copy_len;
+ desc_addr += copy_len;
+ desc_chain_len += copy_len;
}
- vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
- return 0;
-
- avail_idx = *((volatile uint16_t *)&vq->avail->idx);
- start_idx = vq->last_used_idx;
- free_entries = avail_idx - start_idx;
- count = RTE_MIN(count, free_entries);
- count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
- if (count == 0)
- return 0;
-
- LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
- dev->vid, start_idx, start_idx + count);
-
- /* Retrieve all of the desc indexes first to avoid caching issues. */
- rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
- for (i = 0; i < count; i++) {
- used_idx = (start_idx + i) & (vq->size - 1);
- desc_indexes[i] = vq->avail->ring[used_idx];
- vq->used->ring[used_idx].id = desc_indexes[i];
- vq->used->ring[used_idx].len = pkts[i]->pkt_len +
- dev->vhost_hlen;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
+ used_idx_round = vq->last_used_idx & (vq->size - 1);
+ vq->used->ring[used_idx_round].id = desc_chain_head;
+ vq->used->ring[used_idx_round].len = desc_chain_len;
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used, ring[used_idx_round]),
+ sizeof(vq->used->ring[used_idx_round]));
+ vq->last_used_idx++;
All this code is duplicated from the rx_mergeable base.
I think a dedicated inline function would really make sense here.
Post by Zhihong Wang
- rte_prefetch0(&vq->desc[desc_indexes[0]]);
- for (i = 0; i < count; i++) {
- uint16_t desc_idx = desc_indexes[i];
- int err;
+ return 0;
- err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
- if (unlikely(err)) {
- used_idx = (start_idx + i) & (vq->size - 1);
- vq->used->ring[used_idx].len = dev->vhost_hlen;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
+ /* rollback on any error if last_used_idx update on-the-fly */
+ if (is_mrg_rxbuf)
If (!is_mrg_rxbuf), extra_buffers will be zero, so just remove the test,
and place the line below directly under error: as explained above.
Post by Zhihong Wang
+ vq->last_used_idx -= extra_buffers;
- if (i + 1 < count)
- rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
- }
+ return 1;
+}
+static inline void __attribute__((always_inline))
+notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
rte_smp_wmb();
-
- *(volatile uint16_t *)&vq->used->idx += count;
- vq->last_used_idx += count;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
-
- /* flush used->idx update before we read avail->flags. */
+ vq->used->idx = vq->last_used_idx;
+ vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+ sizeof(vq->used->idx));
rte_mb();
-
- /* Kick the guest if necessary. */
if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
&& (vq->callfd >= 0))
eventfd_write(vq->callfd, (eventfd_t)1);
- return count;
-}
-
-static inline int
-fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
- uint32_t *allocated, uint32_t *vec_idx,
- struct buf_vector *buf_vec)
-{
- uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
- uint32_t vec_id = *vec_idx;
- uint32_t len = *allocated;
-
- while (1) {
- if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
- return -1;
-
- len += vq->desc[idx].len;
- buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
- buf_vec[vec_id].buf_len = vq->desc[idx].len;
- buf_vec[vec_id].desc_idx = idx;
- vec_id++;
-
- if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
- break;
-
- idx = vq->desc[idx].next;
- }
-
- *allocated = len;
- *vec_idx = vec_id;
-
- return 0;
-}
-
-/*
- * Returns -1 on fail, 0 on success
- */
-static inline int
-reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
- uint16_t *end, struct buf_vector *buf_vec)
-{
- uint16_t cur_idx;
- uint16_t avail_idx;
- uint32_t allocated = 0;
- uint32_t vec_idx = 0;
- uint16_t tries = 0;
-
- cur_idx = vq->last_used_idx;
-
- while (1) {
- avail_idx = *((volatile uint16_t *)&vq->avail->idx);
- if (unlikely(cur_idx == avail_idx))
- return -1;
-
- if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
- &vec_idx, buf_vec) < 0))
- return -1;
-
- cur_idx++;
- tries++;
-
- if (allocated >= size)
- break;
-
- /*
- * if we tried all available ring items, and still
- * can't get enough buf, it means something abnormal
- * happened.
- */
- if (unlikely(tries >= vq->size))
- return -1;
- }
-
- *end = cur_idx;
- return 0;
}
-static inline uint32_t __attribute__((always_inline))
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
- uint16_t end_idx, struct rte_mbuf *m,
- struct buf_vector *buf_vec)
+uint16_t
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+ struct rte_mbuf **pkts, uint16_t count)
{
- struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
- uint32_t vec_idx = 0;
- uint16_t start_idx = vq->last_used_idx;
- uint16_t cur_idx = start_idx;
- uint64_t desc_addr;
- uint32_t mbuf_offset, mbuf_avail;
- uint32_t desc_offset, desc_avail;
- uint32_t cpy_len;
- uint16_t desc_idx, used_idx;
-
- if (unlikely(m == NULL))
+ struct vhost_virtqueue *vq;
+ struct virtio_net *dev;
+ uint32_t pkt_idx = 0;
+ uint32_t pkt_left = 0;
+ uint32_t pkt_sent = 0;
+ uint32_t is_mrg_rxbuf = 0;
+ uint16_t avail_idx = 0;
+
+ /* precheck */
Comment not very informative here.
Post by Zhihong Wang
+ if (unlikely(count == 0))
return 0;
- LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
- dev->vid, cur_idx, end_idx);
+ count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
- desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
- if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
+ dev = get_device(vid);
+ if (unlikely(!dev))
return 0;
- rte_prefetch0((void *)(uintptr_t)desc_addr);
-
- virtio_hdr.num_buffers = end_idx - start_idx;
- LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
- dev->vid, virtio_hdr.num_buffers);
-
- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
- vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
- PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
- desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
- desc_offset = dev->vhost_hlen;
-
- mbuf_avail = rte_pktmbuf_data_len(m);
- mbuf_offset = 0;
- while (mbuf_avail != 0 || m->next != NULL) {
- /* done with current desc buf, get the next one */
- if (desc_avail == 0) {
- desc_idx = buf_vec[vec_idx].desc_idx;
-
- if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
- /* Update used ring with desc information */
- used_idx = cur_idx++ & (vq->size - 1);
- vq->used->ring[used_idx].id = desc_idx;
- vq->used->ring[used_idx].len = desc_offset;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used,
- ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
-
- vec_idx++;
- desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
- if (unlikely(!desc_addr))
- return 0;
-
- /* Prefetch buffer address. */
- rte_prefetch0((void *)(uintptr_t)desc_addr);
- desc_offset = 0;
- desc_avail = buf_vec[vec_idx].buf_len;
- }
-
- /* done with current mbuf, get the next one */
- if (mbuf_avail == 0) {
- m = m->next;
-
- mbuf_offset = 0;
- mbuf_avail = rte_pktmbuf_data_len(m);
- }
-
- cpy_len = RTE_MIN(desc_avail, mbuf_avail);
- rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
- rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
- cpy_len);
- vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
- cpy_len);
- PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
- cpy_len, 0);
-
- mbuf_avail -= cpy_len;
- mbuf_offset += cpy_len;
- desc_avail -= cpy_len;
- desc_offset += cpy_len;
- }
-
- used_idx = cur_idx & (vq->size - 1);
- vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
- vq->used->ring[used_idx].len = desc_offset;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
-
- return end_idx - start_idx;
-}
-
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
- struct rte_mbuf **pkts, uint32_t count)
-{
- struct vhost_virtqueue *vq;
- uint32_t pkt_idx = 0, nr_used = 0;
- uint16_t end;
- struct buf_vector buf_vec[BUF_VECTOR_MAX];
-
- LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
- if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
- RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
- dev->vid, __func__, queue_id);
+ if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb)))
return 0;
- }
vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
+ if (unlikely(!vq->enabled))
return 0;
- count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
- if (count == 0)
- return 0;
+ if (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))
+ is_mrg_rxbuf = 1;
- for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
- uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
-
- if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
- &end, buf_vec) < 0)) {
- LOG_DEBUG(VHOST_DATA,
- "(%d) failed to get enough desc from vring\n",
- dev->vid);
+ /* start enqueuing packets 1 by 1 */
+ pkt_idx = 0;
+ pkt_left = count;
+ avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+ while (1) {
+ if (loop_check(vq, avail_idx, pkt_left))
What about:
while (pkt_left && avail_idx != vq->last_used_idx) {
Post by Zhihong Wang
break;
- }
-
- nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
- pkts[pkt_idx], buf_vec);
- rte_smp_wmb();
- *(volatile uint16_t *)&vq->used->idx += nr_used;
- vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
- vq->last_used_idx += nr_used;
- }
-
- if (likely(pkt_idx)) {
- /* flush used->idx update before we read avail->flags. */
- rte_mb();
+ if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
+ is_mrg_rxbuf))
+ break;
- /* Kick the guest if necessary. */
- if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
- && (vq->callfd >= 0))
- eventfd_write(vq->callfd, (eventfd_t)1);
+ pkt_idx++;
+ pkt_sent++;
+ pkt_left--;
}
- return pkt_idx;
-}
-
-uint16_t
-rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
- struct rte_mbuf **pkts, uint16_t count)
-{
- struct virtio_net *dev = get_device(vid);
-
- if (!dev)
- return 0;
+ /* update used idx and kick the guest if necessary */
+ if (pkt_sent)
+ notify_guest(dev, vq);
- if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
- return virtio_dev_merge_rx(dev, queue_id, pkts, count);
- else
- return virtio_dev_rx(dev, queue_id, pkts, count);
+ return pkt_sent;
}
static void
Wang, Zhihong
2016-08-23 02:27:15 UTC
Permalink
Hi Maxime,

Thanks very much for the detailed review.
-----Original Message-----
Sent: Monday, August 22, 2016 5:36 PM
Subject: Re: [PATCH v3 1/5] vhost: rewrite enqueue
Post by Zhihong Wang
This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.
---
1. Rewrite enqueue and delete the obsolete in the same patch.
---
lib/librte_vhost/vhost_rxtx.c | 537 +++++++++++++-----------------------------
1 file changed, 160 insertions(+), 377 deletions(-)
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 08a73fd..b09a9c3 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t
qp_nb)
Post by Zhihong Wang
return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
}
-static void
+static inline void __attribute__((always_inline))
virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr
*net_hdr)
Post by Zhihong Wang
{
if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -125,427 +125,210 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf,
struct virtio_net_hdr *net_hdr)
Post by Zhihong Wang
}
}
-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
- struct virtio_net_hdr_mrg_rxbuf hdr)
+static inline uint32_t __attribute__((always_inline))
+loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, uint32_t pkt_left)
Creating a function just for doing this doesn't make much sense.
And the function name doesn't help.
I think you should just remove this function.
Okay.
Post by Zhihong Wang
{
- if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
- *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
- else
- *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+ if (pkt_left == 0 || avail_idx == vq->last_used_idx)
+ return 1;
+
+ return 0;
}
-static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
- struct rte_mbuf *m, uint16_t desc_idx)
+static inline uint32_t __attribute__((always_inline))
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint16_t avail_idx, struct rte_mbuf *mbuf,
+ uint32_t is_mrg_rxbuf)
{
- uint32_t desc_avail, desc_offset;
- uint32_t mbuf_avail, mbuf_offset;
- uint32_t cpy_len;
+ struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
struct vring_desc *desc;
- uint64_t desc_addr;
- struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-
- desc = &vq->desc[desc_idx];
+ uint64_t desc_addr = 0;
+ uint32_t desc_chain_head = 0;
+ uint32_t desc_chain_len = 0;
+ uint32_t desc_current = 0;
+ uint32_t desc_offset = 0;
+ uint32_t mbuf_len = 0;
+ uint32_t mbuf_avail = 0;
+ uint32_t copy_len = 0;
+ uint32_t extra_buffers = 0;
+ uint32_t used_idx_round = 0;
Most of these variables don't need to be initialized.
Okay.
Post by Zhihong Wang
+
+ /* start with the first mbuf of the packet */
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_avail = mbuf_len;
+
+ /* get the current desc */
+ desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc = &vq->desc[desc_current];
desc_addr = gpa_to_vva(dev, desc->addr);
- /*
- * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
- * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
- * otherwise stores offset on the stack instead of in a register.
- */
- if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
- return -1;
-
- rte_prefetch0((void *)(uintptr_t)desc_addr);
+ if (unlikely(!desc_addr))
+ goto error;
- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
+ /* handle virtio header */
+ virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
+ virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
Parenthesis around virtio_hdr->hdr shouldn't be needed.
Post by Zhihong Wang
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
- PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
Looks like you remove the PRINT_PACKET calls.
Does it impact performance?
In any case, it should be mentioned in the commit message.
Will add this.
Post by Zhihong Wang
-
desc_offset = dev->vhost_hlen;
- desc_avail = desc->len - dev->vhost_hlen;
+ desc_chain_len = desc_offset;
+ desc_addr += desc_offset;
+ if (is_mrg_rxbuf)
+ virtio_hdr->num_buffers = 1;
- mbuf_avail = rte_pktmbuf_data_len(m);
- mbuf_offset = 0;
- while (mbuf_avail != 0 || m->next != NULL) {
- /* done with current mbuf, fetch next */
- if (mbuf_avail == 0) {
- m = m->next;
-
- mbuf_offset = 0;
- mbuf_avail = rte_pktmbuf_data_len(m);
+ /* start copy from mbuf to desc */
+ while (1) {
while (mbuf_avail || mbuf->next) ?
Will rewrite this logic.
Compiler should optimize this properly, no?
Post by Zhihong Wang
+ /* get the next mbuf if the current done */
+ if (!mbuf_avail) {
+ if (mbuf->next) {
+ mbuf = mbuf->next;
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_avail = mbuf_len;
+ } else
+ break;
}
- /* done with current desc buf, fetch next */
- if (desc_avail == 0) {
- if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
- /* Room in vring buffer is not enough */
- return -1;
- }
- if (unlikely(desc->next >= vq->size))
- return -1;
-
- desc = &vq->desc[desc->next];
- desc_addr = gpa_to_vva(dev, desc->addr);
- if (unlikely(!desc_addr))
- return -1;
-
- desc_offset = 0;
- desc_avail = desc->len;
+ /* get the next desc if the current done */
+ if (desc->len <= desc_offset) {
+ if (desc->flags & VRING_DESC_F_NEXT) {
+ /* go on with the current desc chain */
+ desc_offset = 0;
+ desc_current = desc->next;
+ desc = &vq->desc[desc_current];
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ goto rollback;
you could goto directly to error, and decrement last_used_idx directly
under "error"'s goto since extra_buffers will be zero otherwise.
Good call.
Also, except desc_current affectation, all the above code is common
with mergeable case, so you should avoid duplication.
Post by Zhihong Wang
+ } else if (is_mrg_rxbuf) {
+ /* start with the next desc chain */
+ used_idx_round = vq->last_used_idx
+ & (vq->size - 1);
+ vq->used->ring[used_idx_round].id =
+ desc_chain_head;
+ vq->used->ring[used_idx_round].len =
+ desc_chain_len;
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_round]),
+ sizeof(vq->used->ring[
+ used_idx_round]));
+ vq->last_used_idx++;
+ extra_buffers++;
+ virtio_hdr->num_buffers++;
+ if (avail_idx == vq->last_used_idx)
+ goto rollback;
+
+ desc_current =
+ vq->avail->ring[(vq->last_used_idx) &
+ (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc = &vq->desc[desc_current];
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ goto rollback;
+
+ desc_chain_len = 0;
+ desc_offset = 0;
+ } else
+ goto rollback;
}
- cpy_len = RTE_MIN(desc_avail, mbuf_avail);
- rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
- rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
- cpy_len);
- vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
- PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
- cpy_len, 0);
-
- mbuf_avail -= cpy_len;
- mbuf_offset += cpy_len;
- desc_avail -= cpy_len;
- desc_offset += cpy_len;
- }
-
- return 0;
-}
-
-/**
- * This function adds buffers to the virtio devices RX virtqueue. Buffers can
- * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that are succesfully
- * added to the RX queue. This function works when the mbuf is scattered, but
- * it doesn't support the mergeable feature.
- */
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
- struct rte_mbuf **pkts, uint32_t count)
-{
- struct vhost_virtqueue *vq;
- uint16_t avail_idx, free_entries, start_idx;
- uint16_t desc_indexes[MAX_PKT_BURST];
- uint16_t used_idx;
- uint32_t i;
-
- LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
- if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
- RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
- dev->vid, __func__, queue_id);
- return 0;
+ /* copy mbuf data */
+ copy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+ rte_memcpy((void *)(uintptr_t)desc_addr,
+ rte_pktmbuf_mtod_offset(mbuf, void *,
+ mbuf_len - mbuf_avail),
+ copy_len);
+ vhost_log_write(dev, desc->addr + desc_offset, copy_len);
+ mbuf_avail -= copy_len;
+ desc_offset += copy_len;
+ desc_addr += copy_len;
+ desc_chain_len += copy_len;
}
- vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
- return 0;
-
- avail_idx = *((volatile uint16_t *)&vq->avail->idx);
- start_idx = vq->last_used_idx;
- free_entries = avail_idx - start_idx;
- count = RTE_MIN(count, free_entries);
- count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
- if (count == 0)
- return 0;
-
- LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
- dev->vid, start_idx, start_idx + count);
-
- /* Retrieve all of the desc indexes first to avoid caching issues. */
- rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
- for (i = 0; i < count; i++) {
- used_idx = (start_idx + i) & (vq->size - 1);
- desc_indexes[i] = vq->avail->ring[used_idx];
- vq->used->ring[used_idx].id = desc_indexes[i];
- vq->used->ring[used_idx].len = pkts[i]->pkt_len +
- dev->vhost_hlen;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
+ used_idx_round = vq->last_used_idx & (vq->size - 1);
+ vq->used->ring[used_idx_round].id = desc_chain_head;
+ vq->used->ring[used_idx_round].len = desc_chain_len;
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used, ring[used_idx_round]),
+ sizeof(vq->used->ring[used_idx_round]));
+ vq->last_used_idx++;
All this code is duplicated from the rx_mergeable base.
I think a dedicated inline function would really make sense here.
Good catch. Will make a function for this.
Post by Zhihong Wang
- rte_prefetch0(&vq->desc[desc_indexes[0]]);
- for (i = 0; i < count; i++) {
- uint16_t desc_idx = desc_indexes[i];
- int err;
+ return 0;
- err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
- if (unlikely(err)) {
- used_idx = (start_idx + i) & (vq->size - 1);
- vq->used->ring[used_idx].len = dev->vhost_hlen;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
+ /* rollback on any error if last_used_idx update on-the-fly */
+ if (is_mrg_rxbuf)
If (!is_mrg_rxbuf), extra_buffers will be zero, so just remove the test,
and place belw line directly under error: as explained above.
Sure. Thanks.
Post by Zhihong Wang
+ vq->last_used_idx -= extra_buffers;
- if (i + 1 < count)
- rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
- }
+ return 1;
+}
+static inline void __attribute__((always_inline))
+notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
rte_smp_wmb();
-
- *(volatile uint16_t *)&vq->used->idx += count;
- vq->last_used_idx += count;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
-
- /* flush used->idx update before we read avail->flags. */
+ vq->used->idx = vq->last_used_idx;
+ vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+ sizeof(vq->used->idx));
rte_mb();
-
- /* Kick the guest if necessary. */
if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
&& (vq->callfd >= 0))
eventfd_write(vq->callfd, (eventfd_t)1);
- return count;
-}
-
-static inline int
-fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
- uint32_t *allocated, uint32_t *vec_idx,
- struct buf_vector *buf_vec)
-{
- uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
- uint32_t vec_id = *vec_idx;
- uint32_t len = *allocated;
-
- while (1) {
- if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
- return -1;
-
- len += vq->desc[idx].len;
- buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
- buf_vec[vec_id].buf_len = vq->desc[idx].len;
- buf_vec[vec_id].desc_idx = idx;
- vec_id++;
-
- if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
- break;
-
- idx = vq->desc[idx].next;
- }
-
- *allocated = len;
- *vec_idx = vec_id;
-
- return 0;
-}
-
-/*
- * Returns -1 on fail, 0 on success
- */
-static inline int
-reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
- uint16_t *end, struct buf_vector *buf_vec)
-{
- uint16_t cur_idx;
- uint16_t avail_idx;
- uint32_t allocated = 0;
- uint32_t vec_idx = 0;
- uint16_t tries = 0;
-
- cur_idx = vq->last_used_idx;
-
- while (1) {
- avail_idx = *((volatile uint16_t *)&vq->avail->idx);
- if (unlikely(cur_idx == avail_idx))
- return -1;
-
- if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
- &vec_idx, buf_vec) < 0))
- return -1;
-
- cur_idx++;
- tries++;
-
- if (allocated >= size)
- break;
-
- /*
- * if we tried all available ring items, and still
- * can't get enough buf, it means something abnormal
- * happened.
- */
- if (unlikely(tries >= vq->size))
- return -1;
- }
-
- *end = cur_idx;
- return 0;
}
-static inline uint32_t __attribute__((always_inline))
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct
vhost_virtqueue *vq,
Post by Zhihong Wang
- uint16_t end_idx, struct rte_mbuf *m,
- struct buf_vector *buf_vec)
+uint16_t
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+ struct rte_mbuf **pkts, uint16_t count)
{
- struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
- uint32_t vec_idx = 0;
- uint16_t start_idx = vq->last_used_idx;
- uint16_t cur_idx = start_idx;
- uint64_t desc_addr;
- uint32_t mbuf_offset, mbuf_avail;
- uint32_t desc_offset, desc_avail;
- uint32_t cpy_len;
- uint16_t desc_idx, used_idx;
-
- if (unlikely(m == NULL))
+ struct vhost_virtqueue *vq;
+ struct virtio_net *dev;
+ uint32_t pkt_idx = 0;
+ uint32_t pkt_left = 0;
+ uint32_t pkt_sent = 0;
+ uint32_t is_mrg_rxbuf = 0;
+ uint16_t avail_idx = 0;
+
+ /* precheck */
Comment not very informative here.
Okay.
Post by Zhihong Wang
+ if (unlikely(count == 0))
return 0;
- LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
- dev->vid, cur_idx, end_idx);
+ count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
- desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
- if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
+ dev = get_device(vid);
+ if (unlikely(!dev))
return 0;
- rte_prefetch0((void *)(uintptr_t)desc_addr);
-
- virtio_hdr.num_buffers = end_idx - start_idx;
- LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
- dev->vid, virtio_hdr.num_buffers);
-
- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
- vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
- PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
- desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
- desc_offset = dev->vhost_hlen;
-
- mbuf_avail = rte_pktmbuf_data_len(m);
- mbuf_offset = 0;
- while (mbuf_avail != 0 || m->next != NULL) {
- /* done with current desc buf, get the next one */
- if (desc_avail == 0) {
- desc_idx = buf_vec[vec_idx].desc_idx;
-
- if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
- /* Update used ring with desc information */
- used_idx = cur_idx++ & (vq->size - 1);
- vq->used->ring[used_idx].id = desc_idx;
- vq->used->ring[used_idx].len = desc_offset;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used,
- ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
-
- vec_idx++;
- desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
- if (unlikely(!desc_addr))
- return 0;
-
- /* Prefetch buffer address. */
- rte_prefetch0((void *)(uintptr_t)desc_addr);
- desc_offset = 0;
- desc_avail = buf_vec[vec_idx].buf_len;
- }
-
- /* done with current mbuf, get the next one */
- if (mbuf_avail == 0) {
- m = m->next;
-
- mbuf_offset = 0;
- mbuf_avail = rte_pktmbuf_data_len(m);
- }
-
- cpy_len = RTE_MIN(desc_avail, mbuf_avail);
- rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
- rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
- cpy_len);
- vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
- cpy_len);
- PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
- cpy_len, 0);
-
- mbuf_avail -= cpy_len;
- mbuf_offset += cpy_len;
- desc_avail -= cpy_len;
- desc_offset += cpy_len;
- }
-
- used_idx = cur_idx & (vq->size - 1);
- vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
- vq->used->ring[used_idx].len = desc_offset;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
-
- return end_idx - start_idx;
-}
-
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
- struct rte_mbuf **pkts, uint32_t count)
-{
- struct vhost_virtqueue *vq;
- uint32_t pkt_idx = 0, nr_used = 0;
- uint16_t end;
- struct buf_vector buf_vec[BUF_VECTOR_MAX];
-
- LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
- if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
- RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
- dev->vid, __func__, queue_id);
+ if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb)))
return 0;
- }
vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
+ if (unlikely(!vq->enabled))
return 0;
- count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
- if (count == 0)
- return 0;
+ if (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))
+ is_mrg_rxbuf = 1;
- for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
- uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
-
- if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
- &end, buf_vec) < 0)) {
- LOG_DEBUG(VHOST_DATA,
- "(%d) failed to get enough desc from vring\n",
- dev->vid);
+ /* start enqueuing packets 1 by 1 */
+ pkt_idx = 0;
+ pkt_left = count;
+ avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+ while (1) {
+ if (loop_check(vq, avail_idx, pkt_left))
while (pkt_left && avail_idx != vq->last_used_idx) {
Will rewrite it.
Post by Zhihong Wang
break;
- }
-
- nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
- pkts[pkt_idx], buf_vec);
- rte_smp_wmb();
- *(volatile uint16_t *)&vq->used->idx += nr_used;
- vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
- vq->last_used_idx += nr_used;
- }
-
- if (likely(pkt_idx)) {
- /* flush used->idx update before we read avail->flags. */
- rte_mb();
+ if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
+ is_mrg_rxbuf))
+ break;
- /* Kick the guest if necessary. */
- if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
- && (vq->callfd >= 0))
- eventfd_write(vq->callfd, (eventfd_t)1);
+ pkt_idx++;
+ pkt_sent++;
+ pkt_left--;
}
- return pkt_idx;
-}
-
-uint16_t
-rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
- struct rte_mbuf **pkts, uint16_t count)
-{
- struct virtio_net *dev = get_device(vid);
-
- if (!dev)
- return 0;
+ /* update used idx and kick the guest if necessary */
+ if (pkt_sent)
+ notify_guest(dev, vq);
- if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
- return virtio_dev_merge_rx(dev, queue_id, pkts, count);
- else
- return virtio_dev_rx(dev, queue_id, pkts, count);
+ return pkt_sent;
}
static void
Yuanhan Liu
2016-08-25 04:00:45 UTC
Permalink
Post by Maxime Coquelin
Post by Zhihong Wang
- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
+ /* handle virtio header */
+ virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
+ virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
Parenthesis around virtio_hdr->hdr shouldn't be needed.
Post by Zhihong Wang
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
- PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
Looks like you remove the PRINT_PACKET calls.
Does it impact performance?
Yes, it does. But it's only enabled for debug mode. Besides that,
it's just a NOOP.
Post by Maxime Coquelin
In any case, it should be mentioned in the commit message.
Agreed. But for this case, we should not remove it: it breaks the
debug-ability.

--yliu
Zhihong Wang
2016-08-19 05:43:49 UTC
Permalink
This patch enables batch update of the used ring for better efficiency.

Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/vhost-net.h | 4 +++
lib/librte_vhost/vhost_rxtx.c | 68 +++++++++++++++++++++++++++++++++----------
lib/librte_vhost/virtio-net.c | 15 ++++++++--
3 files changed, 68 insertions(+), 19 deletions(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 51fdf3d..a15182c 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -85,6 +85,10 @@ struct vhost_virtqueue {

/* Physical address of used ring, for logging */
uint64_t log_guest_addr;
+
+ /* Shadow used ring for performance */
+ struct vring_used_elem *shadow_used_ring;
+ uint32_t shadow_used_idx;
} __rte_cache_aligned;

/* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 7523b2d..c4abaf1 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -155,7 +155,6 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint32_t mbuf_avail = 0;
uint32_t copy_len = 0;
uint32_t extra_buffers = 0;
- uint32_t used_idx_round = 0;

/* start with the first mbuf of the packet */
mbuf_len = rte_pktmbuf_data_len(mbuf);
@@ -203,17 +202,11 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
goto rollback;
} else if (is_mrg_rxbuf) {
/* start with the next desc chain */
- used_idx_round = vq->last_used_idx
- & (vq->size - 1);
- vq->used->ring[used_idx_round].id =
+ vq->shadow_used_ring[vq->shadow_used_idx].id =
desc_chain_head;
- vq->used->ring[used_idx_round].len =
+ vq->shadow_used_ring[vq->shadow_used_idx].len =
desc_chain_len;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used,
- ring[used_idx_round]),
- sizeof(vq->used->ring[
- used_idx_round]));
+ vq->shadow_used_idx++;
vq->last_used_idx++;
extra_buffers++;
virtio_hdr->num_buffers++;
@@ -248,12 +241,9 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
desc_chain_len += copy_len;
}

- used_idx_round = vq->last_used_idx & (vq->size - 1);
- vq->used->ring[used_idx_round].id = desc_chain_head;
- vq->used->ring[used_idx_round].len = desc_chain_len;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx_round]),
- sizeof(vq->used->ring[used_idx_round]));
+ vq->shadow_used_ring[vq->shadow_used_idx].id = desc_chain_head;
+ vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
+ vq->shadow_used_idx++;
vq->last_used_idx++;

return 0;
@@ -268,6 +258,45 @@ error:
}

static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t used_idx_start)
+{
+ if (used_idx_start + vq->shadow_used_idx < vq->size) {
+ rte_memcpy(&vq->used->ring[used_idx_start],
+ &vq->shadow_used_ring[0],
+ vq->shadow_used_idx *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_start]),
+ vq->shadow_used_idx *
+ sizeof(struct vring_used_elem));
+ } else {
+ uint32_t part_1 = vq->size - used_idx_start;
+ uint32_t part_2 = vq->shadow_used_idx - part_1;
+
+ rte_memcpy(&vq->used->ring[used_idx_start],
+ &vq->shadow_used_ring[0],
+ part_1 *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_start]),
+ part_1 *
+ sizeof(struct vring_used_elem));
+ rte_memcpy(&vq->used->ring[0],
+ &vq->shadow_used_ring[part_1],
+ part_2 *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[0]),
+ part_2 *
+ sizeof(struct vring_used_elem));
+ }
+}
+
+static inline void __attribute__((always_inline))
notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
{
rte_smp_wmb();
@@ -286,6 +315,7 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
{
struct vhost_virtqueue *vq;
struct virtio_net *dev;
+ uint32_t used_idx_start = 0;
uint32_t pkt_idx = 0;
uint32_t pkt_left = 0;
uint32_t pkt_sent = 0;
@@ -315,6 +345,8 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
/* start enqueuing packets 1 by 1 */
pkt_idx = 0;
pkt_left = count;
+ vq->shadow_used_idx = 0;
+ used_idx_start = vq->last_used_idx & (vq->size - 1);
avail_idx = *((volatile uint16_t *)&vq->avail->idx);
while (1) {
if (loop_check(vq, avail_idx, pkt_left))
@@ -329,6 +361,10 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
pkt_left--;
}

+ /* batch update used ring for better performance */
+ if (likely(vq->shadow_used_idx > 0))
+ update_used_ring(dev, vq, used_idx_start);
+
/* update used idx and kick the guest if necessary */
if (pkt_sent)
notify_guest(dev, vq);
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
index 1785695..87d09fa 100644
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -152,10 +152,14 @@ cleanup_device(struct virtio_net *dev, int destroy)
static void
free_device(struct virtio_net *dev)
{
+ struct vhost_virtqueue *vq;
uint32_t i;

- for (i = 0; i < dev->virt_qp_nb; i++)
- rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+ for (i = 0; i < dev->virt_qp_nb; i++) {
+ vq = dev->virtqueue[i * VIRTIO_QNUM];
+ rte_free(vq->shadow_used_ring);
+ rte_free(vq);
+ }

rte_free(dev);
}
@@ -418,13 +422,18 @@ int
vhost_set_vring_num(int vid, struct vhost_vring_state *state)
{
struct virtio_net *dev;
+ struct vhost_virtqueue *vq;

dev = get_device(vid);
if (dev == NULL)
return -1;

/* State->index refers to the queue index. The txq is 1, rxq is 0. */
- dev->virtqueue[state->index]->size = state->num;
+ vq = dev->virtqueue[state->index];
+ vq->size = state->num;
+ vq->shadow_used_ring = rte_malloc("",
+ vq->size * sizeof(struct vring_used_elem),
+ RTE_CACHE_LINE_SIZE);

return 0;
}
--
2.7.4
Yuanhan Liu
2016-08-25 03:48:00 UTC
Permalink
Post by Zhihong Wang
This patch enables batch update of the used ring for better efficiency.
...
Post by Zhihong Wang
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
index 1785695..87d09fa 100644
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -152,10 +152,14 @@ cleanup_device(struct virtio_net *dev, int destroy)
static void
free_device(struct virtio_net *dev)
{
+ struct vhost_virtqueue *vq;
uint32_t i;
- for (i = 0; i < dev->virt_qp_nb; i++)
- rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+ for (i = 0; i < dev->virt_qp_nb; i++) {
+ vq = dev->virtqueue[i * VIRTIO_QNUM];
+ rte_free(vq->shadow_used_ring);
+ rte_free(vq);
+ }
rte_free(dev);
}
@@ -418,13 +422,18 @@ int
vhost_set_vring_num(int vid, struct vhost_vring_state *state)
{
struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
dev = get_device(vid);
if (dev == NULL)
return -1;
/* State->index refers to the queue index. The txq is 1, rxq is 0. */
- dev->virtqueue[state->index]->size = state->num;
+ vq = dev->virtqueue[state->index];
+ vq->size = state->num;
+ vq->shadow_used_ring = rte_malloc("",
+ vq->size * sizeof(struct vring_used_elem),
+ RTE_CACHE_LINE_SIZE);
Few notes here:

- I think the typical way to not specific a string type is using NULL,
but not "".

- You should check the return value of rte_malloc: it could fail.

- Note that free_device() is invoked only when the vhost-user connection
is broken (say the guest is halt). However, vhost_set_vring_num() could
be invoked many times for a connection, say when you restart testpmd
many times. This would lead to memory leak.

The right way is to free it on get_vring_base().

--yliu
Wang, Zhihong
2016-08-25 05:19:07 UTC
Permalink
-----Original Message-----
Sent: Thursday, August 25, 2016 11:48 AM
Subject: Re: [PATCH v3 4/5] vhost: batch update used ring
Post by Zhihong Wang
This patch enables batch update of the used ring for better efficiency.
...
Post by Zhihong Wang
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
index 1785695..87d09fa 100644
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -152,10 +152,14 @@ cleanup_device(struct virtio_net *dev, int
destroy)
Post by Zhihong Wang
static void
free_device(struct virtio_net *dev)
{
+ struct vhost_virtqueue *vq;
uint32_t i;
- for (i = 0; i < dev->virt_qp_nb; i++)
- rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+ for (i = 0; i < dev->virt_qp_nb; i++) {
+ vq = dev->virtqueue[i * VIRTIO_QNUM];
+ rte_free(vq->shadow_used_ring);
+ rte_free(vq);
+ }
rte_free(dev);
}
@@ -418,13 +422,18 @@ int
vhost_set_vring_num(int vid, struct vhost_vring_state *state)
{
struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
dev = get_device(vid);
if (dev == NULL)
return -1;
/* State->index refers to the queue index. The txq is 1, rxq is 0. */
- dev->virtqueue[state->index]->size = state->num;
+ vq = dev->virtqueue[state->index];
+ vq->size = state->num;
+ vq->shadow_used_ring = rte_malloc("",
+ vq->size * sizeof(struct vring_used_elem),
+ RTE_CACHE_LINE_SIZE);
- I think the typical way to not specific a string type is using NULL,
but not "".
- You should check the return value of rte_malloc: it could fail.
- Note that free_device() is invoked only when the vhost-user connection
is broken (say the guest is halt). However, vhost_set_vring_num() could
be invoked many times for a connection, say when you restart testpmd
many times. This would lead to memory leak.
The right way is to free it on get_vring_base().
Good catch! Thanks.
--yliu
Zhihong Wang
2016-08-19 05:43:48 UTC
Permalink
This patch adds descriptor prefetch to hide cache access latency.

Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/vhost_rxtx.c | 5 +++++
1 file changed, 5 insertions(+)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index b09a9c3..7523b2d 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -131,6 +131,11 @@ loop_check(struct vhost_virtqueue *vq, uint16_t avail_idx, uint32_t pkt_left)
if (pkt_left == 0 || avail_idx == vq->last_used_idx)
return 1;

+ /* prefetch the next desc */
+ if (pkt_left > 1 && avail_idx != vq->last_used_idx + 1)
+ rte_prefetch0(&vq->desc[vq->avail->ring[
+ (vq->last_used_idx + 1) & (vq->size - 1)]]);
+
return 0;
}
--
2.7.4
Zhihong Wang
2016-08-19 05:43:50 UTC
Permalink
This patch reorders the code to delay virtio header write to optimize cache
access efficiency for cases where the mrg_rxbuf feature is turned on. It
reduces CPU pipeline stall cycles significantly.

---
Changes in v3:

1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

2. Rename variables to follow naming convention.

Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/vhost_rxtx.c | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index c4abaf1..e3ba4e0 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -154,6 +154,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint32_t mbuf_len = 0;
uint32_t mbuf_avail = 0;
uint32_t copy_len = 0;
+ uint32_t copy_virtio_hdr = 0;
uint32_t extra_buffers = 0;

/* start with the first mbuf of the packet */
@@ -168,15 +169,16 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
if (unlikely(!desc_addr))
goto error;

- /* handle virtio header */
+ /*
+ * handle virtio header, the actual write operation
+ * is delayed for cache optimization.
+ */
virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
- virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+ copy_virtio_hdr = 1;
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
desc_offset = dev->vhost_hlen;
desc_chain_len = desc_offset;
desc_addr += desc_offset;
- if (is_mrg_rxbuf)
- virtio_hdr->num_buffers = 1;

/* start copy from mbuf to desc */
while (1) {
@@ -228,8 +230,15 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
goto rollback;
}

- /* copy mbuf data */
+ /* copy virtio header and mbuf data */
copy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+ if (copy_virtio_hdr) {
+ copy_virtio_hdr = 0;
+ virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+ if (is_mrg_rxbuf)
+ virtio_hdr->num_buffers = extra_buffers + 1;
+ }
+
rte_memcpy((void *)(uintptr_t)desc_addr,
rte_pktmbuf_mtod_offset(mbuf, void *,
mbuf_len - mbuf_avail),
--
2.7.4
Maxime Coquelin
2016-08-22 08:11:13 UTC
Permalink
Hi Zhihong,
Post by Zhihong Wang
This patch set optimizes the vhost enqueue function.
It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
* For fast frontends (eg. DPDK virtio pmd), higher performance (maximum
throughput) can be achieved.
* For slow frontends (eg. kernel virtio-net), better scalability can be
achieved, each vhost core can support more connections since it takes
less cycles to handle each single frontend.
1. Reorder code to reduce CPU pipeline stall cycles.
2. Batch update the used ring for better efficiency.
3. Prefetch descriptor to hide cache latency.
4. Remove useless volatile attribute to allow compiler optimization.
Thanks for these details, this is helpful to understand where the perf
gain comes from.
I would suggest to add these information as comments in the code
where/if it makes sense. If more a general comment, at least add it in
the commit message of the patch introducing it.
Indeed, adding it to the cover letter is fine, but the information is
lost as soon as the series is applied.

You don't mention any figures, so I set up a benchmark on my side to
evaluate your series. It indeed shows an interesting performance gain.

My setup consists of one host running a guest.
The guest generates as much 64bytes packets as possible using
pktgen-dpdk. The hosts forwards received packets back to the guest
using testpmd on vhost pmd interface. Guest's vCPUs are pinned to
physical CPUs.

I tested it with and without your v1 patch, with and without
rx-mergeable feature turned ON.
Results are the average of 8 runs of 60 seconds:

Rx-Mergeable ON : 7.72Mpps
Rx-Mergeable ON + "vhost: optimize enqueue" v1: 9.19Mpps
Rx-Mergeable OFF: 10.52Mpps
Rx-Mergeable OFF + "vhost: optimize enqueue" v1: 10.60Mpps

Regards,
Maxime
Maxime Coquelin
2016-08-22 10:01:47 UTC
Permalink
Post by Maxime Coquelin
Hi Zhihong,
Post by Zhihong Wang
This patch set optimizes the vhost enqueue function.
It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
* For fast frontends (eg. DPDK virtio pmd), higher performance (maximum
throughput) can be achieved.
* For slow frontends (eg. kernel virtio-net), better scalability can be
achieved, each vhost core can support more connections since it takes
less cycles to handle each single frontend.
1. Reorder code to reduce CPU pipeline stall cycles.
2. Batch update the used ring for better efficiency.
3. Prefetch descriptor to hide cache latency.
4. Remove useless volatile attribute to allow compiler optimization.
Thanks for these details, this is helpful to understand where the perf
gain comes from.
I would suggest to add these information as comments in the code
where/if it makes sense. If more a general comment, at least add it in
the commit message of the patch introducing it.
Indeed, adding it to the cover letter is fine, but the information is
lost as soon as the series is applied.
You don't mention any figures, so I set up a benchmark on my side to
evaluate your series. It indeed shows an interesting performance gain.
My setup consists of one host running a guest.
The guest generates as much 64bytes packets as possible using
pktgen-dpdk. The hosts forwards received packets back to the guest
using testpmd on vhost pmd interface. Guest's vCPUs are pinned to
physical CPUs.
I tested it with and without your v1 patch, with and without
rx-mergeable feature turned ON.
Rx-Mergeable ON : 7.72Mpps
Rx-Mergeable ON + "vhost: optimize enqueue" v1: 9.19Mpps
Rx-Mergeable OFF: 10.52Mpps
Rx-Mergeable OFF + "vhost: optimize enqueue" v1: 10.60Mpps
I forgot to add that before this series, I think we should first fix the windows bug.
Else we will need a dedicated fix for the stable branch.

Regards,
Maxime
Thomas Monjalon
2016-08-22 10:35:01 UTC
Permalink
Post by Maxime Coquelin
I forgot to add that before this series, I think we should first fix the windows bug.
Else we will need a dedicated fix for the stable branch.
This is a funny situation :)
If Zhihong had reworked the code without mentioning it is fixing a scenario
with Windows guests, maybe nobody would have noticed ;)
That's probably why it is not written in v2/v3. But thanks to the v1,
we all know it:
"It also fixes the issue working with Windows VMs."

So yes, it would be a lot better to find the root cause and try to have a
minimal fix for 16.07, then rework the code for performance in 16.11.
I think we must avoid silent fixes, and even more, avoid writing specific
fixes for stable branches without validating them in the master branch and
its large users base.

Thanks for your good works guys, DPDK vhost is improving very well.
Wang, Zhihong
2016-08-24 03:37:03 UTC
Permalink
-----Original Message-----
Sent: Monday, August 22, 2016 6:35 PM
Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
Post by Maxime Coquelin
I forgot to add that before this series, I think we should first fix the
windows bug.
Post by Maxime Coquelin
Else we will need a dedicated fix for the stable branch.
This is a funny situation :)
If Zhihong had reworked the code without mentioning it is fixing a scenario
with Windows guests, maybe that nobody would have notice ;) That's
"It also fixes the issue working with Windows VMs."
I thought it'd be more appropriate to send a dedicated fix for stable branch.
So I removed this info.
So yes, it would be a lot better to find the root cause and try to have a
minimal fix for 16.07, then rework the code for performance in 16.11.
I think we must avoid silent fixes, and even more, avoid writing specific fixes
for stable branches without validating them in the master branch and its large
users base.
Okay, that's also what Maxime and Yuanhan suggest.

BTW the root cause has been identified and fix will be in v4.
Thanks for your good works guys, DPDK vhost is improving very well.
Wang, Zhihong
2016-08-23 02:31:29 UTC
Permalink
-----Original Message-----
Sent: Monday, August 22, 2016 6:02 PM
Subject: Re: [PATCH v3 0/5] vhost: optimize enqueue
Post by Maxime Coquelin
Hi Zhihong,
Post by Zhihong Wang
This patch set optimizes the vhost enqueue function.
It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
* For fast frontends (eg. DPDK virtio pmd), higher performance
(maximum
Post by Maxime Coquelin
Post by Zhihong Wang
throughput) can be achieved.
* For slow frontends (eg. kernel virtio-net), better scalability can be
achieved, each vhost core can support more connections since it takes
less cycles to handle each single frontend.
1. Reorder code to reduce CPU pipeline stall cycles.
2. Batch update the used ring for better efficiency.
3. Prefetch descriptor to hide cache latency.
4. Remove useless volatile attribute to allow compiler optimization.
Thanks for these details, this is helpful to understand where the perf
gain comes from.
I would suggest to add these information as comments in the code
where/if it makes sense. If more a general comment, at least add it in
the commit message of the patch introducing it.
Indeed, adding it to the cover letter is fine, but the information is
lost as soon as the series is applied.
You don't mention any figures, so I set up a benchmark on my side to
evaluate your series. It indeed shows an interesting performance gain.
My setup consists of one host running a guest.
The guest generates as much 64bytes packets as possible using
pktgen-dpdk. The hosts forwards received packets back to the guest
using testpmd on vhost pmd interface. Guest's vCPUs are pinned to
physical CPUs.
I tested it with and without your v1 patch, with and without
rx-mergeable feature turned ON.
Rx-Mergeable ON : 7.72Mpps
Rx-Mergeable ON + "vhost: optimize enqueue" v1: 9.19Mpps
Rx-Mergeable OFF: 10.52Mpps
Rx-Mergeable OFF + "vhost: optimize enqueue" v1: 10.60Mpps
I forgot to add that before this series, I think we should first fix the windows bug.
Else we will need a dedicated fix for the stable branch.
Okay I'll try to fix it, though I can't make any promises at present.

Have tried once but stopped since we don't have enough debug info from the
frontend side so basically I was debugging the backend based on guesses.
Regards
Wang, Zhihong
2016-08-23 10:43:36 UTC
Permalink
-----Original Message-----
From: Wang, Zhihong
Sent: Tuesday, August 23, 2016 10:31 AM
Subject: RE: [PATCH v3 0/5] vhost: optimize enqueue
-----Original Message-----
Sent: Monday, August 22, 2016 6:02 PM
Subject: Re: [PATCH v3 0/5] vhost: optimize enqueue
Post by Maxime Coquelin
Hi Zhihong,
Post by Zhihong Wang
This patch set optimizes the vhost enqueue function.
It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
* For fast frontends (eg. DPDK virtio pmd), higher performance
(maximum
Post by Maxime Coquelin
Post by Zhihong Wang
throughput) can be achieved.
* For slow frontends (eg. kernel virtio-net), better scalability can be
achieved, each vhost core can support more connections since it takes
less cycles to handle each single frontend.
1. Reorder code to reduce CPU pipeline stall cycles.
2. Batch update the used ring for better efficiency.
3. Prefetch descriptor to hide cache latency.
4. Remove useless volatile attribute to allow compiler optimization.
Thanks for these details, this is helpful to understand where the perf
gain comes from.
I would suggest to add these information as comments in the code
where/if it makes sense. If more a general comment, at least add it in
the commit message of the patch introducing it.
Indeed, adding it to the cover letter is fine, but the information is
lost as soon as the series is applied.
You don't mention any figures, so I set up a benchmark on my side to
evaluate your series. It indeed shows an interesting performance gain.
My setup consists of one host running a guest.
The guest generates as much 64bytes packets as possible using
pktgen-dpdk. The hosts forwards received packets back to the guest
using testpmd on vhost pmd interface. Guest's vCPUs are pinned to
physical CPUs.
I tested it with and without your v1 patch, with and without
rx-mergeable feature turned ON.
Rx-Mergeable ON : 7.72Mpps
Rx-Mergeable ON + "vhost: optimize enqueue" v1: 9.19Mpps
Rx-Mergeable OFF: 10.52Mpps
Rx-Mergeable OFF + "vhost: optimize enqueue" v1: 10.60Mpps
I forgot to add that before this series, I think we should first fix the windows
bug.
Else we will need a dedicated fix for the stable branch.
Okay I'll try to fix it, though I can't make any promises at present.
Have tried once but stopped since we don't have enough debug info from the
frontend side so basically I was debugging the backend based on guesses.
Hi Maxime, Yuanhan,

I've identified the root cause, do you think it makes sense to put the fix
in the same patch set? Or send it as a separated patch?


Thanks
Zhihong
Yuanhan Liu
2016-08-23 12:22:06 UTC
Permalink
Post by Wang, Zhihong
Post by Wang, Zhihong
Post by Maxime Coquelin
I forgot to add that before this series, I think we should first fix the windows
bug.
Post by Maxime Coquelin
Else we will need a dedicated fix for the stable branch.
Okay I'll try to fix it, though I can't make any promises at present.
Have tried once but stopped since we don't have enough debug info from the
frontend side so basically I was debugging the backend based on guesses.
Hi Maxime, Yuanhan,
I've identified the root cause, do you think it makes sense to put the fix
in the same patch set? Or send it as a separated patch?
Great!

Yes, it's okay to put it in the patch set (normally, as the first patch,
before the rewrite).

Please also add following line before your Signed-off-by in the commit
log:

Cc: <***@dpdk.org>

--yliu
Maxime Coquelin
2016-08-23 12:16:40 UTC
Permalink
Post by Wang, Zhihong
-----Original Message-----
From: Wang, Zhihong
Sent: Tuesday, August 23, 2016 10:31 AM
Subject: RE: [PATCH v3 0/5] vhost: optimize enqueue
-----Original Message-----
Sent: Monday, August 22, 2016 6:02 PM
Subject: Re: [PATCH v3 0/5] vhost: optimize enqueue
..
Post by Wang, Zhihong
I forgot to add that before this series, I think we should first fix the windows
bug.
Else we will need a dedicated fix for the stable branch.
Okay I'll try to fix it, though I can't make any promises at present.
Have tried once but stopped since we don't have enough debug info from the
frontend side so basically I was debugging the backend based on guesses.
Hi Maxime, Yuanhan,
I've identified the root cause, do you think it makes sense to put the fix
in the same patch set? Or send it as a separated patch?
Good work!

Agree with Yuanhan, send it before the optimization series.

Thanks,
Maxime
Wang, Zhihong
2016-08-23 02:15:43 UTC
Permalink
Subject: Re: [PATCH v3 0/5] vhost: optimize enqueue
Hi Zhihong,
[...]
Post by Zhihong Wang
1. Reorder code to reduce CPU pipeline stall cycles.
2. Batch update the used ring for better efficiency.
3. Prefetch descriptor to hide cache latency.
4. Remove useless volatile attribute to allow compiler optimization.
Thanks for these details, this is helpful to understand where the perf
gain comes from.
I would suggest to add these information as comments in the code
where/if it makes sense. If more a general comment, at least add it in
the commit message of the patch introducing it.
Indeed, adding it to the cover letter is fine, but the information is
lost as soon as the series is applied.
Hi Maxime,

I did add these info in the later optimization patches to explain each
optimization techniques. The v1 was indeed hard to read.
You don't mention any figures, so I set up a benchmark on my side to
evaluate your series. It indeed shows an interesting performance gain.
My setup consists of one host running a guest.
The guest generates as much 64bytes packets as possible using
pktgen-dpdk. The hosts forwards received packets back to the guest
using testpmd on vhost pmd interface. Guest's vCPUs are pinned to
physical CPUs.
Thanks for doing the test!

I didn't publish any numbers since the gain varies in different platforms
and test setups.

In my phy to vm test on both IVB and HSW, where testpmd in the host rx from
the nic and enqueue to the guest, the enqueue efficiency (cycles per packet)
is 2.4x and 1.4x as fast as the current code for mergeable on and mergeable
off respectively, for v3 patch.
I tested it with and without your v1 patch, with and without
rx-mergeable feature turned ON.
Rx-Mergeable ON : 7.72Mpps
Rx-Mergeable ON + "vhost: optimize enqueue" v1: 9.19Mpps
Rx-Mergeable OFF: 10.52Mpps
Rx-Mergeable OFF + "vhost: optimize enqueue" v1: 10.60Mpps
Regards,
Maxime
Jianbo Liu
2016-09-21 08:50:01 UTC
Permalink
Hi Maxime,
Post by Maxime Coquelin
Hi Zhihong,
Post by Zhihong Wang
This patch set optimizes the vhost enqueue function.
...
Post by Maxime Coquelin
My setup consists of one host running a guest.
The guest generates as much 64bytes packets as possible using
Have you tested with other different packet size?
My testing shows that performance is dropping when packet size is more than 256.
Post by Maxime Coquelin
pktgen-dpdk. The hosts forwards received packets back to the guest
using testpmd on vhost pmd interface. Guest's vCPUs are pinned to
physical CPUs.
I tested it with and without your v1 patch, with and without
rx-mergeable feature turned ON.
Rx-Mergeable ON : 7.72Mpps
Rx-Mergeable ON + "vhost: optimize enqueue" v1: 9.19Mpps
Rx-Mergeable OFF: 10.52Mpps
Rx-Mergeable OFF + "vhost: optimize enqueue" v1: 10.60Mpps
Regards,
Maxime
Wang, Zhihong
2016-09-21 09:27:21 UTC
Permalink
-----Original Message-----
Sent: Wednesday, September 21, 2016 4:50 PM
Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
Hi Maxime,
On 22 August 2016 at 16:11, Maxime Coquelin
Post by Maxime Coquelin
Hi Zhihong,
Post by Zhihong Wang
This patch set optimizes the vhost enqueue function.
...
Post by Maxime Coquelin
My setup consists of one host running a guest.
The guest generates as much 64bytes packets as possible using
Have you tested with other different packet size?
My testing shows that performance is dropping when packet size is more than 256.
Hi Jianbo,

Thanks for reporting this.

1. Are you running the vector frontend with mrg_rxbuf=off?

2. Could you please specify what CPU you're running? Is it Haswell
or Ivy Bridge?

3. How many percentage of drop are you seeing?

This is expected by me because I've already found the root cause and
the way to optimize it, but since it missed the v0 deadline and
requires changes in eal/memcpy, I postpone it to the next release.

After the upcoming optimization the performance for packets larger
than 256 will be improved, and the new code will be much faster than
the current code.


Thanks
Zhihong
Post by Maxime Coquelin
pktgen-dpdk. The hosts forwards received packets back to the guest
using testpmd on vhost pmd interface. Guest's vCPUs are pinned to
physical CPUs.
I tested it with and without your v1 patch, with and without
rx-mergeable feature turned ON.
Rx-Mergeable ON : 7.72Mpps
Rx-Mergeable ON + "vhost: optimize enqueue" v1: 9.19Mpps
Rx-Mergeable OFF: 10.52Mpps
Rx-Mergeable OFF + "vhost: optimize enqueue" v1: 10.60Mpps
Regards
Jianbo Liu
2016-09-21 12:54:11 UTC
Permalink
Post by Wang, Zhihong
-----Original Message-----
Sent: Wednesday, September 21, 2016 4:50 PM
Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
Hi Maxime,
On 22 August 2016 at 16:11, Maxime Coquelin
Post by Maxime Coquelin
Hi Zhihong,
Post by Zhihong Wang
This patch set optimizes the vhost enqueue function.
...
Post by Maxime Coquelin
My setup consists of one host running a guest.
The guest generates as much 64bytes packets as possible using
Have you tested with other different packet size?
My testing shows that performance is dropping when packet size is more than 256.
Hi Jianbo,
Thanks for reporting this.
1. Are you running the vector frontend with mrg_rxbuf=off?
2. Could you please specify what CPU you're running? Is it Haswell
or Ivy Bridge?
3. How many percentage of drop are you seeing?
This is expected by me because I've already found the root cause and
the way to optimize it, but since it missed the v0 deadline and
requires changes in eal/memcpy, I postpone it to the next release.
After the upcoming optimization the performance for packets larger
than 256 will be improved, and the new code will be much faster than
the current code.
Sorry, I tested on an ARM server, but I wonder if there is the same
issue for x86 platform.
Post by Wang, Zhihong
Post by Maxime Coquelin
pktgen-dpdk. The hosts forwards received packets back to the guest
using testpmd on vhost pmd interface. Guest's vCPUs are pinned to
physical CPUs.
I tested it with and without your v1 patch, with and without
rx-mergeable feature turned ON.
Rx-Mergeable ON : 7.72Mpps
Rx-Mergeable ON + "vhost: optimize enqueue" v1: 9.19Mpps
Rx-Mergeable OFF: 10.52Mpps
Rx-Mergeable OFF + "vhost: optimize enqueue" v1: 10.60Mpps
Wang, Zhihong
2016-09-22 02:11:23 UTC
Permalink
-----Original Message-----
Sent: Wednesday, September 21, 2016 8:54 PM
Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
Post by Wang, Zhihong
-----Original Message-----
Sent: Wednesday, September 21, 2016 4:50 PM
Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
Hi Maxime,
On 22 August 2016 at 16:11, Maxime Coquelin
Post by Maxime Coquelin
Hi Zhihong,
Post by Zhihong Wang
This patch set optimizes the vhost enqueue function.
...
Post by Maxime Coquelin
My setup consists of one host running a guest.
The guest generates as much 64bytes packets as possible using
Have you tested with other different packet size?
My testing shows that performance is dropping when packet size is more than 256.
Hi Jianbo,
Thanks for reporting this.
1. Are you running the vector frontend with mrg_rxbuf=off?
2. Could you please specify what CPU you're running? Is it Haswell
or Ivy Bridge?
3. How many percentage of drop are you seeing?
This is expected by me because I've already found the root cause and
the way to optimize it, but since it missed the v0 deadline and
requires changes in eal/memcpy, I postpone it to the next release.
After the upcoming optimization the performance for packets larger
than 256 will be improved, and the new code will be much faster than
the current code.
Sorry, I tested on an ARM server, but I wonder if there is the same
issue for x86 platform.
For mrg_rxbuf=off path it might be slight drop for packets larger than
256B (~3% for 512B and ~1% for 1024B), no drop for other cases.

This is not a bug or issue, only we need to enhance memcpy to complete
the whole optimization, which should be done in a separated patch,
unfortunately it misses this release window.
Post by Wang, Zhihong
Post by Maxime Coquelin
pktgen-dpdk. The hosts forwards received packets back to the guest
using testpmd on vhost pmd interface. Guest's vCPUs are pinned to
physical CPUs.
I tested it with and without your v1 patch, with and without
rx-mergeable feature turned ON.
Rx-Mergeable ON : 7.72Mpps
Rx-Mergeable ON + "vhost: optimize enqueue" v1: 9.19Mpps
Rx-Mergeable OFF: 10.52Mpps
Rx-Mergeable OFF + "vhost: optimize enqueue" v1: 10.60Mpps
Yuanhan Liu
2016-09-22 02:29:03 UTC
Permalink
Post by Jianbo Liu
Post by Wang, Zhihong
Post by Jianbo Liu
Post by Maxime Coquelin
My setup consists of one host running a guest.
The guest generates as much 64bytes packets as possible using
Have you tested with other different packet size?
My testing shows that performance is dropping when packet size is more than 256.
Hi Jianbo,
Thanks for reporting this.
1. Are you running the vector frontend with mrg_rxbuf=off?
2. Could you please specify what CPU you're running? Is it Haswell
or Ivy Bridge?
3. How many percentage of drop are you seeing?
This is expected by me because I've already found the root cause and
the way to optimize it, but since it missed the v0 deadline and
requires changes in eal/memcpy, I postpone it to the next release.
After the upcoming optimization the performance for packets larger
than 256 will be improved, and the new code will be much faster than
the current code.
Sorry, I tested on an ARM server, but I wonder if there is the same
issue for x86 platform.
Would you please provide more details? Say, answer the two left
questions from Zhihong?

Thanks.

--yliu
Jianbo Liu
2016-09-22 05:47:45 UTC
Permalink
Post by Yuanhan Liu
Post by Jianbo Liu
Post by Wang, Zhihong
Post by Jianbo Liu
Post by Maxime Coquelin
My setup consists of one host running a guest.
The guest generates as much 64bytes packets as possible using
Have you tested with other different packet size?
My testing shows that performance is dropping when packet size is more than 256.
Hi Jianbo,
Thanks for reporting this.
1. Are you running the vector frontend with mrg_rxbuf=off?
Yes, my testing is mrg_rxbuf=off, but not vector frontend PMD.
Post by Yuanhan Liu
Post by Jianbo Liu
Post by Wang, Zhihong
2. Could you please specify what CPU you're running? Is it Haswell
or Ivy Bridge?
It's an ARM server.
Post by Yuanhan Liu
Post by Jianbo Liu
Post by Wang, Zhihong
3. How many percentage of drop are you seeing?
The testing result:
size (bytes) improvement (%)
64 3.92
128 11.51
256 24.16
512 -13.79
1024 -22.51
1500 -12.22
A correction is that performance is dropping if byte size is larger than 512.
Post by Yuanhan Liu
Post by Jianbo Liu
Post by Wang, Zhihong
This is expected by me because I've already found the root cause and
the way to optimize it, but since it missed the v0 deadline and
requires changes in eal/memcpy, I postpone it to the next release.
After the upcoming optimization the performance for packets larger
than 256 will be improved, and the new code will be much faster than
the current code.
Sorry, I tested on an ARM server, but I wonder if there is the same
issue for x86 platform.
Would you please provide more details? Say, answer the two left
questions from Zhihong?
Thanks.
--yliu
Wang, Zhihong
2016-09-22 06:58:24 UTC
Permalink
-----Original Message-----
Sent: Thursday, September 22, 2016 1:48 PM
Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
Post by Yuanhan Liu
Post by Jianbo Liu
Post by Wang, Zhihong
Post by Jianbo Liu
Post by Maxime Coquelin
My setup consists of one host running a guest.
The guest generates as much 64bytes packets as possible using
Have you tested with other different packet size?
My testing shows that performance is dropping when packet size is
more
Post by Yuanhan Liu
Post by Jianbo Liu
Post by Wang, Zhihong
Post by Jianbo Liu
than 256.
Hi Jianbo,
Thanks for reporting this.
1. Are you running the vector frontend with mrg_rxbuf=off?
Yes, my testing is mrg_rxbuf=off, but not vector frontend PMD.
Post by Yuanhan Liu
Post by Jianbo Liu
Post by Wang, Zhihong
2. Could you please specify what CPU you're running? Is it Haswell
or Ivy Bridge?
It's an ARM server.
Post by Yuanhan Liu
Post by Jianbo Liu
Post by Wang, Zhihong
3. How many percentage of drop are you seeing?
size (bytes) improvement (%)
64 3.92
128 11.51
256 24.16
512 -13.79
1024 -22.51
1500 -12.22
A correction is that performance is dropping if byte size is larger than 512.
Jianbo,

Could you please verify does this patch really cause enqueue perf to drop?

You can test the enqueue path only by setting the guest to do rxonly, and comparing
the Mpps reported by "show port stats all" in the guest.


Thanks
Zhihong
Post by Yuanhan Liu
Post by Jianbo Liu
Post by Wang, Zhihong
This is expected by me because I've already found the root cause and
the way to optimize it, but since it missed the v0 deadline and
requires changes in eal/memcpy, I postpone it to the next release.
After the upcoming optimization the performance for packets larger
than 256 will be improved, and the new code will be much faster than
the current code.
Sorry, I tested on an ARM server, but I wonder if there is the same
issue for x86 platform.
Would you please provide more details? Say, answer the two left
questions from Zhihong?
Thanks
Jianbo Liu
2016-09-22 09:01:41 UTC
Permalink
Post by Wang, Zhihong
-----Original Message-----
Sent: Thursday, September 22, 2016 1:48 PM
Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
Post by Wang, Zhihong
Post by Jianbo Liu
Post by Maxime Coquelin
My setup consists of one host running a guest.
The guest generates as much 64bytes packets as possible using
Have you tested with other different packet size?
My testing shows that performance is dropping when packet size is
more
Post by Wang, Zhihong
Post by Jianbo Liu
than 256.
Hi Jianbo,
Thanks for reporting this.
1. Are you running the vector frontend with mrg_rxbuf=off?
Yes, my testing is mrg_rxbuf=off, but not vector frontend PMD.
Post by Wang, Zhihong
2. Could you please specify what CPU you're running? Is it Haswell
or Ivy Bridge?
It's an ARM server.
Post by Wang, Zhihong
3. How many percentage of drop are you seeing?
size (bytes) improvement (%)
64 3.92
128 11.51
256 24.16
512 -13.79
1024 -22.51
1500 -12.22
A correction is that performance is dropping if byte size is larger than 512.
Jianbo,
Could you please verify does this patch really cause enqueue perf to drop?
You can test the enqueue path only by set guest to do rxonly, and compare
the mpps by show port stats all in the guest.
Tested with testpmd, host: txonly, guest: rxonly
size (bytes) improvement (%)
64 4.12
128 6
256 2.65
512 -1.12
1024 -7.02
Wang, Zhihong
2016-09-22 10:04:50 UTC
Permalink
-----Original Message-----
Sent: Thursday, September 22, 2016 5:02 PM
Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
Post by Wang, Zhihong
-----Original Message-----
Sent: Thursday, September 22, 2016 1:48 PM
Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
On 22 September 2016 at 10:29, Yuanhan Liu
Post by Wang, Zhihong
Post by Jianbo Liu
Post by Maxime Coquelin
My setup consists of one host running a guest.
The guest generates as much 64bytes packets as possible using
Have you tested with other different packet size?
My testing shows that performance is dropping when packet size is
more
Post by Wang, Zhihong
Post by Jianbo Liu
than 256.
Hi Jianbo,
Thanks for reporting this.
1. Are you running the vector frontend with mrg_rxbuf=off?
Yes, my testing is mrg_rxbuf=off, but not vector frontend PMD.
Post by Wang, Zhihong
2. Could you please specify what CPU you're running? Is it Haswell
or Ivy Bridge?
It's an ARM server.
Post by Wang, Zhihong
3. How many percentage of drop are you seeing?
size (bytes) improvement (%)
64 3.92
128 11.51
256 24.16
512 -13.79
1024 -22.51
1500 -12.22
A correction is that performance is dropping if byte size is larger than 512.
Jianbo,
Could you please verify does this patch really cause enqueue perf to drop?
You can test the enqueue path only by set guest to do rxonly, and compare
the mpps by show port stats all in the guest.
Tested with testpmd, host: txonly, guest: rxonly
size (bytes) improvement (%)
64 4.12
128 6
256 2.65
512 -1.12
1024 -7.02
I think your numbers are a little bit hard for me to understand; this patch's
optimization contains 2 parts:

1. ring operation: works for both mrg_rxbuf on and off

2. remote write ordering: works for mrg_rxbuf=on only

So, for mrg_rxbuf=off, if this patch is good for 64B packets, then it
shouldn't do anything bad for larger packets.

This is the gain on x86 platform: host iofwd between nic and vhost,
guest rxonly.

nic2vm enhancement
64 21.83%
128 16.97%
256 6.34%
512 0.01%
1024 0.00%

I suspect there's some complication in ARM's micro-arch.

Could you try v6 and apply all patches except the the last one:
[PATCH v6 6/6] vhost: optimize cache access

And see if there's still perf drop?


Thanks
Zhihong
Jianbo Liu
2016-09-22 14:41:37 UTC
Permalink
Post by Wang, Zhihong
-----Original Message-----
Sent: Thursday, September 22, 2016 5:02 PM
Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
Post by Wang, Zhihong
-----Original Message-----
Sent: Thursday, September 22, 2016 1:48 PM
Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
On 22 September 2016 at 10:29, Yuanhan Liu
Post by Wang, Zhihong
Post by Jianbo Liu
Post by Maxime Coquelin
My setup consists of one host running a guest.
The guest generates as much 64bytes packets as possible using
Have you tested with other different packet size?
My testing shows that performance is dropping when packet size is
more
Post by Wang, Zhihong
Post by Jianbo Liu
than 256.
Hi Jianbo,
Thanks for reporting this.
1. Are you running the vector frontend with mrg_rxbuf=off?
Yes, my testing is mrg_rxbuf=off, but not vector frontend PMD.
Post by Wang, Zhihong
2. Could you please specify what CPU you're running? Is it Haswell
or Ivy Bridge?
It's an ARM server.
Post by Wang, Zhihong
3. How many percentage of drop are you seeing?
size (bytes) improvement (%)
64 3.92
128 11.51
256 24.16
512 -13.79
1024 -22.51
1500 -12.22
A correction is that performance is dropping if byte size is larger than 512.
Jianbo,
Could you please verify does this patch really cause enqueue perf to drop?
You can test the enqueue path only by set guest to do rxonly, and compare
the mpps by show port stats all in the guest.
Tested with testpmd, host: txonly, guest: rxonly
size (bytes) improvement (%)
64 4.12
128 6
256 2.65
512 -1.12
1024 -7.02
I think your number is little bit hard to understand for me, this patch's
1. ring operation: works for both mrg_rxbuf on and off
2. remote write ordering: works for mrg_rxbuf=on only
So, for mrg_rxbuf=off, if this patch is good for 64B packets, then it
shouldn't do anything bad for larger packets.
This is the gain on x86 platform: host iofwd between nic and vhost,
guest rxonly.
nic2vm enhancement
64 21.83%
128 16.97%
256 6.34%
512 0.01%
1024 0.00%
I booted up a VM with 2 virtual ports, and stressed the traffic between them.
First, I stressed with pktgen-dpdk in VM, and did iofwd in host.
Then, as you told, I did rxonly in VM, and txonly in host.
Post by Wang, Zhihong
I suspect there's some complication in ARM's micro-arch.
[PATCH v6 6/6] vhost: optimize cache access
And see if there's still perf drop?
The last patch can improve the performance. The drop is actually
caused by the second patch.

Jianbo
Wang, Zhihong
2016-09-23 02:56:25 UTC
Permalink
-----Original Message-----
Sent: Thursday, September 22, 2016 10:42 PM
Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
Post by Wang, Zhihong
-----Original Message-----
Sent: Thursday, September 22, 2016 5:02 PM
Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
Post by Wang, Zhihong
-----Original Message-----
Sent: Thursday, September 22, 2016 1:48 PM
Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
On 22 September 2016 at 10:29, Yuanhan Liu
Post by Wang, Zhihong
Post by Jianbo Liu
Post by Maxime Coquelin
My setup consists of one host running a guest.
The guest generates as much 64bytes packets as possible using
Have you tested with other different packet size?
My testing shows that performance is dropping when packet size is
more
Post by Wang, Zhihong
Post by Jianbo Liu
than 256.
Hi Jianbo,
Thanks for reporting this.
1. Are you running the vector frontend with mrg_rxbuf=off?
Yes, my testing is mrg_rxbuf=off, but not vector frontend PMD.
Post by Wang, Zhihong
2. Could you please specify what CPU you're running? Is it Haswell
or Ivy Bridge?
It's an ARM server.
Post by Wang, Zhihong
3. How many percentage of drop are you seeing?
size (bytes) improvement (%)
64 3.92
128 11.51
256 24.16
512 -13.79
1024 -22.51
1500 -12.22
A correction is that performance is dropping if byte size is larger than 512.
Jianbo,
Could you please verify does this patch really cause enqueue perf to drop?
You can test the enqueue path only by set guest to do rxonly, and compare
the mpps by show port stats all in the guest.
Tested with testpmd, host: txonly, guest: rxonly
size (bytes) improvement (%)
64 4.12
128 6
256 2.65
512 -1.12
1024 -7.02
I think your number is little bit hard to understand for me, this patch's
1. ring operation: works for both mrg_rxbuf on and off
2. remote write ordering: works for mrg_rxbuf=on only
So, for mrg_rxbuf=off, if this patch is good for 64B packets, then it
shouldn't do anything bad for larger packets.
This is the gain on x86 platform: host iofwd between nic and vhost,
guest rxonly.
nic2vm enhancement
64 21.83%
128 16.97%
256 6.34%
512 0.01%
1024 0.00%
I bootup a VM with 2 virtual port, and stress the traffic between them.
First, I stressed with pktgen-dpdk in VM, and did iofwd in host.
Then, as you told, I did rxonly in VM, and txonly in host.
Post by Wang, Zhihong
I suspect there's some complication in ARM's micro-arch.
[PATCH v6 6/6] vhost: optimize cache access
And see if there's still perf drop?
The last patch can improve the performance. The drop is actually
caused by the second patch.
This is expected because the 2nd patch is just a baseline and all optimization
patches are organized in the rest of this patch set.

I think you can do bottleneck analysis on ARM to see what's slowing down the
perf, there might be some micro-arch complications there, most likely in
memcpy.

Do you use glibc's memcpy? I suggest hand-crafting it on your own.

Could you publish the mrg_rxbuf=on data also? Since it's more widely used
in terms of spec integrity.


Thanks
Zhihong
Jianbo Liu
2016-09-23 10:41:30 UTC
Permalink
On 23 September 2016 at 10:56, Wang, Zhihong <***@intel.com> wrote:
.....
Post by Wang, Zhihong
This is expected because the 2nd patch is just a baseline and all optimization
patches are organized in the rest of this patch set.
I think you can do bottleneck analysis on ARM to see what's slowing down the
perf, there might be some micro-arch complications there, mostly likely in
memcpy.
Do you use glibc's memcpy? I suggest to hand-crafted it on your own.
Could you publish the mrg_rxbuf=on data also? Since it's more widely used
in terms of spec integrity.
I don't think it will be helpful for you, considering the differences
between x86 and arm.
So please move on with this patchset...

Thanks!
Jianbo
Thomas Monjalon
2016-09-23 13:41:08 UTC
Permalink
Post by Jianbo Liu
.....
Post by Wang, Zhihong
This is expected because the 2nd patch is just a baseline and all optimization
patches are organized in the rest of this patch set.
I think you can do bottleneck analysis on ARM to see what's slowing down the
perf, there might be some micro-arch complications there, mostly likely in
memcpy.
Do you use glibc's memcpy? I suggest to hand-crafted it on your own.
Could you publish the mrg_rxbuf=on data also? Since it's more widely used
in terms of spec integrity.
I don't think it will be helpful for you, considering the differences
between x86 and arm.
So please move on with this patchset...
Jianbo,
I don't understand.
You said that the 2nd patch is a regression:
- volatile uint16_t last_used_idx;
+ uint16_t last_used_idx;

And the overall series leads to performance regression
for packets > 512 B, right?
But we don't know whether you have tested the v6 or not.

Zhihong talked about some improvements possible in rte_memcpy.
ARM64 is using libc memcpy in rte_memcpy.

Now you seem to give up.
Does it mean you accept having a regression in 16.11 release?
Are you working on rte_memcpy?
Wang, Zhihong
2016-09-25 05:41:55 UTC
Permalink
-----Original Message-----
Sent: Friday, September 23, 2016 9:41 PM
Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
Post by Jianbo Liu
.....
Post by Wang, Zhihong
This is expected because the 2nd patch is just a baseline and all optimization
patches are organized in the rest of this patch set.
I think you can do bottleneck analysis on ARM to see what's slowing down the
perf, there might be some micro-arch complications there, mostly likely in
memcpy.
Do you use glibc's memcpy? I suggest to hand-crafted it on your own.
Could you publish the mrg_rxbuf=on data also? Since it's more widely used
in terms of spec integrity.
I don't think it will be helpful for you, considering the differences
between x86 and arm.
Hi Jianbo,

This patch does help in ARM for small packets like 64B sized ones,
this actually proves the similarity between x86 and ARM in terms
of caching optimization in this patch.

My estimation is based on:

1. The last patch is for mrg_rxbuf=on, and since you said it helps
perf, we can ignore it for now when we discuss mrg_rxbuf=off

2. Vhost enqueue perf =
Ring overhead + Virtio header overhead + Data memcpy overhead

3. This patch helps small packets traffic, which means it helps
ring + virtio header operations

4. So, when you say perf drop when packet size larger than 512B,
this is most likely caused by memcpy in ARM not working well
with this patch

I'm not saying glibc's memcpy is not good enough, it's just that
this is a rather special use case. And since we see specialized
memcpy + this patch give better performance than other combinations
significantly on x86, we suggest to hand-craft a specialized memcpy
for it.

Of course on ARM this is still just my speculation, and we need to
either prove it or find the actual root cause.

It can be **REALLY HELPFUL** if you could help to test this patch on
ARM for mrg_rxbuf=on cases to see if this patch is in fact helpful
to ARM at all, since mrg_rxbuf=on is the more widely used case.


Thanks
Zhihong
Post by Jianbo Liu
So please move on with this patchset...
Jianbo,
I don't understand.
- volatile uint16_t last_used_idx;
+ uint16_t last_used_idx;
And the overrall series lead to performance regression
for packets > 512 B, right?
But we don't know wether you have tested the v6 or not.
Zhihong talked about some improvements possible in rte_memcpy.
ARM64 is using libc memcpy in rte_memcpy.
Now you seem to give up.
Does it mean you accept having a regression in 16.11 release?
Are you working on rte_memcpy?
Zhihong Wang
2016-08-30 03:35:58 UTC
Permalink
This patch set optimizes the vhost enqueue function.

It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
significantly by optimizing cache access, which means:

* Higher maximum throughput can be achieved for fast frontends like DPDK
virtio pmd.

* Better scalability can be achieved that each vhost core can support
more connections because it takes less cycles to handle each single
frontend.

This patch set contains:

1. A Windows VM compatibility fix for vhost enqueue in 16.07 release.

2. A baseline patch to rewrite the vhost logic.

3. A series of optimization patches added upon the baseline.

The main optimization techniques are:

1. Reorder code to reduce CPU pipeline stall cycles.

2. Batch update the used ring for better efficiency.

3. Prefetch descriptor to hide cache latency.

4. Remove useless volatile attribute to allow compiler optimization.

Code reordering and batch used ring update bring most of the performance
improvements.

In the existing code there're 2 callbacks for vhost enqueue:

* virtio_dev_merge_rx for mrg_rxbuf turned on cases.

* virtio_dev_rx for mrg_rxbuf turned off cases.

The performance of the existing code is not optimal, especially when the
mrg_rxbuf feature turned on. Besides, having 2 callback paths increases
maintenance efforts.

Also, there's a compatibility issue in the existing code which causes
Windows VM to hang when the mrg_rxbuf feature turned on.

---
Changes in v4:

1. Fix a Windows VM compatibility issue.

2. Free shadow used ring in the right place.

3. Add failure check for shadow used ring malloc.

4. Refactor the code for clearer logic.

5. Add PRINT_PACKET for debugging.

---
Changes in v3:

1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

2. Rename variables to follow naming convention.

3. Rewrite enqueue and delete the obsolete in the same patch.

---
Changes in v2:

1. Split the big function into several small ones.

2. Use multiple patches to explain each optimization.

3. Add comments.

Zhihong Wang (6):
vhost: fix windows vm hang
vhost: rewrite enqueue
vhost: remove useless volatile
vhost: add desc prefetch
vhost: batch update used ring
vhost: optimize cache access

lib/librte_vhost/vhost-net.h | 6 +-
lib/librte_vhost/vhost_rxtx.c | 572 +++++++++++++++---------------------------
lib/librte_vhost/virtio-net.c | 42 +++-
3 files changed, 244 insertions(+), 376 deletions(-)
--
2.7.4
Zhihong Wang
2016-08-30 03:35:59 UTC
Permalink
This patch fixes a Windows VM compatibility issue in DPDK 16.07 vhost code,
which causes the guest to hang once any packets are enqueued when mrg_rxbuf
is turned on.

How to test?

1. Start testpmd in the host with a vhost port.

2. Start a Windows VM image with qemu and connect to the vhost port.

3. Start io forwarding with tx_first in host testpmd.

For 16.07 code, the Windows VM will hang once any packets are enqueued.

Cc: <***@dpdk.org>
Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/vhost_rxtx.c | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 08a73fd..5806f99 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -384,6 +384,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint16_t start_idx = vq->last_used_idx;
uint16_t cur_idx = start_idx;
uint64_t desc_addr;
+ uint32_t desc_chain_head;
+ uint32_t desc_chain_len;
uint32_t mbuf_offset, mbuf_avail;
uint32_t desc_offset, desc_avail;
uint32_t cpy_len;
@@ -412,6 +414,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,

desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
desc_offset = dev->vhost_hlen;
+ desc_chain_head = buf_vec[vec_idx].desc_idx;
+ desc_chain_len = desc_offset;

mbuf_avail = rte_pktmbuf_data_len(m);
mbuf_offset = 0;
@@ -419,19 +423,21 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
/* done with current desc buf, get the next one */
if (desc_avail == 0) {
desc_idx = buf_vec[vec_idx].desc_idx;
+ vec_idx++;

if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
/* Update used ring with desc information */
used_idx = cur_idx++ & (vq->size - 1);
- vq->used->ring[used_idx].id = desc_idx;
- vq->used->ring[used_idx].len = desc_offset;
+ vq->used->ring[used_idx].id = desc_chain_head;
+ vq->used->ring[used_idx].len = desc_chain_len;
vhost_log_used_vring(dev, vq,
offsetof(struct vring_used,
ring[used_idx]),
sizeof(vq->used->ring[used_idx]));
+ desc_chain_head = buf_vec[vec_idx].desc_idx;
+ desc_chain_len = 0;
}

- vec_idx++;
desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
if (unlikely(!desc_addr))
return 0;
@@ -463,11 +469,12 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
mbuf_offset += cpy_len;
desc_avail -= cpy_len;
desc_offset += cpy_len;
+ desc_chain_len += cpy_len;
}

used_idx = cur_idx & (vq->size - 1);
- vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
- vq->used->ring[used_idx].len = desc_offset;
+ vq->used->ring[used_idx].id = desc_chain_head;
+ vq->used->ring[used_idx].len = desc_chain_len;
vhost_log_used_vring(dev, vq,
offsetof(struct vring_used, ring[used_idx]),
sizeof(vq->used->ring[used_idx]));
--
2.7.4
Yuanhan Liu
2016-09-05 05:24:46 UTC
Permalink
Post by Zhihong Wang
This patch fixes a Windows VM compatibility issue in DPDK 16.07 vhost code,
which causes the guest to hang once any packets are enqueued when mrg_rxbuf
is turned on.
This commit log lacks two important pieces: why does the hang happen and
how does your patch fix it.
Post by Zhihong Wang
How to test?
1. Start testpmd in the host with a vhost port.
2. Start a Windows VM image with qemu and connect to the vhost port.
3. Start io forwarding with tx_first in host testpmd.
For 16.07 code, the Windows VM will hang once any packets are enqueued.
---
lib/librte_vhost/vhost_rxtx.c | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 08a73fd..5806f99 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -384,6 +384,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint16_t start_idx = vq->last_used_idx;
uint16_t cur_idx = start_idx;
uint64_t desc_addr;
+ uint32_t desc_chain_head;
+ uint32_t desc_chain_len;
What's the point of introducing "desc_chain_len"? It has the same value
of desc_offset.

--yliu
Wang, Zhihong
2016-09-05 05:25:31 UTC
Permalink
-----Original Message-----
Sent: Monday, September 5, 2016 1:25 PM
Subject: Re: [dpdk-stable] [PATCH v4 1/6] vhost: fix windows vm hang
Post by Zhihong Wang
This patch fixes a Windows VM compatibility issue in DPDK 16.07 vhost
code,
Post by Zhihong Wang
which causes the guest to hang once any packets are enqueued when
mrg_rxbuf
Post by Zhihong Wang
is turned on.
This commit log lacks two important pieces: why does the hang happen and
how does your patch fix it.
Okay, I'll add it in v5.
Post by Zhihong Wang
How to test?
1. Start testpmd in the host with a vhost port.
2. Start a Windows VM image with qemu and connect to the vhost port.
3. Start io forwarding with tx_first in host testpmd.
For 16.07 code, the Windows VM will hang once any packets are enqueued.
---
lib/librte_vhost/vhost_rxtx.c | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 08a73fd..5806f99 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -384,6 +384,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net
*dev, struct vhost_virtqueue *vq,
Post by Zhihong Wang
uint16_t start_idx = vq->last_used_idx;
uint16_t cur_idx = start_idx;
uint64_t desc_addr;
+ uint32_t desc_chain_head;
+ uint32_t desc_chain_len;
What's the point of introducing "desc_chain_len"? It has the same value
of desc_offset.
No it's not, desc_offset is the offset of the current desc only.
That's where the old code goes wrong.

If you take a look at the virtio spec:

/* le32 is used here for ids for padding reasons. */
struct vring_used_elem {
/* Index of start of used descriptor chain. */
le32 id;
/* Total length of the descriptor chain which was written to. */
le32 len;
};
--yliu
Yuanhan Liu
2016-09-05 05:40:51 UTC
Permalink
Post by Wang, Zhihong
-----Original Message-----
Sent: Monday, September 5, 2016 1:25 PM
Subject: Re: [dpdk-stable] [PATCH v4 1/6] vhost: fix windows vm hang
Post by Zhihong Wang
This patch fixes a Windows VM compatibility issue in DPDK 16.07 vhost
code,
Post by Zhihong Wang
which causes the guest to hang once any packets are enqueued when
mrg_rxbuf
Post by Zhihong Wang
is turned on.
This commit log lacks two important pieces: why does the hang happen and
how does your patch fix it.
Okay, I'll add it in v5.
Post by Zhihong Wang
How to test?
1. Start testpmd in the host with a vhost port.
2. Start a Windows VM image with qemu and connect to the vhost port.
3. Start io forwarding with tx_first in host testpmd.
For 16.07 code, the Windows VM will hang once any packets are enqueued.
---
lib/librte_vhost/vhost_rxtx.c | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 08a73fd..5806f99 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -384,6 +384,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net
*dev, struct vhost_virtqueue *vq,
Post by Zhihong Wang
uint16_t start_idx = vq->last_used_idx;
uint16_t cur_idx = start_idx;
uint64_t desc_addr;
+ uint32_t desc_chain_head;
+ uint32_t desc_chain_len;
What's the point of introducing "desc_chain_len"? It has the same value
of desc_offset.
No it's not, desc_offset is the offset of the current desc only.
That's where the old code goes wrong.
Oh, right.

--yliu
Zhihong Wang
2016-08-30 03:36:00 UTC
Permalink
This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.

This is the baseline version of the new code, more optimization will be
added in the following patches in this patch set.

---
Changes in v4:

1. Refactor the code for clearer logic.

2. Add PRINT_PACKET for debugging.

---
Changes in v3:

1. Rewrite enqueue and delete the obsolete in the same patch.

Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/vhost_rxtx.c | 525 ++++++++++++------------------------------
1 file changed, 145 insertions(+), 380 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 5806f99..629e8ae 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
}

-static void
+static inline void __attribute__((always_inline))
virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
{
if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -112,6 +112,10 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
cksum));
break;
}
+ } else {
+ net_hdr->flags = 0;
+ net_hdr->csum_start = 0;
+ net_hdr->csum_offset = 0;
}

if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
@@ -122,437 +126,198 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
net_hdr->gso_size = m_buf->tso_segsz;
net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
+ m_buf->l4_len;
+ } else {
+ net_hdr->gso_type = 0;
+ net_hdr->hdr_len = 0;
+ net_hdr->gso_size = 0;
}
}

-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
- struct virtio_net_hdr_mrg_rxbuf hdr)
+static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t desc_chain_head, uint32_t desc_chain_len)
{
- if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
- *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
- else
- *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+ uint32_t used_idx_round = vq->last_used_idx & (vq->size - 1);
+
+ vq->used->ring[used_idx_round].id = desc_chain_head;
+ vq->used->ring[used_idx_round].len = desc_chain_len;
+ vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
+ ring[used_idx_round]),
+ sizeof(vq->used->ring[used_idx_round]));
}

-static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
- struct rte_mbuf *m, uint16_t desc_idx)
+static inline uint32_t __attribute__((always_inline))
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint16_t avail_idx, struct rte_mbuf *mbuf,
+ uint32_t is_mrg_rxbuf)
{
- uint32_t desc_avail, desc_offset;
- uint32_t mbuf_avail, mbuf_offset;
- uint32_t cpy_len;
+ struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
struct vring_desc *desc;
uint64_t desc_addr;
- struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-
- desc = &vq->desc[desc_idx];
+ uint32_t desc_chain_head;
+ uint32_t desc_chain_len;
+ uint32_t desc_current;
+ uint32_t desc_offset;
+ uint32_t mbuf_len;
+ uint32_t mbuf_avail;
+ uint32_t copy_len;
+ uint32_t extra_buffers = 0;
+
+ /* start with the first mbuf of the packet */
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_avail = mbuf_len;
+
+ /* get the current desc */
+ desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc = &vq->desc[desc_current];
desc_addr = gpa_to_vva(dev, desc->addr);
- /*
- * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
- * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
- * otherwise stores offset on the stack instead of in a register.
- */
- if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
- return -1;
+ if (unlikely(!desc_addr))
+ goto error;

- rte_prefetch0((void *)(uintptr_t)desc_addr);
+ /* handle virtio header */
+ virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
+ virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+ if (is_mrg_rxbuf)
+ virtio_hdr->num_buffers = extra_buffers + 1;

- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
desc_offset = dev->vhost_hlen;
- desc_avail = desc->len - dev->vhost_hlen;
-
- mbuf_avail = rte_pktmbuf_data_len(m);
- mbuf_offset = 0;
- while (mbuf_avail != 0 || m->next != NULL) {
- /* done with current mbuf, fetch next */
- if (mbuf_avail == 0) {
- m = m->next;
-
- mbuf_offset = 0;
- mbuf_avail = rte_pktmbuf_data_len(m);
+ desc_chain_len = desc_offset;
+ desc_addr += desc_offset;
+
+ /* start copy from mbuf to desc */
+ while (mbuf_avail || mbuf->next) {
+ /* get the next mbuf if the current done */
+ if (!mbuf_avail) {
+ mbuf = mbuf->next;
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_avail = mbuf_len;
}

- /* done with current desc buf, fetch next */
- if (desc_avail == 0) {
- if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
- /* Room in vring buffer is not enough */
- return -1;
- }
- if (unlikely(desc->next >= vq->size))
- return -1;
+ /* get the next desc if the current done */
+ if (desc->len <= desc_offset) {
+ if (desc->flags & VRING_DESC_F_NEXT) {
+ /* go on with the current desc chain */
+ desc_offset = 0;
+ desc_current = desc->next;
+ desc = &vq->desc[desc_current];
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ goto error;
+ } else if (is_mrg_rxbuf) {
+ /* start with the next desc chain */
+ update_used_ring(dev, vq, desc_chain_head,
+ desc_chain_len);
+ vq->last_used_idx++;
+ extra_buffers++;
+ virtio_hdr->num_buffers++;
+ if (avail_idx == vq->last_used_idx)
+ goto error;
+
+ desc_current =
+ vq->avail->ring[(vq->last_used_idx) &
+ (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc = &vq->desc[desc_current];
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ goto error;

- desc = &vq->desc[desc->next];
- desc_addr = gpa_to_vva(dev, desc->addr);
- if (unlikely(!desc_addr))
- return -1;
-
- desc_offset = 0;
- desc_avail = desc->len;
+ desc_chain_len = 0;
+ desc_offset = 0;
+ } else
+ goto error;
}

- cpy_len = RTE_MIN(desc_avail, mbuf_avail);
- rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
- rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
- cpy_len);
- vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
- PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
- cpy_len, 0);
-
- mbuf_avail -= cpy_len;
- mbuf_offset += cpy_len;
- desc_avail -= cpy_len;
- desc_offset += cpy_len;
+ /* copy mbuf data */
+ copy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+ rte_memcpy((void *)(uintptr_t)desc_addr,
+ rte_pktmbuf_mtod_offset(mbuf, void *,
+ mbuf_len - mbuf_avail),
+ copy_len);
+ vhost_log_write(dev, desc->addr + desc_offset, copy_len);
+ PRINT_PACKET(dev, (uintptr_t)desc_addr, copy_len, 0);
+ mbuf_avail -= copy_len;
+ desc_offset += copy_len;
+ desc_addr += copy_len;
+ desc_chain_len += copy_len;
}

- return 0;
-}
+ update_used_ring(dev, vq, desc_chain_head, desc_chain_len);
+ vq->last_used_idx++;

-/**
- * This function adds buffers to the virtio devices RX virtqueue. Buffers can
- * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that are succesfully
- * added to the RX queue. This function works when the mbuf is scattered, but
- * it doesn't support the mergeable feature.
- */
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
- struct rte_mbuf **pkts, uint32_t count)
-{
- struct vhost_virtqueue *vq;
- uint16_t avail_idx, free_entries, start_idx;
- uint16_t desc_indexes[MAX_PKT_BURST];
- uint16_t used_idx;
- uint32_t i;
-
- LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
- if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
- RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
- dev->vid, __func__, queue_id);
- return 0;
- }
-
- vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
- return 0;
-
- avail_idx = *((volatile uint16_t *)&vq->avail->idx);
- start_idx = vq->last_used_idx;
- free_entries = avail_idx - start_idx;
- count = RTE_MIN(count, free_entries);
- count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
- if (count == 0)
- return 0;
-
- LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
- dev->vid, start_idx, start_idx + count);
-
- /* Retrieve all of the desc indexes first to avoid caching issues. */
- rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
- for (i = 0; i < count; i++) {
- used_idx = (start_idx + i) & (vq->size - 1);
- desc_indexes[i] = vq->avail->ring[used_idx];
- vq->used->ring[used_idx].id = desc_indexes[i];
- vq->used->ring[used_idx].len = pkts[i]->pkt_len +
- dev->vhost_hlen;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
-
- rte_prefetch0(&vq->desc[desc_indexes[0]]);
- for (i = 0; i < count; i++) {
- uint16_t desc_idx = desc_indexes[i];
- int err;
+ return 0;

- err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
- if (unlikely(err)) {
- used_idx = (start_idx + i) & (vq->size - 1);
- vq->used->ring[used_idx].len = dev->vhost_hlen;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
+error:
+ /* rollback on any error if last_used_idx update on-the-fly */
+ vq->last_used_idx -= extra_buffers;

- if (i + 1 < count)
- rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
- }
+ return 1;
+}

+static inline void __attribute__((always_inline))
+notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
rte_smp_wmb();
-
- *(volatile uint16_t *)&vq->used->idx += count;
- vq->last_used_idx += count;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
-
- /* flush used->idx update before we read avail->flags. */
+ vq->used->idx = vq->last_used_idx;
+ vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+ sizeof(vq->used->idx));
rte_mb();
-
- /* Kick the guest if necessary. */
if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
&& (vq->callfd >= 0))
eventfd_write(vq->callfd, (eventfd_t)1);
- return count;
}

-static inline int
-fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
- uint32_t *allocated, uint32_t *vec_idx,
- struct buf_vector *buf_vec)
-{
- uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
- uint32_t vec_id = *vec_idx;
- uint32_t len = *allocated;
-
- while (1) {
- if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
- return -1;
-
- len += vq->desc[idx].len;
- buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
- buf_vec[vec_id].buf_len = vq->desc[idx].len;
- buf_vec[vec_id].desc_idx = idx;
- vec_id++;
-
- if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
- break;
-
- idx = vq->desc[idx].next;
- }
-
- *allocated = len;
- *vec_idx = vec_id;
-
- return 0;
-}
-
-/*
- * Returns -1 on fail, 0 on success
- */
-static inline int
-reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
- uint16_t *end, struct buf_vector *buf_vec)
+uint16_t
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+ struct rte_mbuf **pkts, uint16_t count)
{
- uint16_t cur_idx;
+ struct vhost_virtqueue *vq;
+ struct virtio_net *dev;
+ uint32_t pkt_left = count;
+ uint32_t pkt_idx = 0;
+ uint32_t pkt_sent = 0;
+ uint32_t is_mrg_rxbuf = 0;
uint16_t avail_idx;
- uint32_t allocated = 0;
- uint32_t vec_idx = 0;
- uint16_t tries = 0;

- cur_idx = vq->last_used_idx;
-
- while (1) {
- avail_idx = *((volatile uint16_t *)&vq->avail->idx);
- if (unlikely(cur_idx == avail_idx))
- return -1;
-
- if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
- &vec_idx, buf_vec) < 0))
- return -1;
-
- cur_idx++;
- tries++;
-
- if (allocated >= size)
- break;
-
- /*
- * if we tried all available ring items, and still
- * can't get enough buf, it means something abnormal
- * happened.
- */
- if (unlikely(tries >= vq->size))
- return -1;
- }
-
- *end = cur_idx;
- return 0;
-}
-
-static inline uint32_t __attribute__((always_inline))
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
- uint16_t end_idx, struct rte_mbuf *m,
- struct buf_vector *buf_vec)
-{
- struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
- uint32_t vec_idx = 0;
- uint16_t start_idx = vq->last_used_idx;
- uint16_t cur_idx = start_idx;
- uint64_t desc_addr;
- uint32_t desc_chain_head;
- uint32_t desc_chain_len;
- uint32_t mbuf_offset, mbuf_avail;
- uint32_t desc_offset, desc_avail;
- uint32_t cpy_len;
- uint16_t desc_idx, used_idx;
-
- if (unlikely(m == NULL))
+ if (unlikely(!pkt_left))
return 0;

- LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
- dev->vid, cur_idx, end_idx);
+ pkt_left = RTE_MIN((uint32_t)MAX_PKT_BURST, pkt_left);

- desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
- if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
+ dev = get_device(vid);
+ if (unlikely(!dev))
return 0;

- rte_prefetch0((void *)(uintptr_t)desc_addr);
-
- virtio_hdr.num_buffers = end_idx - start_idx;
- LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
- dev->vid, virtio_hdr.num_buffers);
-
- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
- vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
- PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
- desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
- desc_offset = dev->vhost_hlen;
- desc_chain_head = buf_vec[vec_idx].desc_idx;
- desc_chain_len = desc_offset;
-
- mbuf_avail = rte_pktmbuf_data_len(m);
- mbuf_offset = 0;
- while (mbuf_avail != 0 || m->next != NULL) {
- /* done with current desc buf, get the next one */
- if (desc_avail == 0) {
- desc_idx = buf_vec[vec_idx].desc_idx;
- vec_idx++;
-
- if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
- /* Update used ring with desc information */
- used_idx = cur_idx++ & (vq->size - 1);
- vq->used->ring[used_idx].id = desc_chain_head;
- vq->used->ring[used_idx].len = desc_chain_len;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used,
- ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- desc_chain_head = buf_vec[vec_idx].desc_idx;
- desc_chain_len = 0;
- }
-
- desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
- if (unlikely(!desc_addr))
- return 0;
-
- /* Prefetch buffer address. */
- rte_prefetch0((void *)(uintptr_t)desc_addr);
- desc_offset = 0;
- desc_avail = buf_vec[vec_idx].buf_len;
- }
-
- /* done with current mbuf, get the next one */
- if (mbuf_avail == 0) {
- m = m->next;
-
- mbuf_offset = 0;
- mbuf_avail = rte_pktmbuf_data_len(m);
- }
-
- cpy_len = RTE_MIN(desc_avail, mbuf_avail);
- rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
- rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
- cpy_len);
- vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
- cpy_len);
- PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
- cpy_len, 0);
-
- mbuf_avail -= cpy_len;
- mbuf_offset += cpy_len;
- desc_avail -= cpy_len;
- desc_offset += cpy_len;
- desc_chain_len += cpy_len;
- }
-
- used_idx = cur_idx & (vq->size - 1);
- vq->used->ring[used_idx].id = desc_chain_head;
- vq->used->ring[used_idx].len = desc_chain_len;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
-
- return end_idx - start_idx;
-}
-
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
- struct rte_mbuf **pkts, uint32_t count)
-{
- struct vhost_virtqueue *vq;
- uint32_t pkt_idx = 0, nr_used = 0;
- uint16_t end;
- struct buf_vector buf_vec[BUF_VECTOR_MAX];
-
- LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
- if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
- RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
- dev->vid, __func__, queue_id);
+ if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb)))
return 0;
- }

vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
+ if (unlikely(!vq->enabled))
return 0;

- count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
- if (count == 0)
- return 0;
-
- for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
- uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
+ if (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))
+ is_mrg_rxbuf = 1;

- if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
- &end, buf_vec) < 0)) {
- LOG_DEBUG(VHOST_DATA,
- "(%d) failed to get enough desc from vring\n",
- dev->vid);
+ /* start enqueuing packets 1 by 1 */
+ avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+ while (pkt_left && avail_idx != vq->last_used_idx) {
+ if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
+ is_mrg_rxbuf))
break;
- }
-
- nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
- pkts[pkt_idx], buf_vec);
- rte_smp_wmb();
-
- *(volatile uint16_t *)&vq->used->idx += nr_used;
- vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
- vq->last_used_idx += nr_used;
- }
-
- if (likely(pkt_idx)) {
- /* flush used->idx update before we read avail->flags. */
- rte_mb();

- /* Kick the guest if necessary. */
- if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
- && (vq->callfd >= 0))
- eventfd_write(vq->callfd, (eventfd_t)1);
+ pkt_idx++;
+ pkt_sent++;
+ pkt_left--;
}

- return pkt_idx;
-}
-
-uint16_t
-rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
- struct rte_mbuf **pkts, uint16_t count)
-{
- struct virtio_net *dev = get_device(vid);
-
- if (!dev)
- return 0;
+ /* update used idx and kick the guest if necessary */
+ if (pkt_sent)
+ notify_guest(dev, vq);

- if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
- return virtio_dev_merge_rx(dev, queue_id, pkts, count);
- else
- return virtio_dev_rx(dev, queue_id, pkts, count);
+ return pkt_sent;
}

static void
--
2.7.4
Yuanhan Liu
2016-09-05 06:39:25 UTC
Permalink
Post by Zhihong Wang
This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.
This is the baseline version of the new code, more optimization will be
added in the following patches in this patch set.
---
1. Refactor the code for clearer logic.
2. Add PRINT_PACKET for debugging.
---
1. Rewrite enqueue and delete the obsolete in the same patch.
Change log should go ---->
Post by Zhihong Wang
---
... here, after the SoB.
Post by Zhihong Wang
lib/librte_vhost/vhost_rxtx.c | 525 ++++++++++++------------------------------
1 file changed, 145 insertions(+), 380 deletions(-)
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 5806f99..629e8ae 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
}
-static void
+static inline void __attribute__((always_inline))
virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
{
if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -112,6 +112,10 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
cksum));
break;
}
+ } else {
+ net_hdr->flags = 0;
+ net_hdr->csum_start = 0;
+ net_hdr->csum_offset = 0;
}
if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
@@ -122,437 +126,198 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
net_hdr->gso_size = m_buf->tso_segsz;
net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
+ m_buf->l4_len;
+ } else {
+ net_hdr->gso_type = 0;
+ net_hdr->hdr_len = 0;
+ net_hdr->gso_size = 0;
}
}
-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
- struct virtio_net_hdr_mrg_rxbuf hdr)
+static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t desc_chain_head, uint32_t desc_chain_len)
{
- if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
- *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
- else
- *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+ uint32_t used_idx_round = vq->last_used_idx & (vq->size - 1);
I'd suggest using "used_idx" instead of "used_idx_round".
Post by Zhihong Wang
+
+ vq->used->ring[used_idx_round].id = desc_chain_head;
+ vq->used->ring[used_idx_round].len = desc_chain_len;
+ vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
+ ring[used_idx_round]),
+ sizeof(vq->used->ring[used_idx_round]));
}
-static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
- struct rte_mbuf *m, uint16_t desc_idx)
+static inline uint32_t __attribute__((always_inline))
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint16_t avail_idx, struct rte_mbuf *mbuf,
+ uint32_t is_mrg_rxbuf)
{
- uint32_t desc_avail, desc_offset;
- uint32_t mbuf_avail, mbuf_offset;
- uint32_t cpy_len;
+ struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
struct vring_desc *desc;
uint64_t desc_addr;
- struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-
- desc = &vq->desc[desc_idx];
+ uint32_t desc_chain_head;
+ uint32_t desc_chain_len;
+ uint32_t desc_current;
+ uint32_t desc_offset;
+ uint32_t mbuf_len;
+ uint32_t mbuf_avail;
+ uint32_t copy_len;
+ uint32_t extra_buffers = 0;
I'd name it "num_buffers", to keep consistent with the virtio hdr
naming style.
Post by Zhihong Wang
+
+ /* start with the first mbuf of the packet */
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_avail = mbuf_len;
+
+ /* get the current desc */
+ desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc = &vq->desc[desc_current];
desc_addr = gpa_to_vva(dev, desc->addr);
- /*
- * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
- * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
- * otherwise stores offset on the stack instead of in a register.
- */
- if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
- return -1;
+ if (unlikely(!desc_addr))
+ goto error;
- rte_prefetch0((void *)(uintptr_t)desc_addr);
+ /* handle virtio header */
+ virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
+ virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+ if (is_mrg_rxbuf)
+ virtio_hdr->num_buffers = extra_buffers + 1;
- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
desc_offset = dev->vhost_hlen;
- desc_avail = desc->len - dev->vhost_hlen;
-
- mbuf_avail = rte_pktmbuf_data_len(m);
- mbuf_offset = 0;
- while (mbuf_avail != 0 || m->next != NULL) {
- /* done with current mbuf, fetch next */
- if (mbuf_avail == 0) {
- m = m->next;
-
- mbuf_offset = 0;
- mbuf_avail = rte_pktmbuf_data_len(m);
+ desc_chain_len = desc_offset;
+ desc_addr += desc_offset;
+
+ /* start copy from mbuf to desc */
+ while (mbuf_avail || mbuf->next) {
+ /* get the next mbuf if the current done */
+ if (!mbuf_avail) {
+ mbuf = mbuf->next;
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_avail = mbuf_len;
}
- /* done with current desc buf, fetch next */
- if (desc_avail == 0) {
- if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
- /* Room in vring buffer is not enough */
- return -1;
- }
- if (unlikely(desc->next >= vq->size))
- return -1;
+ /* get the next desc if the current done */
+ if (desc->len <= desc_offset) {
+ if (desc->flags & VRING_DESC_F_NEXT) {
+ /* go on with the current desc chain */
+ desc_offset = 0;
+ desc_current = desc->next;
+ desc = &vq->desc[desc_current];
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ goto error;
+ } else if (is_mrg_rxbuf) {
+ /* start with the next desc chain */
+ update_used_ring(dev, vq, desc_chain_head,
+ desc_chain_len);
+ vq->last_used_idx++;
Why not put "vq->last_used_idx++" into update_used_ring()?
Post by Zhihong Wang
+ extra_buffers++;
+ virtio_hdr->num_buffers++;
+ if (avail_idx == vq->last_used_idx)
+ goto error;
+
+ desc_current =
+ vq->avail->ring[(vq->last_used_idx) &
+ (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc = &vq->desc[desc_current];
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ goto error;
- desc = &vq->desc[desc->next];
- desc_addr = gpa_to_vva(dev, desc->addr);
- if (unlikely(!desc_addr))
- return -1;
-
- desc_offset = 0;
- desc_avail = desc->len;
+ desc_chain_len = 0;
+ desc_offset = 0;
+ } else
+ goto error;
}
- cpy_len = RTE_MIN(desc_avail, mbuf_avail);
- rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
- rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
- cpy_len);
- vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
- PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
- cpy_len, 0);
-
- mbuf_avail -= cpy_len;
- mbuf_offset += cpy_len;
- desc_avail -= cpy_len;
- desc_offset += cpy_len;
+ /* copy mbuf data */
+ copy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
TBH, I'm okay with copy_len (actually, I prefer it slightly). However,
the old code uses cpy_len, the current dequeue function also uses cpy_len,
I then see no good reason to use copy_len here. It's really not a good
idea to me to use two different naming styles in one source file.
Post by Zhihong Wang
+ rte_memcpy((void *)(uintptr_t)desc_addr,
+ rte_pktmbuf_mtod_offset(mbuf, void *,
+ mbuf_len - mbuf_avail),
I would keep the old var "mbuf_offset" and do not introduce "mbuf_len".
This could avoid above calculation and make it straightforward.
Post by Zhihong Wang
+ copy_len);
+ vhost_log_write(dev, desc->addr + desc_offset, copy_len);
+ PRINT_PACKET(dev, (uintptr_t)desc_addr, copy_len, 0);
+ mbuf_avail -= copy_len;
+ desc_offset += copy_len;
+ desc_addr += copy_len;
+ desc_chain_len += copy_len;
Vertical alignment[0] is not a must, but as you can see, it's a style I
prefer. Meaning, if possible, please follow it.

[0]: https://en.wikipedia.org/wiki/Programming_style#Vertical_alignment
Post by Zhihong Wang
}
- return 0;
-}
+ update_used_ring(dev, vq, desc_chain_head, desc_chain_len);
+ vq->last_used_idx++;
...
Post by Zhihong Wang
- rte_prefetch0(&vq->desc[desc_indexes[0]]);
- for (i = 0; i < count; i++) {
- uint16_t desc_idx = desc_indexes[i];
- int err;
+ return 0;
- err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
- if (unlikely(err)) {
- used_idx = (start_idx + i) & (vq->size - 1);
- vq->used->ring[used_idx].len = dev->vhost_hlen;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
+ /* rollback on any error if last_used_idx update on-the-fly */
+ vq->last_used_idx -= extra_buffers;
- if (i + 1 < count)
- rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
- }
+ return 1;
We normally return -1 (not 1) on error.
Post by Zhihong Wang
+static inline void __attribute__((always_inline))
+notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
rte_smp_wmb();
-
- *(volatile uint16_t *)&vq->used->idx += count;
- vq->last_used_idx += count;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
-
- /* flush used->idx update before we read avail->flags. */
+ vq->used->idx = vq->last_used_idx;
I will not drop the "volatile" cast here silently. You know this kind of
stuff is tricky and would be painful to debug if it causes any issue. Such
removal deserves a patch, as well as some explanations.
Post by Zhihong Wang
+ vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+ sizeof(vq->used->idx));
rte_mb();
-
- /* Kick the guest if necessary. */
if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
&& (vq->callfd >= 0))
eventfd_write(vq->callfd, (eventfd_t)1);
- return count;
}
...
Post by Zhihong Wang
+ /* start enqueuing packets 1 by 1 */
+ avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+ while (pkt_left && avail_idx != vq->last_used_idx) {
+ if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
+ is_mrg_rxbuf))
break;
- }
-
- nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
- pkts[pkt_idx], buf_vec);
- rte_smp_wmb();
-
- *(volatile uint16_t *)&vq->used->idx += nr_used;
- vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
- vq->last_used_idx += nr_used;
- }
-
- if (likely(pkt_idx)) {
- /* flush used->idx update before we read avail->flags. */
- rte_mb();
- /* Kick the guest if necessary. */
- if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
- && (vq->callfd >= 0))
- eventfd_write(vq->callfd, (eventfd_t)1);
+ pkt_idx++;
+ pkt_sent++;
pkt_idx and pkt_sent are duplicates here.

--yliu
Post by Zhihong Wang
+ pkt_left--;
}
- return pkt_idx;
-}
-
-uint16_t
-rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
- struct rte_mbuf **pkts, uint16_t count)
-{
- struct virtio_net *dev = get_device(vid);
-
- if (!dev)
- return 0;
+ /* update used idx and kick the guest if necessary */
+ if (pkt_sent)
+ notify_guest(dev, vq);
- if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
- return virtio_dev_merge_rx(dev, queue_id, pkts, count);
- else
- return virtio_dev_rx(dev, queue_id, pkts, count);
+ return pkt_sent;
}
static void
--
2.7.4
Wang, Zhihong
2016-09-07 05:39:10 UTC
Permalink
-----Original Message-----
Sent: Wednesday, September 7, 2016 1:33 PM
Subject: Re: [PATCH v4 2/6] vhost: rewrite enqueue
Hmmm, yet another email didn't send out successfully. Resend.
BTW, please work out v5 on top of the latest next-virtio tree.
Thanks.
Okay. Thanks.
--yliu
----
Post by Zhihong Wang
This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.
This is the baseline version of the new code, more optimization will be
added in the following patches in this patch set.
---
1. Refactor the code for clearer logic.
2. Add PRINT_PACKET for debugging.
---
1. Rewrite enqueue and delete the obsolete in the same patch.
Change log should go ---->
Post by Zhihong Wang
---
... here, after the SoB.
Post by Zhihong Wang
lib/librte_vhost/vhost_rxtx.c | 525 ++++++++++++-----------------------------
-
Post by Zhihong Wang
1 file changed, 145 insertions(+), 380 deletions(-)
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 5806f99..629e8ae 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx,
uint32_t qp_nb)
Post by Zhihong Wang
return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
}
-static void
+static inline void __attribute__((always_inline))
virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr
*net_hdr)
Post by Zhihong Wang
{
if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -112,6 +112,10 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf,
struct virtio_net_hdr *net_hdr)
Post by Zhihong Wang
cksum));
break;
}
+ } else {
+ net_hdr->flags = 0;
+ net_hdr->csum_start = 0;
+ net_hdr->csum_offset = 0;
}
if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
@@ -122,437 +126,198 @@ virtio_enqueue_offload(struct rte_mbuf
*m_buf, struct virtio_net_hdr *net_hdr)
Post by Zhihong Wang
net_hdr->gso_size = m_buf->tso_segsz;
net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
+ m_buf->l4_len;
+ } else {
+ net_hdr->gso_type = 0;
+ net_hdr->hdr_len = 0;
+ net_hdr->gso_size = 0;
}
}
-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
- struct virtio_net_hdr_mrg_rxbuf hdr)
+static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t desc_chain_head, uint32_t desc_chain_len)
{
- if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
- *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr =
hdr;
Post by Zhihong Wang
- else
- *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+ uint32_t used_idx_round = vq->last_used_idx & (vq->size - 1);
I'd suggest using "used_idx" instead of "used_idx_round".
Post by Zhihong Wang
+
+ vq->used->ring[used_idx_round].id = desc_chain_head;
+ vq->used->ring[used_idx_round].len = desc_chain_len;
+ vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
+ ring[used_idx_round]),
+ sizeof(vq->used->ring[used_idx_round]));
}
-static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
- struct rte_mbuf *m, uint16_t desc_idx)
+static inline uint32_t __attribute__((always_inline))
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint16_t avail_idx, struct rte_mbuf *mbuf,
+ uint32_t is_mrg_rxbuf)
{
- uint32_t desc_avail, desc_offset;
- uint32_t mbuf_avail, mbuf_offset;
- uint32_t cpy_len;
+ struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
struct vring_desc *desc;
uint64_t desc_addr;
- struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-
- desc = &vq->desc[desc_idx];
+ uint32_t desc_chain_head;
+ uint32_t desc_chain_len;
+ uint32_t desc_current;
+ uint32_t desc_offset;
+ uint32_t mbuf_len;
+ uint32_t mbuf_avail;
+ uint32_t copy_len;
+ uint32_t extra_buffers = 0;
I'd name it "num_buffers", to keep consistent with the virtio hdr
naming style.
Post by Zhihong Wang
+
+ /* start with the first mbuf of the packet */
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_avail = mbuf_len;
+
+ /* get the current desc */
+ desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc = &vq->desc[desc_current];
desc_addr = gpa_to_vva(dev, desc->addr);
- /*
- * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
- * performance issue with some versions of gcc (4.8.4 and 5.3.0)
which
Post by Zhihong Wang
- * otherwise stores offset on the stack instead of in a register.
- */
- if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
- return -1;
+ if (unlikely(!desc_addr))
+ goto error;
- rte_prefetch0((void *)(uintptr_t)desc_addr);
+ /* handle virtio header */
+ virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf
*)(uintptr_t)desc_addr;
Post by Zhihong Wang
+ virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+ if (is_mrg_rxbuf)
+ virtio_hdr->num_buffers = extra_buffers + 1;
- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
desc_offset = dev->vhost_hlen;
- desc_avail = desc->len - dev->vhost_hlen;
-
- mbuf_avail = rte_pktmbuf_data_len(m);
- mbuf_offset = 0;
- while (mbuf_avail != 0 || m->next != NULL) {
- /* done with current mbuf, fetch next */
- if (mbuf_avail == 0) {
- m = m->next;
-
- mbuf_offset = 0;
- mbuf_avail = rte_pktmbuf_data_len(m);
+ desc_chain_len = desc_offset;
+ desc_addr += desc_offset;
+
+ /* start copy from mbuf to desc */
+ while (mbuf_avail || mbuf->next) {
+ /* get the next mbuf if the current done */
+ if (!mbuf_avail) {
+ mbuf = mbuf->next;
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_avail = mbuf_len;
}
- /* done with current desc buf, fetch next */
- if (desc_avail == 0) {
- if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
- /* Room in vring buffer is not enough */
- return -1;
- }
- if (unlikely(desc->next >= vq->size))
- return -1;
+ /* get the next desc if the current done */
+ if (desc->len <= desc_offset) {
+ if (desc->flags & VRING_DESC_F_NEXT) {
+ /* go on with the current desc chain */
+ desc_offset = 0;
+ desc_current = desc->next;
+ desc = &vq->desc[desc_current];
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ goto error;
+ } else if (is_mrg_rxbuf) {
+ /* start with the next desc chain */
+ update_used_ring(dev, vq,
desc_chain_head,
Post by Zhihong Wang
+ desc_chain_len);
+ vq->last_used_idx++;
Why not put "vq->last_used_idx++" into update_used_ring()?
Post by Zhihong Wang
+ extra_buffers++;
+ virtio_hdr->num_buffers++;
+ if (avail_idx == vq->last_used_idx)
+ goto error;
+
+ desc_current =
+ vq->avail->ring[(vq->last_used_idx)
&
Post by Zhihong Wang
+ (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc = &vq->desc[desc_current];
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ goto error;
- desc = &vq->desc[desc->next];
- desc_addr = gpa_to_vva(dev, desc->addr);
- if (unlikely(!desc_addr))
- return -1;
-
- desc_offset = 0;
- desc_avail = desc->len;
+ desc_chain_len = 0;
+ desc_offset = 0;
+ } else
+ goto error;
}
- cpy_len = RTE_MIN(desc_avail, mbuf_avail);
- rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
- rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
- cpy_len);
- vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
- PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
- cpy_len, 0);
-
- mbuf_avail -= cpy_len;
- mbuf_offset += cpy_len;
- desc_avail -= cpy_len;
- desc_offset += cpy_len;
+ /* copy mbuf data */
+ copy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
TBH, I'm okay with copy_len (actually, I prefer it slightly). However,
the old code uses cpy_len, the current dequeue function also uses cpy_len,
I then see no good reason to use copy_len here. It's really not a good
idea to me to use two different naming styles in one source file.
Post by Zhihong Wang
+ rte_memcpy((void *)(uintptr_t)desc_addr,
+ rte_pktmbuf_mtod_offset(mbuf, void *,
+ mbuf_len - mbuf_avail),
I would keep the old var "mbuf_offset" and do not introduce "mbuf_len".
This could avoid above calculation and make it straightforward.
Post by Zhihong Wang
+ copy_len);
+ vhost_log_write(dev, desc->addr + desc_offset, copy_len);
+ PRINT_PACKET(dev, (uintptr_t)desc_addr, copy_len, 0);
+ mbuf_avail -= copy_len;
+ desc_offset += copy_len;
+ desc_addr += copy_len;
+ desc_chain_len += copy_len;
Vertical alignment[0] is not a must, but as you can see, it's a style I
prefer. Meaning, if possible, please follow it.
[0]: https://en.wikipedia.org/wiki/Programming_style#Vertical_alignment
Post by Zhihong Wang
}
- return 0;
-}
+ update_used_ring(dev, vq, desc_chain_head, desc_chain_len);
+ vq->last_used_idx++;
...
Post by Zhihong Wang
- rte_prefetch0(&vq->desc[desc_indexes[0]]);
- for (i = 0; i < count; i++) {
- uint16_t desc_idx = desc_indexes[i];
- int err;
+ return 0;
- err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
- if (unlikely(err)) {
- used_idx = (start_idx + i) & (vq->size - 1);
- vq->used->ring[used_idx].len = dev->vhost_hlen;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
+ /* rollback on any error if last_used_idx update on-the-fly */
+ vq->last_used_idx -= extra_buffers;
- if (i + 1 < count)
- rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
- }
+ return 1;
We normally return -1 (not 1) on error.
Post by Zhihong Wang
+static inline void __attribute__((always_inline))
+notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
rte_smp_wmb();
-
- *(volatile uint16_t *)&vq->used->idx += count;
- vq->last_used_idx += count;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
-
- /* flush used->idx update before we read avail->flags. */
+ vq->used->idx = vq->last_used_idx;
I will not drop the "volatile" cast here silently. You know this kind of
stuff is tricky and would be painful to debug if it causes any issue. Such
removal deserves a patch, as well as some explanations.
Post by Zhihong Wang
+ vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+ sizeof(vq->used->idx));
rte_mb();
-
- /* Kick the guest if necessary. */
if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
&& (vq->callfd >= 0))
eventfd_write(vq->callfd, (eventfd_t)1);
- return count;
}
...
Post by Zhihong Wang
+ /* start enqueuing packets 1 by 1 */
+ avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+ while (pkt_left && avail_idx != vq->last_used_idx) {
+ if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
+ is_mrg_rxbuf))
break;
- }
-
- nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
- pkts[pkt_idx], buf_vec);
- rte_smp_wmb();
-
- *(volatile uint16_t *)&vq->used->idx += nr_used;
- vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
idx),
Post by Zhihong Wang
- sizeof(vq->used->idx));
- vq->last_used_idx += nr_used;
- }
-
- if (likely(pkt_idx)) {
- /* flush used->idx update before we read avail->flags. */
- rte_mb();
- /* Kick the guest if necessary. */
- if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
- && (vq->callfd >= 0))
- eventfd_write(vq->callfd, (eventfd_t)1);
+ pkt_idx++;
+ pkt_sent++;
pkt_idx and pkt_sent are duplicates here.
--yliu
Post by Zhihong Wang
+ pkt_left--;
}
- return pkt_idx;
-}
-
-uint16_t
-rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
- struct rte_mbuf **pkts, uint16_t count)
-{
- struct virtio_net *dev = get_device(vid);
-
- if (!dev)
- return 0;
+ /* update used idx and kick the guest if necessary */
+ if (pkt_sent)
+ notify_guest(dev, vq);
- if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
- return virtio_dev_merge_rx(dev, queue_id, pkts, count);
- else
- return virtio_dev_rx(dev, queue_id, pkts, count);
+ return pkt_sent;
}
static void
--
2.7.4
Zhihong Wang
2016-08-30 03:36:01 UTC
Permalink
This patch removes useless volatile attribute to allow compiler
optimization.

Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/vhost-net.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 38593a2..51fdf3d 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -71,7 +71,7 @@ struct vhost_virtqueue {
uint32_t size;

/* Last index used on the available ring */
- volatile uint16_t last_used_idx;
+ uint16_t last_used_idx;
#define VIRTIO_INVALID_EVENTFD (-1)
#define VIRTIO_UNINITIALIZED_EVENTFD (-2)
--
2.7.4
Zhihong Wang
2016-08-30 03:36:02 UTC
Permalink
This patch adds descriptor prefetch to hide cache access latency.

Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/vhost_rxtx.c | 6 ++++++
1 file changed, 6 insertions(+)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 629e8ae..927896c 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -304,6 +304,12 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
/* start enqueuing packets 1 by 1 */
avail_idx = *((volatile uint16_t *)&vq->avail->idx);
while (pkt_left && avail_idx != vq->last_used_idx) {
+ /* prefetch the next desc */
+ if (pkt_left > 1 && avail_idx != vq->last_used_idx + 1)
+ rte_prefetch0(&vq->desc[vq->avail->ring[
+ (vq->last_used_idx + 1) &
+ (vq->size - 1)]]);
+
if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
is_mrg_rxbuf))
break;
--
2.7.4
Zhihong Wang
2016-08-30 03:36:03 UTC
Permalink
This patch enables batch update of the used ring for better efficiency.

---
Changes in v4:

1. Free shadow used ring in the right place.

2. Add failure check for shadow used ring malloc.

Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/vhost-net.h | 4 +++
lib/librte_vhost/vhost_rxtx.c | 62 ++++++++++++++++++++++++++++++++++++-------
lib/librte_vhost/virtio-net.c | 42 ++++++++++++++++++++++++++---
3 files changed, 95 insertions(+), 13 deletions(-)

diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index 51fdf3d..a15182c 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -85,6 +85,10 @@ struct vhost_virtqueue {

/* Physical address of used ring, for logging */
uint64_t log_guest_addr;
+
+ /* Shadow used ring for performance */
+ struct vring_used_elem *shadow_used_ring;
+ uint32_t shadow_used_idx;
} __rte_cache_aligned;

/* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 927896c..ddc7b21 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -134,16 +134,51 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
}

static inline void __attribute__((always_inline))
-update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
- uint32_t desc_chain_head, uint32_t desc_chain_len)
+update_used_ring(struct vhost_virtqueue *vq, uint32_t desc_chain_head,
+ uint32_t desc_chain_len)
{
- uint32_t used_idx_round = vq->last_used_idx & (vq->size - 1);
+ vq->shadow_used_ring[vq->shadow_used_idx].id = desc_chain_head;
+ vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
+ vq->shadow_used_idx++;
+}
+
+static inline void __attribute__((always_inline))
+flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t used_idx_start)
+{
+ if (used_idx_start + vq->shadow_used_idx < vq->size) {
+ rte_memcpy(&vq->used->ring[used_idx_start],
+ &vq->shadow_used_ring[0],
+ vq->shadow_used_idx *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_start]),
+ vq->shadow_used_idx *
+ sizeof(struct vring_used_elem));
+ } else {
+ uint32_t part_1 = vq->size - used_idx_start;
+ uint32_t part_2 = vq->shadow_used_idx - part_1;

- vq->used->ring[used_idx_round].id = desc_chain_head;
- vq->used->ring[used_idx_round].len = desc_chain_len;
- vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
- ring[used_idx_round]),
- sizeof(vq->used->ring[used_idx_round]));
+ rte_memcpy(&vq->used->ring[used_idx_start],
+ &vq->shadow_used_ring[0],
+ part_1 *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_start]),
+ part_1 *
+ sizeof(struct vring_used_elem));
+ rte_memcpy(&vq->used->ring[0],
+ &vq->shadow_used_ring[part_1],
+ part_2 *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[0]),
+ part_2 *
+ sizeof(struct vring_used_elem));
+ }
}

static inline uint32_t __attribute__((always_inline))
@@ -208,7 +243,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
goto error;
} else if (is_mrg_rxbuf) {
/* start with the next desc chain */
- update_used_ring(dev, vq, desc_chain_head,
+ update_used_ring(vq, desc_chain_head,
desc_chain_len);
vq->last_used_idx++;
extra_buffers++;
@@ -245,7 +280,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
desc_chain_len += copy_len;
}

- update_used_ring(dev, vq, desc_chain_head, desc_chain_len);
+ update_used_ring(vq, desc_chain_head, desc_chain_len);
vq->last_used_idx++;

return 0;
@@ -276,6 +311,7 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
{
struct vhost_virtqueue *vq;
struct virtio_net *dev;
+ uint32_t used_idx_start;
uint32_t pkt_left = count;
uint32_t pkt_idx = 0;
uint32_t pkt_sent = 0;
@@ -302,6 +338,8 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
is_mrg_rxbuf = 1;

/* start enqueuing packets 1 by 1 */
+ vq->shadow_used_idx = 0;
+ used_idx_start = vq->last_used_idx & (vq->size - 1);
avail_idx = *((volatile uint16_t *)&vq->avail->idx);
while (pkt_left && avail_idx != vq->last_used_idx) {
/* prefetch the next desc */
@@ -319,6 +357,10 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
pkt_left--;
}

+ /* batch update used ring for better performance */
+ if (likely(vq->shadow_used_idx > 0))
+ flush_used_ring(dev, vq, used_idx_start);
+
/* update used idx and kick the guest if necessary */
if (pkt_sent)
notify_guest(dev, vq);
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
index 1785695..7416079 100644
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -152,10 +152,26 @@ cleanup_device(struct virtio_net *dev, int destroy)
static void
free_device(struct virtio_net *dev)
{
+ struct vhost_virtqueue *vq_0;
+ struct vhost_virtqueue *vq_1;
uint32_t i;

- for (i = 0; i < dev->virt_qp_nb; i++)
- rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+ for (i = 0; i < dev->virt_qp_nb; i++) {
+ vq_0 = dev->virtqueue[i * VIRTIO_QNUM];
+ if (vq_0->shadow_used_ring) {
+ rte_free(vq_0->shadow_used_ring);
+ vq_0->shadow_used_ring = NULL;
+ }
+
+ vq_1 = dev->virtqueue[i * VIRTIO_QNUM + 1];
+ if (vq_1->shadow_used_ring) {
+ rte_free(vq_1->shadow_used_ring);
+ vq_1->shadow_used_ring = NULL;
+ }
+
+ /* malloc together, free together */
+ rte_free(vq_0);
+ }

rte_free(dev);
}
@@ -418,13 +434,26 @@ int
vhost_set_vring_num(int vid, struct vhost_vring_state *state)
{
struct virtio_net *dev;
+ struct vhost_virtqueue *vq;

dev = get_device(vid);
if (dev == NULL)
return -1;

/* State->index refers to the queue index. The txq is 1, rxq is 0. */
- dev->virtqueue[state->index]->size = state->num;
+ vq = dev->virtqueue[state->index];
+ vq->size = state->num;
+ if (!vq->shadow_used_ring) {
+ vq->shadow_used_ring = rte_malloc(NULL,
+ vq->size * sizeof(struct vring_used_elem),
+ RTE_CACHE_LINE_SIZE);
+ if (!vq->shadow_used_ring) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to allocate memory"
+ " for shadow used ring.\n");
+ return -1;
+ }
+ }

return 0;
}
@@ -610,6 +639,7 @@ int
vhost_get_vring_base(int vid, uint32_t index,
struct vhost_vring_state *state)
{
+ struct vhost_virtqueue *vq;
struct virtio_net *dev;

dev = get_device(vid);
@@ -617,6 +647,12 @@ vhost_get_vring_base(int vid, uint32_t index,
return -1;

state->index = index;
+ vq = dev->virtqueue[state->index];
+ if (vq->shadow_used_ring) {
+ rte_free(vq->shadow_used_ring);
+ vq->shadow_used_ring = NULL;
+ }
+
/* State->index refers to the queue index. The txq is 1, rxq is 0. */
state->num = dev->virtqueue[state->index]->last_used_idx;
--
2.7.4
Zhihong Wang
2016-08-30 03:36:04 UTC
Permalink
This patch reorders the code to delay virtio header write to optimize cache
access efficiency for cases where the mrg_rxbuf feature is turned on. It
reduces CPU pipeline stall cycles significantly.

---
Changes in v3:

1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

2. Rename variables to follow naming convention.

Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/vhost_rxtx.c | 20 ++++++++++++++------
1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index ddc7b21..fc5dc4a 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -196,6 +196,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint32_t mbuf_len;
uint32_t mbuf_avail;
uint32_t copy_len;
+ uint32_t copy_virtio_hdr;
uint32_t extra_buffers = 0;

/* start with the first mbuf of the packet */
@@ -210,12 +211,12 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
if (unlikely(!desc_addr))
goto error;

- /* handle virtio header */
+ /*
+ * handle virtio header, the actual write operation is delayed
+ * for cache optimization, to reduce CPU pipeline stall cycles.
+ */
virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
- virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
- if (is_mrg_rxbuf)
- virtio_hdr->num_buffers = extra_buffers + 1;
-
+ copy_virtio_hdr = 1;
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
desc_offset = dev->vhost_hlen;
@@ -266,8 +267,15 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
goto error;
}

- /* copy mbuf data */
+ /* copy virtio header and mbuf data */
copy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+ if (copy_virtio_hdr) {
+ copy_virtio_hdr = 0;
+ virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+ if (is_mrg_rxbuf)
+ virtio_hdr->num_buffers = extra_buffers + 1;
+ }
+
rte_memcpy((void *)(uintptr_t)desc_addr,
rte_pktmbuf_mtod_offset(mbuf, void *,
mbuf_len - mbuf_avail),
--
2.7.4
Zhihong Wang
2016-09-09 03:39:22 UTC
Permalink
This patch set optimizes the vhost enqueue function.

It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
significantly by optimizing cache access, which means:

* Higher maximum throughput can be achieved for fast frontends like DPDK
virtio pmd.

* Better scalability can be achieved in that each vhost core can support
more connections, because it takes fewer cycles to handle each single
frontend.

This patch set contains:

1. A Windows VM compatibility fix for vhost enqueue in 16.07 release.

2. A baseline patch to rewrite the vhost logic.

3. A series of optimization patches added upon the baseline.

The main optimization techniques are:

1. Reorder code to reduce CPU pipeline stall cycles.

2. Batch update the used ring for better efficiency.

3. Prefetch descriptor to hide cache latency.

4. Remove useless volatile attribute to allow compiler optimization.

Code reordering and batch used ring update bring most of the performance
improvements.

In the existing code there're 2 callbacks for vhost enqueue:

* virtio_dev_merge_rx for mrg_rxbuf turned on cases.

* virtio_dev_rx for mrg_rxbuf turned off cases.

The performance of the existing code is not optimal, especially when the
mrg_rxbuf feature is turned on. Besides, having 2 callback paths increases
maintenance effort.

Also, there's a compatibility issue in the existing code which causes a
Windows VM to hang when the mrg_rxbuf feature is turned on.

---
Changes in v5:

1. Rebase to the latest branch.

2. Rename variables to keep consistent in naming style.

3. Small changes like return value adjustment and vertical alignment.

4. Add details in commit log.

---
Changes in v4:

1. Fix a Windows VM compatibility issue.

2. Free shadow used ring in the right place.

3. Add failure check for shadow used ring malloc.

4. Refactor the code for clearer logic.

5. Add PRINT_PACKET for debugging.

---
Changes in v3:

1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

2. Rename variables to follow naming convention.

3. Rewrite enqueue and delete the obsolete in the same patch.

---
Changes in v2:

1. Split the big function into several small ones.

2. Use multiple patches to explain each optimization.

3. Add comments.

Zhihong Wang (6):
vhost: fix windows vm hang
vhost: rewrite enqueue
vhost: remove useless volatile
vhost: add desc prefetch
vhost: batch update used ring
vhost: optimize cache access

lib/librte_vhost/vhost.c | 20 +-
lib/librte_vhost/vhost.h | 6 +-
lib/librte_vhost/vhost_user.c | 31 ++-
lib/librte_vhost/virtio_net.c | 561 +++++++++++++++---------------------------
4 files changed, 242 insertions(+), 376 deletions(-)
--
2.7.4
Zhihong Wang
2016-09-09 03:39:23 UTC
Permalink
This patch fixes a Windows VM compatibility issue in the DPDK 16.07 vhost
code, which causes the guest to hang once any packets are enqueued when
mrg_rxbuf is turned on. The fix is to set the right id and len in the used
ring.

As defined in virtio spec 0.95 and 1.0, in each used ring element, id means
the index of the start of the used descriptor chain, and len means the total
length of the descriptor chain which was written to. In the 16.07 code,
however, the index of the last descriptor is assigned to id, and the length
of the last descriptor is assigned to len.

How to test?

1. Start testpmd in the host with a vhost port.

2. Start a Windows VM image with qemu and connect to the vhost port.

3. Start io forwarding with tx_first in host testpmd.

For 16.07 code, the Windows VM will hang once any packets are enqueued.

Cc: <***@dpdk.org>
Signed-off-by: Zhihong Wang <***@intel.com>
---
Changes in v5:

1. Add details in commit log.

lib/librte_vhost/virtio_net.c | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 8a151af..0d6e7d9 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -384,6 +384,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint16_t start_idx = vq->last_used_idx;
uint16_t cur_idx = start_idx;
uint64_t desc_addr;
+ uint32_t desc_chain_head;
+ uint32_t desc_chain_len;
uint32_t mbuf_offset, mbuf_avail;
uint32_t desc_offset, desc_avail;
uint32_t cpy_len;
@@ -412,6 +414,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,

desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
desc_offset = dev->vhost_hlen;
+ desc_chain_head = buf_vec[vec_idx].desc_idx;
+ desc_chain_len = desc_offset;

mbuf_avail = rte_pktmbuf_data_len(m);
mbuf_offset = 0;
@@ -419,19 +423,21 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
/* done with current desc buf, get the next one */
if (desc_avail == 0) {
desc_idx = buf_vec[vec_idx].desc_idx;
+ vec_idx++;

if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
/* Update used ring with desc information */
used_idx = cur_idx++ & (vq->size - 1);
- vq->used->ring[used_idx].id = desc_idx;
- vq->used->ring[used_idx].len = desc_offset;
+ vq->used->ring[used_idx].id = desc_chain_head;
+ vq->used->ring[used_idx].len = desc_chain_len;
vhost_log_used_vring(dev, vq,
offsetof(struct vring_used,
ring[used_idx]),
sizeof(vq->used->ring[used_idx]));
+ desc_chain_head = buf_vec[vec_idx].desc_idx;
+ desc_chain_len = 0;
}

- vec_idx++;
desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
if (unlikely(!desc_addr))
return 0;
@@ -463,11 +469,12 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
mbuf_offset += cpy_len;
desc_avail -= cpy_len;
desc_offset += cpy_len;
+ desc_chain_len += cpy_len;
}

used_idx = cur_idx & (vq->size - 1);
- vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
- vq->used->ring[used_idx].len = desc_offset;
+ vq->used->ring[used_idx].id = desc_chain_head;
+ vq->used->ring[used_idx].len = desc_chain_len;
vhost_log_used_vring(dev, vq,
offsetof(struct vring_used, ring[used_idx]),
sizeof(vq->used->ring[used_idx]));
--
2.7.4
Zhihong Wang
2016-09-09 03:39:24 UTC
Permalink
This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.

This is the baseline version of the new code, more optimization will be
added in the following patches in this patch set.

Signed-off-by: Zhihong Wang <***@intel.com>
---
Changes in v5:

1. Rebase to the latest branch.

2. Rename variables to keep consistent in naming style.

3. Small changes like return value adjustment and vertical alignment.

---
Changes in v4:

1. Refactor the code for clearer logic.

2. Add PRINT_PACKET for debugging.

---
Changes in v3:

1. Rewrite enqueue and delete the obsolete in the same patch.

lib/librte_vhost/virtio_net.c | 514 ++++++++++++------------------------------
1 file changed, 138 insertions(+), 376 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 0d6e7d9..6f63968 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
}

-static void
+static inline void __attribute__((always_inline))
virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
{
if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -112,6 +112,10 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
cksum));
break;
}
+ } else {
+ net_hdr->flags = 0;
+ net_hdr->csum_start = 0;
+ net_hdr->csum_offset = 0;
}

if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
@@ -122,439 +126,197 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
net_hdr->gso_size = m_buf->tso_segsz;
net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
+ m_buf->l4_len;
+ } else {
+ net_hdr->gso_type = 0;
+ net_hdr->hdr_len = 0;
+ net_hdr->gso_size = 0;
}
}

-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
- struct virtio_net_hdr_mrg_rxbuf hdr)
+static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t desc_chain_head, uint32_t desc_chain_len)
{
- if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
- *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
- else
- *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+ uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
+
+ vq->used->ring[used_idx].id = desc_chain_head;
+ vq->used->ring[used_idx].len = desc_chain_len;
+ vq->last_used_idx++;
+ vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
+ ring[used_idx]),
+ sizeof(vq->used->ring[used_idx]));
}

static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
- struct rte_mbuf *m, uint16_t desc_idx)
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint16_t avail_idx, struct rte_mbuf *mbuf,
+ uint32_t is_mrg_rxbuf)
{
- uint32_t desc_avail, desc_offset;
- uint32_t mbuf_avail, mbuf_offset;
- uint32_t cpy_len;
+ struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
struct vring_desc *desc;
uint64_t desc_addr;
- struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
+ uint32_t desc_chain_head;
+ uint32_t desc_chain_len;
+ uint32_t desc_current;
+ uint32_t desc_offset;
+ uint32_t mbuf_len;
+ uint32_t mbuf_avail;
+ uint32_t cpy_len;
+ uint32_t num_buffers = 0;

- desc = &vq->desc[desc_idx];
+ /* start with the first mbuf of the packet */
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_avail = mbuf_len;
+
+ /* get the current desc */
+ desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc = &vq->desc[desc_current];
desc_addr = gpa_to_vva(dev, desc->addr);
- /*
- * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
- * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
- * otherwise stores offset on the stack instead of in a register.
- */
- if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
- return -1;
+ if (unlikely(!desc_addr))
+ goto error;

- rte_prefetch0((void *)(uintptr_t)desc_addr);
+ /* handle virtio header */
+ virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
+ virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+ if (is_mrg_rxbuf)
+ virtio_hdr->num_buffers = 1;

- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
desc_offset = dev->vhost_hlen;
- desc_avail = desc->len - dev->vhost_hlen;
-
- mbuf_avail = rte_pktmbuf_data_len(m);
- mbuf_offset = 0;
- while (mbuf_avail != 0 || m->next != NULL) {
- /* done with current mbuf, fetch next */
- if (mbuf_avail == 0) {
- m = m->next;
-
- mbuf_offset = 0;
- mbuf_avail = rte_pktmbuf_data_len(m);
+ desc_chain_len = desc_offset;
+ desc_addr += desc_offset;
+
+ /* start copy from mbuf to desc */
+ while (mbuf_avail || mbuf->next) {
+ /* get the next mbuf if the current done */
+ if (!mbuf_avail) {
+ mbuf = mbuf->next;
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_avail = mbuf_len;
}

- /* done with current desc buf, fetch next */
- if (desc_avail == 0) {
- if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
- /* Room in vring buffer is not enough */
- return -1;
- }
- if (unlikely(desc->next >= vq->size))
- return -1;
+ /* get the next desc if the current done */
+ if (desc->len <= desc_offset) {
+ if (desc->flags & VRING_DESC_F_NEXT) {
+ /* go on with the current desc chain */
+ desc_offset = 0;
+ desc_current = desc->next;
+ desc = &vq->desc[desc_current];
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ goto error;
+ } else if (is_mrg_rxbuf) {
+ /* start with the next desc chain */
+ update_used_ring(dev, vq, desc_chain_head,
+ desc_chain_len);
+ num_buffers++;
+ virtio_hdr->num_buffers++;
+ if (avail_idx == vq->last_used_idx)
+ goto error;
+
+ desc_current =
+ vq->avail->ring[(vq->last_used_idx) &
+ (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc = &vq->desc[desc_current];
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ goto error;

- desc = &vq->desc[desc->next];
- desc_addr = gpa_to_vva(dev, desc->addr);
- if (unlikely(!desc_addr))
- return -1;
-
- desc_offset = 0;
- desc_avail = desc->len;
+ desc_chain_len = 0;
+ desc_offset = 0;
+ } else
+ goto error;
}

- cpy_len = RTE_MIN(desc_avail, mbuf_avail);
- rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
- rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
- cpy_len);
+ /* copy mbuf data */
+ cpy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+ rte_memcpy((void *)(uintptr_t)desc_addr,
+ rte_pktmbuf_mtod_offset(mbuf, void *,
+ mbuf_len - mbuf_avail),
+ cpy_len);
vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
- PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
- cpy_len, 0);
-
- mbuf_avail -= cpy_len;
- mbuf_offset += cpy_len;
- desc_avail -= cpy_len;
- desc_offset += cpy_len;
- }
-
- return 0;
-}
-
-/**
- * This function adds buffers to the virtio devices RX virtqueue. Buffers can
- * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that are succesfully
- * added to the RX queue. This function works when the mbuf is scattered, but
- * it doesn't support the mergeable feature.
- */
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
- struct rte_mbuf **pkts, uint32_t count)
-{
- struct vhost_virtqueue *vq;
- uint16_t avail_idx, free_entries, start_idx;
- uint16_t desc_indexes[MAX_PKT_BURST];
- uint16_t used_idx;
- uint32_t i;
-
- LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
- if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
- RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
- dev->vid, __func__, queue_id);
- return 0;
+ PRINT_PACKET(dev, (uintptr_t)desc_addr, cpy_len, 0);
+ mbuf_avail -= cpy_len;
+ desc_addr += cpy_len;
+ desc_offset += cpy_len;
+ desc_chain_len += cpy_len;
}

- vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
- return 0;
+ update_used_ring(dev, vq, desc_chain_head, desc_chain_len);

- avail_idx = *((volatile uint16_t *)&vq->avail->idx);
- start_idx = vq->last_used_idx;
- free_entries = avail_idx - start_idx;
- count = RTE_MIN(count, free_entries);
- count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
- if (count == 0)
- return 0;
-
- LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
- dev->vid, start_idx, start_idx + count);
-
- /* Retrieve all of the desc indexes first to avoid caching issues. */
- rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
- for (i = 0; i < count; i++) {
- used_idx = (start_idx + i) & (vq->size - 1);
- desc_indexes[i] = vq->avail->ring[used_idx];
- vq->used->ring[used_idx].id = desc_indexes[i];
- vq->used->ring[used_idx].len = pkts[i]->pkt_len +
- dev->vhost_hlen;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
-
- rte_prefetch0(&vq->desc[desc_indexes[0]]);
- for (i = 0; i < count; i++) {
- uint16_t desc_idx = desc_indexes[i];
- int err;
+ return 0;

- err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
- if (unlikely(err)) {
- used_idx = (start_idx + i) & (vq->size - 1);
- vq->used->ring[used_idx].len = dev->vhost_hlen;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
+error:
+ /* rollback on any error if last_used_idx update on-the-fly */
+ vq->last_used_idx -= num_buffers;

- if (i + 1 < count)
- rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
- }
+ return -1;
+}

+static inline void __attribute__((always_inline))
+notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
rte_smp_wmb();
-
- *(volatile uint16_t *)&vq->used->idx += count;
- vq->last_used_idx += count;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
-
- /* flush used->idx update before we read avail->flags. */
+ *(volatile uint16_t *)&vq->used->idx = vq->last_used_idx;
+ vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+ sizeof(vq->used->idx));
rte_mb();
-
- /* Kick the guest if necessary. */
if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
&& (vq->callfd >= 0))
eventfd_write(vq->callfd, (eventfd_t)1);
- return count;
}

-static inline int
-fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
- uint32_t *allocated, uint32_t *vec_idx,
- struct buf_vector *buf_vec)
-{
- uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
- uint32_t vec_id = *vec_idx;
- uint32_t len = *allocated;
-
- while (1) {
- if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
- return -1;
-
- len += vq->desc[idx].len;
- buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
- buf_vec[vec_id].buf_len = vq->desc[idx].len;
- buf_vec[vec_id].desc_idx = idx;
- vec_id++;
-
- if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
- break;
-
- idx = vq->desc[idx].next;
- }
-
- *allocated = len;
- *vec_idx = vec_id;
-
- return 0;
-}
-
-/*
- * Returns -1 on fail, 0 on success
- */
-static inline int
-reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
- uint16_t *end, struct buf_vector *buf_vec)
+uint16_t
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+ struct rte_mbuf **pkts, uint16_t count)
{
- uint16_t cur_idx;
+ struct vhost_virtqueue *vq;
+ struct virtio_net *dev;
+ uint32_t is_mrg_rxbuf = 0;
+ uint32_t pkt_idx = 0;
+ uint32_t pkt_left = count;
uint16_t avail_idx;
- uint32_t allocated = 0;
- uint32_t vec_idx = 0;
- uint16_t tries = 0;
-
- cur_idx = vq->last_used_idx;
-
- while (1) {
- avail_idx = *((volatile uint16_t *)&vq->avail->idx);
- if (unlikely(cur_idx == avail_idx))
- return -1;
-
- if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
- &vec_idx, buf_vec) < 0))
- return -1;
-
- cur_idx++;
- tries++;
-
- if (allocated >= size)
- break;
-
- /*
- * if we tried all available ring items, and still
- * can't get enough buf, it means something abnormal
- * happened.
- */
- if (unlikely(tries >= vq->size))
- return -1;
- }

- *end = cur_idx;
- return 0;
-}
-
-static inline uint32_t __attribute__((always_inline))
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
- uint16_t end_idx, struct rte_mbuf *m,
- struct buf_vector *buf_vec)
-{
- struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
- uint32_t vec_idx = 0;
- uint16_t start_idx = vq->last_used_idx;
- uint16_t cur_idx = start_idx;
- uint64_t desc_addr;
- uint32_t desc_chain_head;
- uint32_t desc_chain_len;
- uint32_t mbuf_offset, mbuf_avail;
- uint32_t desc_offset, desc_avail;
- uint32_t cpy_len;
- uint16_t desc_idx, used_idx;
-
- if (unlikely(m == NULL))
+ if (unlikely(!pkt_left))
return 0;

- LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
- dev->vid, cur_idx, end_idx);
+ pkt_left = RTE_MIN((uint32_t)MAX_PKT_BURST, pkt_left);

- desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
- if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
+ dev = get_device(vid);
+ if (unlikely(!dev))
return 0;

- rte_prefetch0((void *)(uintptr_t)desc_addr);
-
- virtio_hdr.num_buffers = end_idx - start_idx;
- LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
- dev->vid, virtio_hdr.num_buffers);
-
- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
- vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
- PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
- desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
- desc_offset = dev->vhost_hlen;
- desc_chain_head = buf_vec[vec_idx].desc_idx;
- desc_chain_len = desc_offset;
-
- mbuf_avail = rte_pktmbuf_data_len(m);
- mbuf_offset = 0;
- while (mbuf_avail != 0 || m->next != NULL) {
- /* done with current desc buf, get the next one */
- if (desc_avail == 0) {
- desc_idx = buf_vec[vec_idx].desc_idx;
- vec_idx++;
-
- if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
- /* Update used ring with desc information */
- used_idx = cur_idx++ & (vq->size - 1);
- vq->used->ring[used_idx].id = desc_chain_head;
- vq->used->ring[used_idx].len = desc_chain_len;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used,
- ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- desc_chain_head = buf_vec[vec_idx].desc_idx;
- desc_chain_len = 0;
- }
-
- desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
- if (unlikely(!desc_addr))
- return 0;
-
- /* Prefetch buffer address. */
- rte_prefetch0((void *)(uintptr_t)desc_addr);
- desc_offset = 0;
- desc_avail = buf_vec[vec_idx].buf_len;
- }
-
- /* done with current mbuf, get the next one */
- if (mbuf_avail == 0) {
- m = m->next;
-
- mbuf_offset = 0;
- mbuf_avail = rte_pktmbuf_data_len(m);
- }
-
- cpy_len = RTE_MIN(desc_avail, mbuf_avail);
- rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
- rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
- cpy_len);
- vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
- cpy_len);
- PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
- cpy_len, 0);
-
- mbuf_avail -= cpy_len;
- mbuf_offset += cpy_len;
- desc_avail -= cpy_len;
- desc_offset += cpy_len;
- desc_chain_len += cpy_len;
- }
-
- used_idx = cur_idx & (vq->size - 1);
- vq->used->ring[used_idx].id = desc_chain_head;
- vq->used->ring[used_idx].len = desc_chain_len;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
-
- return end_idx - start_idx;
-}
-
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
- struct rte_mbuf **pkts, uint32_t count)
-{
- struct vhost_virtqueue *vq;
- uint32_t pkt_idx = 0, nr_used = 0;
- uint16_t end;
- struct buf_vector buf_vec[BUF_VECTOR_MAX];
-
- LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
- if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
- RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
- dev->vid, __func__, queue_id);
+ if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb)))
return 0;
- }

vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
+ if (unlikely(!vq->enabled))
return 0;

- count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
- if (count == 0)
- return 0;
-
- for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
- uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
+ if (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))
+ is_mrg_rxbuf = 1;

- if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
- &end, buf_vec) < 0)) {
- LOG_DEBUG(VHOST_DATA,
- "(%d) failed to get enough desc from vring\n",
- dev->vid);
+ /* start enqueuing packets 1 by 1 */
+ avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+ while (pkt_left && avail_idx != vq->last_used_idx) {
+ if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
+ is_mrg_rxbuf))
break;
- }
-
- nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
- pkts[pkt_idx], buf_vec);
- rte_smp_wmb();

- *(volatile uint16_t *)&vq->used->idx += nr_used;
- vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
- vq->last_used_idx += nr_used;
+ pkt_idx++;
+ pkt_left--;
}

- if (likely(pkt_idx)) {
- /* flush used->idx update before we read avail->flags. */
- rte_mb();
-
- /* Kick the guest if necessary. */
- if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
- && (vq->callfd >= 0))
- eventfd_write(vq->callfd, (eventfd_t)1);
- }
+ /* update used idx and kick the guest if necessary */
+ if (pkt_idx)
+ notify_guest(dev, vq);

return pkt_idx;
}

-uint16_t
-rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
- struct rte_mbuf **pkts, uint16_t count)
-{
- struct virtio_net *dev = get_device(vid);
-
- if (!dev)
- return 0;
-
- if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
- return virtio_dev_merge_rx(dev, queue_id, pkts, count);
- else
- return virtio_dev_rx(dev, queue_id, pkts, count);
-}
-
static void
parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
{
--
2.7.4
Maxime Coquelin
2016-09-12 15:42:40 UTC
Permalink
Hi,
Post by Zhihong Wang
This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.
This is the baseline version of the new code, more optimization will be
added in the following patches in this patch set.
---
1. Rebase to the latest branch.
2. Rename variables to keep consistent in naming style.
3. Small changes like return value adjustment and vertical alignment.
---
1. Refactor the code for clearer logic.
2. Add PRINT_PACKET for debugging.
---
1. Rewrite enqueue and delete the obsolete in the same patch.
lib/librte_vhost/virtio_net.c | 514 ++++++++++++------------------------------
1 file changed, 138 insertions(+), 376 deletions(-)
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 0d6e7d9..6f63968 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
}
-static void
+static inline void __attribute__((always_inline))
virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
{
if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -112,6 +112,10 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
cksum));
break;
}
+ } else {
+ net_hdr->flags = 0;
+ net_hdr->csum_start = 0;
+ net_hdr->csum_offset = 0;
}
if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
@@ -122,439 +126,197 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
net_hdr->gso_size = m_buf->tso_segsz;
net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
+ m_buf->l4_len;
+ } else {
+ net_hdr->gso_type = 0;
+ net_hdr->hdr_len = 0;
+ net_hdr->gso_size = 0;
}
}
-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
- struct virtio_net_hdr_mrg_rxbuf hdr)
+static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t desc_chain_head, uint32_t desc_chain_len)
{
- if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
- *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
- else
- *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+ uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
+
+ vq->used->ring[used_idx].id = desc_chain_head;
+ vq->used->ring[used_idx].len = desc_chain_len;
+ vq->last_used_idx++;
+ vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
+ ring[used_idx]),
+ sizeof(vq->used->ring[used_idx]));
}
static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
- struct rte_mbuf *m, uint16_t desc_idx)
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint16_t avail_idx, struct rte_mbuf *mbuf,
+ uint32_t is_mrg_rxbuf)
{
- uint32_t desc_avail, desc_offset;
- uint32_t mbuf_avail, mbuf_offset;
- uint32_t cpy_len;
+ struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
struct vring_desc *desc;
uint64_t desc_addr;
- struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
+ uint32_t desc_chain_head;
+ uint32_t desc_chain_len;
+ uint32_t desc_current;
+ uint32_t desc_offset;
+ uint32_t mbuf_len;
+ uint32_t mbuf_avail;
+ uint32_t cpy_len;
+ uint32_t num_buffers = 0;
- desc = &vq->desc[desc_idx];
+ /* start with the first mbuf of the packet */
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_avail = mbuf_len;
+
+ /* get the current desc */
+ desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc = &vq->desc[desc_current];
desc_addr = gpa_to_vva(dev, desc->addr);
- /*
- * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
- * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
- * otherwise stores offset on the stack instead of in a register.
- */
- if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
- return -1;
+ if (unlikely(!desc_addr))
+ goto error;
- rte_prefetch0((void *)(uintptr_t)desc_addr);
+ /* handle virtio header */
+ virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
+ virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+ if (is_mrg_rxbuf)
+ virtio_hdr->num_buffers = 1;
- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
desc_offset = dev->vhost_hlen;
- desc_avail = desc->len - dev->vhost_hlen;
-
- mbuf_avail = rte_pktmbuf_data_len(m);
- mbuf_offset = 0;
- while (mbuf_avail != 0 || m->next != NULL) {
- /* done with current mbuf, fetch next */
- if (mbuf_avail == 0) {
- m = m->next;
-
- mbuf_offset = 0;
- mbuf_avail = rte_pktmbuf_data_len(m);
+ desc_chain_len = desc_offset;
+ desc_addr += desc_offset;
+
+ /* start copy from mbuf to desc */
+ while (mbuf_avail || mbuf->next) {
+ /* get the next mbuf if the current done */
+ if (!mbuf_avail) {
+ mbuf = mbuf->next;
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_avail = mbuf_len;
}
- /* done with current desc buf, fetch next */
- if (desc_avail == 0) {
- if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
- /* Room in vring buffer is not enough */
- return -1;
- }
- if (unlikely(desc->next >= vq->size))
- return -1;
+ /* get the next desc if the current done */
+ if (desc->len <= desc_offset) {
+ if (desc->flags & VRING_DESC_F_NEXT) {
+ /* go on with the current desc chain */
+ desc_offset = 0;
+ desc_current = desc->next;
+ desc = &vq->desc[desc_current];
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ goto error;
+ } else if (is_mrg_rxbuf) {
+ /* start with the next desc chain */
+ update_used_ring(dev, vq, desc_chain_head,
+ desc_chain_len);
+ num_buffers++;
+ virtio_hdr->num_buffers++;
+ if (avail_idx == vq->last_used_idx)
+ goto error;
+
+ desc_current =
+ vq->avail->ring[(vq->last_used_idx) &
+ (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc = &vq->desc[desc_current];
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ goto error;
- desc = &vq->desc[desc->next];
- desc_addr = gpa_to_vva(dev, desc->addr);
- if (unlikely(!desc_addr))
- return -1;
-
- desc_offset = 0;
- desc_avail = desc->len;
+ desc_chain_len = 0;
+ desc_offset = 0;
As I commented on v3, there is code duplication between next flag, and
mrg buf cases:
desc_offset = 0;

and:

desc = &vq->desc[desc_current];
desc_addr = gpa_to_vva(dev, desc->addr);
if (unlikely(!desc_addr))
goto error;

Regards,
Maxime
Wang, Zhihong
2016-09-14 08:20:20 UTC
Permalink
Post by Maxime Coquelin
Post by Zhihong Wang
+ desc_current =
+ vq->avail->ring[(vq->last_used_idx)
&
Post by Zhihong Wang
+ (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc = &vq->desc[desc_current];
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ goto error;
- desc = &vq->desc[desc->next];
- desc_addr = gpa_to_vva(dev, desc->addr);
- if (unlikely(!desc_addr))
- return -1;
-
- desc_offset = 0;
- desc_avail = desc->len;
+ desc_chain_len = 0;
+ desc_offset = 0;
As I commented on v3, there is code duplication between next flag, and
desc_offset = 0;
desc = &vq->desc[desc_current];
desc_addr = gpa_to_vva(dev, desc->addr);
if (unlikely(!desc_addr))
goto error;
Do you mean to add something like:

static inline int __attribute__((always_inline))
get_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint32_t desc_idx, struct vring_desc **desc,
uint64_t *desc_addr)
{
*desc = &vq->desc[desc_idx];
*desc_addr = gpa_to_vva(dev, (*desc)->addr);
if (unlikely(!(*desc_addr)))
return -1;

return 0;
}
Post by Maxime Coquelin
Regards,
Maxime
Maxime Coquelin
2016-09-15 16:35:35 UTC
Permalink
Hi,
Post by Zhihong Wang
Post by Maxime Coquelin
Post by Zhihong Wang
+ desc_current =
+ vq->avail->ring[(vq->last_used_idx)
&
Post by Zhihong Wang
+ (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc = &vq->desc[desc_current];
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ goto error;
- desc = &vq->desc[desc->next];
- desc_addr = gpa_to_vva(dev, desc->addr);
- if (unlikely(!desc_addr))
- return -1;
-
- desc_offset = 0;
- desc_avail = desc->len;
+ desc_chain_len = 0;
+ desc_offset = 0;
As I commented on v3, there is code duplication between next flag, and
desc_offset = 0;
desc = &vq->desc[desc_current];
desc_addr = gpa_to_vva(dev, desc->addr);
if (unlikely(!desc_addr))
goto error;
static inline int __attribute__((always_inline))
get_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint32_t desc_idx, struct vring_desc **desc,
uint64_t *desc_addr)
{
*desc = &vq->desc[desc_idx];
*desc_addr = gpa_to_vva(dev, (*desc)->addr);
if (unlikely(!(*desc_addr)))
return -1;
return 0;
}
I meant, move this code after the if/else.
You can do it in a function if it is done elsewhere in the file.
Maxime Coquelin
2016-09-12 16:26:38 UTC
Permalink
Post by Zhihong Wang
+static inline void __attribute__((always_inline))
+notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
rte_smp_wmb();
-
- *(volatile uint16_t *)&vq->used->idx += count;
- vq->last_used_idx += count;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
-
- /* flush used->idx update before we read avail->flags. */
Please don't remove comments if not justified.
Here the comment is important, as it explains why the barrier is needed.
Post by Zhihong Wang
+ *(volatile uint16_t *)&vq->used->idx = vq->last_used_idx;
+ vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+ sizeof(vq->used->idx));
rte_mb();
-
- /* Kick the guest if necessary. */
if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
&& (vq->callfd >= 0))
eventfd_write(vq->callfd, (eventfd_t)1);
- return count;
}
Wang, Zhihong
2016-09-14 08:22:39 UTC
Permalink
-----Original Message-----
Sent: Tuesday, September 13, 2016 12:27 AM
Subject: Re: [PATCH v5 2/6] vhost: rewrite enqueue
Post by Zhihong Wang
+static inline void __attribute__((always_inline))
+notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
rte_smp_wmb();
-
- *(volatile uint16_t *)&vq->used->idx += count;
- vq->last_used_idx += count;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
-
- /* flush used->idx update before we read avail->flags. */
Please don't remove comments if not justified.
Here the comment is important, as it explains why the barrier is needed.
Okay.
Post by Zhihong Wang
+ *(volatile uint16_t *)&vq->used->idx = vq->last_used_idx;
+ vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+ sizeof(vq->used->idx));
rte_mb();
-
- /* Kick the guest if necessary. */
if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
&& (vq->callfd >= 0))
eventfd_write(vq->callfd, (eventfd_t)1);
- return count;
}
Yuanhan Liu
2016-09-18 14:19:14 UTC
Permalink
Post by Zhihong Wang
This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.
As always, your commit log just states what has been done, but doesn't
tell why such changes have been made. For example, you said "it's designed
for high performance", then you'd better explain why your version would
introduce high performance. You need a reason, as well as some numbers
(percent change) to prove it: it's not that right to keep the numbers
inside: I'm sure people outside intel are also willing and happy to know
those numbers.

For this patch, I think it's more about the maintainability improvement
but not performance: the performance tuning patches are done later
after all.

Another example is, in patch 6, you said "It reduces CPU pipeline stall
cycles significantly", but you didn't say why there is pipeline stall
before and why your patch reduces it.

All those are important things that deserves some explanation. So, I'd
ask you to re-visit all your patches in this set, to think what you
could add to make the commit better and more informative.

Besides that, I think this patchset looks fine to me. I may just need
another time to look it more carefully, then I think I can merge (v6).

BTW, thanks for the great work!

--yliu
Post by Zhihong Wang
This is the baseline version of the new code, more optimization will be
added in the following patches in this patch set.
---
Wang, Zhihong
2016-09-19 03:29:58 UTC
Permalink
-----Original Message-----
Sent: Sunday, September 18, 2016 10:19 PM
Subject: Re: [PATCH v5 2/6] vhost: rewrite enqueue
Post by Zhihong Wang
This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.
As always, your commit log just states what has been done, but doesn't
tell why such changes have been made. For example, you said "it's designed
for high performance", then you'd better explain why your version would
introduce high performance. You need a reason, as well as some numbers
(percent change) to prove it: it's not that right to keep the numbers
inside: I'm sure people outside intel are also willing and happy to know
those numbers.
For this patch, I think it's more about the maintainability improvement
but not performance: the performance tuning patches are done later
after all.
Another example is, in patch 6, you said "It reduces CPU pipeline stall
cycles significantly", but you didn't say why there is pipeline stall
before and why your patch reduces it.
All those are important things that deserves some explanation. So, I'd
ask you to re-visit all your patches in this set, to think what you
could add to make the commit better and more informative.
Okay. I'll add more detailed commit log.
Besides that, I think this patchset looks fine to me. I may just need
another time to look it more carefully, then I think I can merge (v6).
BTW, thanks for the great work!
--yliu
Post by Zhihong Wang
This is the baseline version of the new code, more optimization will be
added in the following patches in this patch set.
---
Zhihong Wang
2016-09-09 03:39:25 UTC
Permalink
This patch removes useless volatile attribute to allow compiler
optimization.

Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/vhost.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index c2dfc3c..9707dfc 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -71,7 +71,7 @@ struct vhost_virtqueue {
uint32_t size;

/* Last index used on the available ring */
- volatile uint16_t last_used_idx;
+ uint16_t last_used_idx;
#define VIRTIO_INVALID_EVENTFD (-1)
#define VIRTIO_UNINITIALIZED_EVENTFD (-2)
--
2.7.4
Zhihong Wang
2016-09-09 03:39:26 UTC
Permalink
This patch adds descriptor prefetch to hide cache access latency.

Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/virtio_net.c | 6 ++++++
1 file changed, 6 insertions(+)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 6f63968..b38f18f 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -302,6 +302,12 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
/* start enqueuing packets 1 by 1 */
avail_idx = *((volatile uint16_t *)&vq->avail->idx);
while (pkt_left && avail_idx != vq->last_used_idx) {
+ /* prefetch the next desc */
+ if (pkt_left > 1 && avail_idx != vq->last_used_idx + 1)
+ rte_prefetch0(&vq->desc[vq->avail->ring[
+ (vq->last_used_idx + 1) &
+ (vq->size - 1)]]);
+
if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
is_mrg_rxbuf))
break;
--
2.7.4
Zhihong Wang
2016-09-09 03:39:27 UTC
Permalink
This patch enables batch update of the used ring for better efficiency.

Signed-off-by: Zhihong Wang <***@intel.com>
---
Changes in v4:

1. Free shadow used ring in the right place.

2. Add failure check for shadow used ring malloc.

lib/librte_vhost/vhost.c | 20 ++++++++++++--
lib/librte_vhost/vhost.h | 4 +++
lib/librte_vhost/vhost_user.c | 31 +++++++++++++++++----
lib/librte_vhost/virtio_net.c | 64 +++++++++++++++++++++++++++++++++++--------
4 files changed, 101 insertions(+), 18 deletions(-)

diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 46095c3..cb31cdd 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -119,10 +119,26 @@ cleanup_device(struct virtio_net *dev, int destroy)
static void
free_device(struct virtio_net *dev)
{
+ struct vhost_virtqueue *vq_0;
+ struct vhost_virtqueue *vq_1;
uint32_t i;

- for (i = 0; i < dev->virt_qp_nb; i++)
- rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+ for (i = 0; i < dev->virt_qp_nb; i++) {
+ vq_0 = dev->virtqueue[i * VIRTIO_QNUM];
+ if (vq_0->shadow_used_ring) {
+ rte_free(vq_0->shadow_used_ring);
+ vq_0->shadow_used_ring = NULL;
+ }
+
+ vq_1 = dev->virtqueue[i * VIRTIO_QNUM + 1];
+ if (vq_1->shadow_used_ring) {
+ rte_free(vq_1->shadow_used_ring);
+ vq_1->shadow_used_ring = NULL;
+ }
+
+ /* malloc together, free together */
+ rte_free(vq_0);
+ }

rte_free(dev);
}
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 9707dfc..381dc27 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -85,6 +85,10 @@ struct vhost_virtqueue {

/* Physical address of used ring, for logging */
uint64_t log_guest_addr;
+
+ /* Shadow used ring for performance */
+ struct vring_used_elem *shadow_used_ring;
+ uint32_t shadow_used_idx;
} __rte_cache_aligned;

/* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index eee99e9..d7cf1ed 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -193,7 +193,21 @@ static int
vhost_user_set_vring_num(struct virtio_net *dev,
struct vhost_vring_state *state)
{
- dev->virtqueue[state->index]->size = state->num;
+ struct vhost_virtqueue *vq;
+
+ vq = dev->virtqueue[state->index];
+ vq->size = state->num;
+ if (!vq->shadow_used_ring) {
+ vq->shadow_used_ring = rte_malloc(NULL,
+ vq->size * sizeof(struct vring_used_elem),
+ RTE_CACHE_LINE_SIZE);
+ if (!vq->shadow_used_ring) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to allocate memory"
+ " for shadow used ring.\n");
+ return -1;
+ }
+ }

return 0;
}
@@ -611,14 +625,21 @@ static int
vhost_user_get_vring_base(struct virtio_net *dev,
struct vhost_vring_state *state)
{
+ struct vhost_virtqueue *vq;
+
/* We have to stop the queue (virtio) if it is running. */
if (dev->flags & VIRTIO_DEV_RUNNING) {
dev->flags &= ~VIRTIO_DEV_RUNNING;
notify_ops->destroy_device(dev->vid);
}

+ vq = dev->virtqueue[state->index];
/* Here we are safe to get the last used index */
- state->num = dev->virtqueue[state->index]->last_used_idx;
+ state->num = vq->last_used_idx;
+ if (vq->shadow_used_ring) {
+ rte_free(vq->shadow_used_ring);
+ vq->shadow_used_ring = NULL;
+ }

RTE_LOG(INFO, VHOST_CONFIG,
"vring base idx:%d file:%d\n", state->index, state->num);
@@ -627,10 +648,10 @@ vhost_user_get_vring_base(struct virtio_net *dev,
* sent and only sent in vhost_vring_stop.
* TODO: cleanup the vring, it isn't usable since here.
*/
- if (dev->virtqueue[state->index]->kickfd >= 0)
- close(dev->virtqueue[state->index]->kickfd);
+ if (vq->kickfd >= 0)
+ close(vq->kickfd);

- dev->virtqueue[state->index]->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+ vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;

return 0;
}
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index b38f18f..e9f6353 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -134,17 +134,52 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
}

static inline void __attribute__((always_inline))
-update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
- uint32_t desc_chain_head, uint32_t desc_chain_len)
+update_used_ring(struct vhost_virtqueue *vq, uint32_t desc_chain_head,
+ uint32_t desc_chain_len)
{
- uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
-
- vq->used->ring[used_idx].id = desc_chain_head;
- vq->used->ring[used_idx].len = desc_chain_len;
+ vq->shadow_used_ring[vq->shadow_used_idx].id = desc_chain_head;
+ vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
+ vq->shadow_used_idx++;
vq->last_used_idx++;
- vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
- ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
+}
+
+static inline void __attribute__((always_inline))
+flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t used_idx_start)
+{
+ if (used_idx_start + vq->shadow_used_idx < vq->size) {
+ rte_memcpy(&vq->used->ring[used_idx_start],
+ &vq->shadow_used_ring[0],
+ vq->shadow_used_idx *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_start]),
+ vq->shadow_used_idx *
+ sizeof(struct vring_used_elem));
+ } else {
+ uint32_t part_1 = vq->size - used_idx_start;
+ uint32_t part_2 = vq->shadow_used_idx - part_1;
+
+ rte_memcpy(&vq->used->ring[used_idx_start],
+ &vq->shadow_used_ring[0],
+ part_1 *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_start]),
+ part_1 *
+ sizeof(struct vring_used_elem));
+ rte_memcpy(&vq->used->ring[0],
+ &vq->shadow_used_ring[part_1],
+ part_2 *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[0]),
+ part_2 *
+ sizeof(struct vring_used_elem));
+ }
}

static inline int __attribute__((always_inline))
@@ -209,7 +244,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
goto error;
} else if (is_mrg_rxbuf) {
/* start with the next desc chain */
- update_used_ring(dev, vq, desc_chain_head,
+ update_used_ring(vq, desc_chain_head,
desc_chain_len);
num_buffers++;
virtio_hdr->num_buffers++;
@@ -245,7 +280,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
desc_chain_len += cpy_len;
}

- update_used_ring(dev, vq, desc_chain_head, desc_chain_len);
+ update_used_ring(vq, desc_chain_head, desc_chain_len);

return 0;

@@ -275,6 +310,7 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
{
struct vhost_virtqueue *vq;
struct virtio_net *dev;
+ uint32_t used_idx_start;
uint32_t is_mrg_rxbuf = 0;
uint32_t pkt_idx = 0;
uint32_t pkt_left = count;
@@ -300,6 +336,8 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
is_mrg_rxbuf = 1;

/* start enqueuing packets 1 by 1 */
+ vq->shadow_used_idx = 0;
+ used_idx_start = vq->last_used_idx & (vq->size - 1);
avail_idx = *((volatile uint16_t *)&vq->avail->idx);
while (pkt_left && avail_idx != vq->last_used_idx) {
/* prefetch the next desc */
@@ -316,6 +354,10 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
pkt_left--;
}

+ /* batch update used ring for better performance */
+ if (likely(vq->shadow_used_idx > 0))
+ flush_used_ring(dev, vq, used_idx_start);
+
/* update used idx and kick the guest if necessary */
if (pkt_idx)
notify_guest(dev, vq);
--
2.7.4
Maxime Coquelin
2016-09-12 15:45:40 UTC
Permalink
Post by Zhihong Wang
This patch enables batch update of the used ring for better efficiency.
---
1. Free shadow used ring in the right place.
2. Add failure check for shadow used ring malloc.
lib/librte_vhost/vhost.c | 20 ++++++++++++--
lib/librte_vhost/vhost.h | 4 +++
lib/librte_vhost/vhost_user.c | 31 +++++++++++++++++----
lib/librte_vhost/virtio_net.c | 64 +++++++++++++++++++++++++++++++++++--------
4 files changed, 101 insertions(+), 18 deletions(-)
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 46095c3..cb31cdd 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -119,10 +119,26 @@ cleanup_device(struct virtio_net *dev, int destroy)
static void
free_device(struct virtio_net *dev)
{
+ struct vhost_virtqueue *vq_0;
+ struct vhost_virtqueue *vq_1;
uint32_t i;
- for (i = 0; i < dev->virt_qp_nb; i++)
- rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+ for (i = 0; i < dev->virt_qp_nb; i++) {
+ vq_0 = dev->virtqueue[i * VIRTIO_QNUM];
+ if (vq_0->shadow_used_ring) {
+ rte_free(vq_0->shadow_used_ring);
+ vq_0->shadow_used_ring = NULL;
+ }
+
+ vq_1 = dev->virtqueue[i * VIRTIO_QNUM + 1];
+ if (vq_1->shadow_used_ring) {
+ rte_free(vq_1->shadow_used_ring);
+ vq_1->shadow_used_ring = NULL;
+ }
+
+ /* malloc together, free together */
+ rte_free(vq_0);
+ }
rte_free(dev);
}
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 9707dfc..381dc27 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -85,6 +85,10 @@ struct vhost_virtqueue {
/* Physical address of used ring, for logging */
uint64_t log_guest_addr;
+
+ /* Shadow used ring for performance */
+ struct vring_used_elem *shadow_used_ring;
+ uint32_t shadow_used_idx;
} __rte_cache_aligned;
/* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index eee99e9..d7cf1ed 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -193,7 +193,21 @@ static int
vhost_user_set_vring_num(struct virtio_net *dev,
struct vhost_vring_state *state)
{
- dev->virtqueue[state->index]->size = state->num;
+ struct vhost_virtqueue *vq;
+
+ vq = dev->virtqueue[state->index];
+ vq->size = state->num;
+ if (!vq->shadow_used_ring) {
+ vq->shadow_used_ring = rte_malloc(NULL,
+ vq->size * sizeof(struct vring_used_elem),
+ RTE_CACHE_LINE_SIZE);
+ if (!vq->shadow_used_ring) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to allocate memory"
+ " for shadow used ring.\n");
+ return -1;
+ }
+ }
return 0;
}
@@ -611,14 +625,21 @@ static int
vhost_user_get_vring_base(struct virtio_net *dev,
struct vhost_vring_state *state)
{
+ struct vhost_virtqueue *vq;
+
/* We have to stop the queue (virtio) if it is running. */
if (dev->flags & VIRTIO_DEV_RUNNING) {
dev->flags &= ~VIRTIO_DEV_RUNNING;
notify_ops->destroy_device(dev->vid);
}
+ vq = dev->virtqueue[state->index];
/* Here we are safe to get the last used index */
- state->num = dev->virtqueue[state->index]->last_used_idx;
+ state->num = vq->last_used_idx;
+ if (vq->shadow_used_ring) {
+ rte_free(vq->shadow_used_ring);
+ vq->shadow_used_ring = NULL;
+ }
RTE_LOG(INFO, VHOST_CONFIG,
"vring base idx:%d file:%d\n", state->index, state->num);
@@ -627,10 +648,10 @@ vhost_user_get_vring_base(struct virtio_net *dev,
* sent and only sent in vhost_vring_stop.
* TODO: cleanup the vring, it isn't usable since here.
*/
- if (dev->virtqueue[state->index]->kickfd >= 0)
- close(dev->virtqueue[state->index]->kickfd);
+ if (vq->kickfd >= 0)
+ close(vq->kickfd);
- dev->virtqueue[state->index]->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+ vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
return 0;
}
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index b38f18f..e9f6353 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -134,17 +134,52 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
}
static inline void __attribute__((always_inline))
-update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
- uint32_t desc_chain_head, uint32_t desc_chain_len)
+update_used_ring(struct vhost_virtqueue *vq, uint32_t desc_chain_head,
+ uint32_t desc_chain_len)
{
- uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
-
- vq->used->ring[used_idx].id = desc_chain_head;
- vq->used->ring[used_idx].len = desc_chain_len;
+ vq->shadow_used_ring[vq->shadow_used_idx].id = desc_chain_head;
+ vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
+ vq->shadow_used_idx++;
vq->last_used_idx++;
- vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
- ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
+}
+
+static inline void __attribute__((always_inline))
+flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t used_idx_start)
+{
+ if (used_idx_start + vq->shadow_used_idx < vq->size) {
+ rte_memcpy(&vq->used->ring[used_idx_start],
+ &vq->shadow_used_ring[0],
+ vq->shadow_used_idx *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_start]),
+ vq->shadow_used_idx *
+ sizeof(struct vring_used_elem));
+ } else {
+ uint32_t part_1 = vq->size - used_idx_start;
+ uint32_t part_2 = vq->shadow_used_idx - part_1;
+
+ rte_memcpy(&vq->used->ring[used_idx_start],
+ &vq->shadow_used_ring[0],
+ part_1 *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_start]),
+ part_1 *
+ sizeof(struct vring_used_elem));
+ rte_memcpy(&vq->used->ring[0],
+ &vq->shadow_used_ring[part_1],
+ part_2 *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[0]),
+ part_2 *
+ sizeof(struct vring_used_elem));
+ }
}
Is expanding the code done for performance purposes?
Or maybe we could have a loop to do that?
Something like this (not compiled, not tested):

static inline void __attribute__((always_inline))
flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
{
uint32_t to = used_idx_start;
uint32_t from = 0;
uint32_t count;

if (used_idx_start + vq->shadow_used_idx < vq->size)
count = vq->shadow_used_idx;
else
count = vq->size - used_idx_start;

do {
rte_memcpy(&vq->used->ring[to],
&vq->shadow_used_ring[from],
count * sizeof(struct vring_used_elem));
vhost_log_used_vring(dev, vq,
offsetof(struct vring_used, ring[to]),
count * sizeof(struct vring_used_elem));

to = (to + count) & (vq->size - 1);
from += count;
count = vq->shadow_used_idx - count;
} while (count);
}

Regards,
Maxime
Wang, Zhihong
2016-09-14 08:43:30 UTC
Permalink
-----Original Message-----
Sent: Monday, September 12, 2016 11:46 PM
Subject: Re: [PATCH v5 5/6] vhost: batch update used ring
Post by Zhihong Wang
This patch enables batch update of the used ring for better efficiency.
---
1. Free shadow used ring in the right place.
2. Add failure check for shadow used ring malloc.
lib/librte_vhost/vhost.c | 20 ++++++++++++--
lib/librte_vhost/vhost.h | 4 +++
lib/librte_vhost/vhost_user.c | 31 +++++++++++++++++----
lib/librte_vhost/virtio_net.c | 64
+++++++++++++++++++++++++++++++++++--------
Post by Zhihong Wang
4 files changed, 101 insertions(+), 18 deletions(-)
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 46095c3..cb31cdd 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -119,10 +119,26 @@ cleanup_device(struct virtio_net *dev, int
destroy)
Post by Zhihong Wang
static void
free_device(struct virtio_net *dev)
{
+ struct vhost_virtqueue *vq_0;
+ struct vhost_virtqueue *vq_1;
uint32_t i;
- for (i = 0; i < dev->virt_qp_nb; i++)
- rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+ for (i = 0; i < dev->virt_qp_nb; i++) {
+ vq_0 = dev->virtqueue[i * VIRTIO_QNUM];
+ if (vq_0->shadow_used_ring) {
+ rte_free(vq_0->shadow_used_ring);
+ vq_0->shadow_used_ring = NULL;
+ }
+
+ vq_1 = dev->virtqueue[i * VIRTIO_QNUM + 1];
+ if (vq_1->shadow_used_ring) {
+ rte_free(vq_1->shadow_used_ring);
+ vq_1->shadow_used_ring = NULL;
+ }
+
+ /* malloc together, free together */
+ rte_free(vq_0);
+ }
rte_free(dev);
}
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 9707dfc..381dc27 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -85,6 +85,10 @@ struct vhost_virtqueue {
/* Physical address of used ring, for logging */
uint64_t log_guest_addr;
+
+ /* Shadow used ring for performance */
+ struct vring_used_elem *shadow_used_ring;
+ uint32_t shadow_used_idx;
} __rte_cache_aligned;
/* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index eee99e9..d7cf1ed 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -193,7 +193,21 @@ static int
vhost_user_set_vring_num(struct virtio_net *dev,
struct vhost_vring_state *state)
{
- dev->virtqueue[state->index]->size = state->num;
+ struct vhost_virtqueue *vq;
+
+ vq = dev->virtqueue[state->index];
+ vq->size = state->num;
+ if (!vq->shadow_used_ring) {
+ vq->shadow_used_ring = rte_malloc(NULL,
+ vq->size * sizeof(struct vring_used_elem),
+ RTE_CACHE_LINE_SIZE);
+ if (!vq->shadow_used_ring) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to allocate memory"
+ " for shadow used ring.\n");
+ return -1;
+ }
+ }
return 0;
}
@@ -611,14 +625,21 @@ static int
vhost_user_get_vring_base(struct virtio_net *dev,
struct vhost_vring_state *state)
{
+ struct vhost_virtqueue *vq;
+
/* We have to stop the queue (virtio) if it is running. */
if (dev->flags & VIRTIO_DEV_RUNNING) {
dev->flags &= ~VIRTIO_DEV_RUNNING;
notify_ops->destroy_device(dev->vid);
}
+ vq = dev->virtqueue[state->index];
/* Here we are safe to get the last used index */
- state->num = dev->virtqueue[state->index]->last_used_idx;
+ state->num = vq->last_used_idx;
+ if (vq->shadow_used_ring) {
+ rte_free(vq->shadow_used_ring);
+ vq->shadow_used_ring = NULL;
+ }
RTE_LOG(INFO, VHOST_CONFIG,
"vring base idx:%d file:%d\n", state->index, state->num);
@@ -627,10 +648,10 @@ vhost_user_get_vring_base(struct virtio_net
*dev,
Post by Zhihong Wang
* sent and only sent in vhost_vring_stop.
* TODO: cleanup the vring, it isn't usable since here.
*/
- if (dev->virtqueue[state->index]->kickfd >= 0)
- close(dev->virtqueue[state->index]->kickfd);
+ if (vq->kickfd >= 0)
+ close(vq->kickfd);
- dev->virtqueue[state->index]->kickfd =
VIRTIO_UNINITIALIZED_EVENTFD;
Post by Zhihong Wang
+ vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
return 0;
}
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index b38f18f..e9f6353 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -134,17 +134,52 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
}
static inline void __attribute__((always_inline))
-update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
- uint32_t desc_chain_head, uint32_t desc_chain_len)
+update_used_ring(struct vhost_virtqueue *vq, uint32_t desc_chain_head,
+ uint32_t desc_chain_len)
{
- uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
-
- vq->used->ring[used_idx].id = desc_chain_head;
- vq->used->ring[used_idx].len = desc_chain_len;
+ vq->shadow_used_ring[vq->shadow_used_idx].id = desc_chain_head;
+ vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
+ vq->shadow_used_idx++;
vq->last_used_idx++;
- vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
- ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
+}
+
+static inline void __attribute__((always_inline))
+flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t used_idx_start)
+{
+ if (used_idx_start + vq->shadow_used_idx < vq->size) {
+ rte_memcpy(&vq->used->ring[used_idx_start],
+ &vq->shadow_used_ring[0],
+ vq->shadow_used_idx *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_start]),
+ vq->shadow_used_idx *
+ sizeof(struct vring_used_elem));
+ } else {
+ uint32_t part_1 = vq->size - used_idx_start;
+ uint32_t part_2 = vq->shadow_used_idx - part_1;
+
+ rte_memcpy(&vq->used->ring[used_idx_start],
+ &vq->shadow_used_ring[0],
+ part_1 *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_start]),
+ part_1 *
+ sizeof(struct vring_used_elem));
+ rte_memcpy(&vq->used->ring[0],
+ &vq->shadow_used_ring[part_1],
+ part_2 *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[0]),
+ part_2 *
+ sizeof(struct vring_used_elem));
+ }
}
Is expanding the code done for performance purpose?
Hi Maxime,

Yes theoretically this has the least branch number.
And I think the logic is simpler this way.

Thanks
Zhihong
Or maybe we could have a loop to do that?
static inline void __attribute__((always_inline))
flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
		uint32_t used_idx_start)
{
uint32_t to = used_idx_start;
uint32_t from = 0;
uint32_t count;
if (used_idx_start + vq->shadow_used_idx < vq->size)
count = vq->shadow_used_idx;
else
count = vq->size - used_idx_start;
do {
rte_memcpy(&vq->used->ring[to],
&vq->shadow_used_ring[from],
count * sizeof(struct vring_used_elem));
vhost_log_used_vring(dev, vq,
offsetof(struct vring_used, ring[to]),
count * sizeof(struct vring_used_elem));
to = (to + count) & (vq->size - 1);
from += count;
count = vq->shadow_used_idx - count;
} while (count);
}
Regards,
Maxime
Maxime Coquelin
2016-09-15 16:38:06 UTC
Permalink
Post by Wang, Zhihong
-----Original Message-----
Sent: Monday, September 12, 2016 11:46 PM
Subject: Re: [PATCH v5 5/6] vhost: batch update used ring
Post by Zhihong Wang
This patch enables batch update of the used ring for better efficiency.
---
1. Free shadow used ring in the right place.
2. Add failure check for shadow used ring malloc.
lib/librte_vhost/vhost.c | 20 ++++++++++++--
lib/librte_vhost/vhost.h | 4 +++
lib/librte_vhost/vhost_user.c | 31 +++++++++++++++++----
lib/librte_vhost/virtio_net.c | 64 +++++++++++++++++++++++++++++++++++--------
4 files changed, 101 insertions(+), 18 deletions(-)
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 46095c3..cb31cdd 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -119,10 +119,26 @@ cleanup_device(struct virtio_net *dev, int destroy)
static void
free_device(struct virtio_net *dev)
{
+ struct vhost_virtqueue *vq_0;
+ struct vhost_virtqueue *vq_1;
uint32_t i;
- for (i = 0; i < dev->virt_qp_nb; i++)
- rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+ for (i = 0; i < dev->virt_qp_nb; i++) {
+ vq_0 = dev->virtqueue[i * VIRTIO_QNUM];
+ if (vq_0->shadow_used_ring) {
+ rte_free(vq_0->shadow_used_ring);
+ vq_0->shadow_used_ring = NULL;
+ }
+
+ vq_1 = dev->virtqueue[i * VIRTIO_QNUM + 1];
+ if (vq_1->shadow_used_ring) {
+ rte_free(vq_1->shadow_used_ring);
+ vq_1->shadow_used_ring = NULL;
+ }
+
+ /* malloc together, free together */
+ rte_free(vq_0);
+ }
rte_free(dev);
}
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 9707dfc..381dc27 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -85,6 +85,10 @@ struct vhost_virtqueue {
/* Physical address of used ring, for logging */
uint64_t log_guest_addr;
+
+ /* Shadow used ring for performance */
+ struct vring_used_elem *shadow_used_ring;
+ uint32_t shadow_used_idx;
} __rte_cache_aligned;
/* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index eee99e9..d7cf1ed 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -193,7 +193,21 @@ static int
vhost_user_set_vring_num(struct virtio_net *dev,
struct vhost_vring_state *state)
{
- dev->virtqueue[state->index]->size = state->num;
+ struct vhost_virtqueue *vq;
+
+ vq = dev->virtqueue[state->index];
+ vq->size = state->num;
+ if (!vq->shadow_used_ring) {
+ vq->shadow_used_ring = rte_malloc(NULL,
+ vq->size * sizeof(struct vring_used_elem),
+ RTE_CACHE_LINE_SIZE);
+ if (!vq->shadow_used_ring) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to allocate memory"
+ " for shadow used ring.\n");
+ return -1;
+ }
+ }
return 0;
}
@@ -611,14 +625,21 @@ static int
vhost_user_get_vring_base(struct virtio_net *dev,
struct vhost_vring_state *state)
{
+ struct vhost_virtqueue *vq;
+
/* We have to stop the queue (virtio) if it is running. */
if (dev->flags & VIRTIO_DEV_RUNNING) {
dev->flags &= ~VIRTIO_DEV_RUNNING;
notify_ops->destroy_device(dev->vid);
}
+ vq = dev->virtqueue[state->index];
/* Here we are safe to get the last used index */
- state->num = dev->virtqueue[state->index]->last_used_idx;
+ state->num = vq->last_used_idx;
+ if (vq->shadow_used_ring) {
+ rte_free(vq->shadow_used_ring);
+ vq->shadow_used_ring = NULL;
+ }
RTE_LOG(INFO, VHOST_CONFIG,
"vring base idx:%d file:%d\n", state->index, state->num);
@@ -627,10 +648,10 @@ vhost_user_get_vring_base(struct virtio_net *dev,
* sent and only sent in vhost_vring_stop.
* TODO: cleanup the vring, it isn't usable since here.
*/
- if (dev->virtqueue[state->index]->kickfd >= 0)
- close(dev->virtqueue[state->index]->kickfd);
+ if (vq->kickfd >= 0)
+ close(vq->kickfd);
- dev->virtqueue[state->index]->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+ vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
return 0;
}
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index b38f18f..e9f6353 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -134,17 +134,52 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
}
static inline void __attribute__((always_inline))
-update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
- uint32_t desc_chain_head, uint32_t desc_chain_len)
+update_used_ring(struct vhost_virtqueue *vq, uint32_t desc_chain_head,
+ uint32_t desc_chain_len)
{
- uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
-
- vq->used->ring[used_idx].id = desc_chain_head;
- vq->used->ring[used_idx].len = desc_chain_len;
+ vq->shadow_used_ring[vq->shadow_used_idx].id = desc_chain_head;
+ vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
+ vq->shadow_used_idx++;
vq->last_used_idx++;
- vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
- ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
+}
+
+static inline void __attribute__((always_inline))
+flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t used_idx_start)
+{
+ if (used_idx_start + vq->shadow_used_idx < vq->size) {
+ rte_memcpy(&vq->used->ring[used_idx_start],
+ &vq->shadow_used_ring[0],
+ vq->shadow_used_idx *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_start]),
+ vq->shadow_used_idx *
+ sizeof(struct vring_used_elem));
+ } else {
+ uint32_t part_1 = vq->size - used_idx_start;
+ uint32_t part_2 = vq->shadow_used_idx - part_1;
+
+ rte_memcpy(&vq->used->ring[used_idx_start],
+ &vq->shadow_used_ring[0],
+ part_1 *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_start]),
+ part_1 *
+ sizeof(struct vring_used_elem));
+ rte_memcpy(&vq->used->ring[0],
+ &vq->shadow_used_ring[part_1],
+ part_2 *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[0]),
+ part_2 *
+ sizeof(struct vring_used_elem));
+ }
}
Is expanding the code done for performance purpose?
Hi Maxime,
Yes theoretically this has the least branch number.
And I think the logic is simpler this way.
Ok, in that case, maybe you could create a function to
do the rte_memcpy and the vhost_log_used on a given range.

I don't have a strong opinion on this, if Yuanhan is fine
with current code, that's ok for me.
Thanks,
Maxime
Post by Wang, Zhihong
Thanks
Zhihong
Or maybe we could have a loop to do that?
static inline void __attribute__((always_inline))
flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
		uint32_t used_idx_start)
{
uint32_t to = used_idx_start;
uint32_t from = 0;
uint32_t count;
if (used_idx_start + vq->shadow_used_idx < vq->size)
count = vq->shadow_used_idx;
else
count = vq->size - used_idx_start;
do {
rte_memcpy(&vq->used->ring[to],
&vq->shadow_used_ring[from],
count * sizeof(struct vring_used_elem));
vhost_log_used_vring(dev, vq,
offsetof(struct vring_used, ring[to]),
count * sizeof(struct vring_used_elem));
to = (to + count) & (vq->size - 1);
from += count;
count = vq->shadow_used_idx - count;
} while (count);
}
Regards,
Maxime
Yuanhan Liu
2016-09-18 02:55:42 UTC
Permalink
Post by Maxime Coquelin
Post by Wang, Zhihong
Post by Maxime Coquelin
Post by Zhihong Wang
+static inline void __attribute__((always_inline))
+flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t used_idx_start)
+{
+ if (used_idx_start + vq->shadow_used_idx < vq->size) {
+ rte_memcpy(&vq->used->ring[used_idx_start],
+ &vq->shadow_used_ring[0],
+ vq->shadow_used_idx *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_start]),
+ vq->shadow_used_idx *
+ sizeof(struct vring_used_elem));
+ } else {
+ uint32_t part_1 = vq->size - used_idx_start;
+ uint32_t part_2 = vq->shadow_used_idx - part_1;
+
+ rte_memcpy(&vq->used->ring[used_idx_start],
+ &vq->shadow_used_ring[0],
+ part_1 *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_start]),
+ part_1 *
+ sizeof(struct vring_used_elem));
+ rte_memcpy(&vq->used->ring[0],
+ &vq->shadow_used_ring[part_1],
+ part_2 *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[0]),
+ part_2 *
+ sizeof(struct vring_used_elem));
+ }
}
Is expanding the code done for performance purpose?
Hi Maxime,
Yes theoretically this has the least branch number.
And I think the logic is simpler this way.
Ok, in that case, maybe you could create a function to
do the rte_memcpy and the vhost_log_used on a given range.
Agreed, that will be better; it could avoid repeating similar code
block 3 times.
Post by Maxime Coquelin
I don't have a strong opinion on this, if Yuanhan is fine
with current code, that's ok for me.
From what I know, that's kind of DPDK prefered way, to expand code
when necessary. For example, 9ec201f5d6e7 ("mbuf: provide bulk
allocation").

So I'm fine with it.

--yliu
Wang, Zhihong
2016-09-18 02:57:40 UTC
Permalink
-----Original Message-----
Sent: Sunday, September 18, 2016 10:56 AM
Subject: Re: [PATCH v5 5/6] vhost: batch update used ring
Post by Maxime Coquelin
Post by Wang, Zhihong
Post by Maxime Coquelin
Post by Zhihong Wang
+static inline void __attribute__((always_inline))
+flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t used_idx_start)
+{
+ if (used_idx_start + vq->shadow_used_idx < vq->size) {
+ rte_memcpy(&vq->used->ring[used_idx_start],
+ &vq->shadow_used_ring[0],
+ vq->shadow_used_idx *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_start]),
+ vq->shadow_used_idx *
+ sizeof(struct vring_used_elem));
+ } else {
+ uint32_t part_1 = vq->size - used_idx_start;
+ uint32_t part_2 = vq->shadow_used_idx - part_1;
+
+ rte_memcpy(&vq->used->ring[used_idx_start],
+ &vq->shadow_used_ring[0],
+ part_1 *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx_start]),
+ part_1 *
+ sizeof(struct vring_used_elem));
+ rte_memcpy(&vq->used->ring[0],
+ &vq->shadow_used_ring[part_1],
+ part_2 *
+ sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[0]),
+ part_2 *
+ sizeof(struct vring_used_elem));
+ }
}
Is expanding the code done for performance purpose?
Hi Maxime,
Yes theoretically this has the least branch number.
And I think the logic is simpler this way.
Ok, in that case, maybe you could create a function to
do the rte_memcpy and the vhost_log_used on a given range.
Agreed, that will be better; it could avoid repeating similar code
block 3 times.
Okay. Thanks for the suggestion, Maxime and Yuanhan.
Post by Maxime Coquelin
I don't have a strong opinion on this, if Yuanhan is fine
with current code, that's ok for me.
From what I know, that's kind of DPDK prefered way, to expand code
when necessary. For example, 9ec201f5d6e7 ("mbuf: provide bulk
allocation").
So I'm fine with it.
--yliu
Zhihong Wang
2016-09-09 03:39:28 UTC
Permalink
This patch reorders the code to delay virtio header write to optimize cache
access efficiency for cases where the mrg_rxbuf feature is turned on. It
reduces CPU pipeline stall cycles significantly.

Signed-off-by: Zhihong Wang <***@intel.com>
---
Changes in v3:

1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

2. Rename variables to follow naming convention.

lib/librte_vhost/virtio_net.c | 20 ++++++++++++++------
1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index e9f6353..0086bcb 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -197,6 +197,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint32_t mbuf_len;
uint32_t mbuf_avail;
uint32_t cpy_len;
+ uint32_t copy_virtio_hdr;
uint32_t num_buffers = 0;

/* start with the first mbuf of the packet */
@@ -211,12 +212,12 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
if (unlikely(!desc_addr))
goto error;

- /* handle virtio header */
+ /*
+ * handle virtio header, the actual write operation is delayed
+ * for cache optimization, to reduce CPU pipeline stall cycles.
+ */
virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
- virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
- if (is_mrg_rxbuf)
- virtio_hdr->num_buffers = 1;
-
+ copy_virtio_hdr = 1;
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
desc_offset = dev->vhost_hlen;
@@ -266,8 +267,15 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
goto error;
}

- /* copy mbuf data */
+ /* copy virtio header and mbuf data */
cpy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+ if (copy_virtio_hdr) {
+ copy_virtio_hdr = 0;
+ virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+ if (is_mrg_rxbuf)
+ virtio_hdr->num_buffers = num_buffers + 1;
+ }
+
rte_memcpy((void *)(uintptr_t)desc_addr,
rte_pktmbuf_mtod_offset(mbuf, void *,
mbuf_len - mbuf_avail),
--
2.7.4
Maxime Coquelin
2016-09-12 13:52:12 UTC
Permalink
Hi,
Post by Zhihong Wang
This patch set optimizes the vhost enqueue function.
It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
* Higher maximum throughput can be achieved for fast frontends like DPDK
virtio pmd.
* Better scalability can be achieved that each vhost core can support
more connections because it takes less cycles to handle each single
frontend.
1. A Windows VM compatibility fix for vhost enqueue in 16.07 release.
2. A baseline patch to rewrite the vhost logic.
3. A series of optimization patches added upon the baseline.
1. Reorder code to reduce CPU pipeline stall cycles.
2. Batch update the used ring for better efficiency.
3. Prefetch descriptor to hide cache latency.
4. Remove useless volatile attribute to allow compiler optimization.
Code reordering and batch used ring update bring most of the performance
improvements.
* virtio_dev_merge_rx for mrg_rxbuf turned on cases.
* virtio_dev_rx for mrg_rxbuf turned off cases.
The performance of the existing code is not optimal, especially when the
mrg_rxbuf feature turned on. Besides, having 2 callback paths increases
maintenance efforts.
Also, there's a compatibility issue in the existing code which causes
Windows VM to hang when the mrg_rxbuf feature turned on.
---
1. Rebase to the latest branch.
2. Rename variables to keep consistent in naming style.
3. Small changes like return value adjustment and vertical alignment.
4. Add details in commit log.
Just tried to apply your series without success.
Apparently, it is not based directly on master branch,
as it lacks some SHA-1 information.

Could you rebase it against master please?

Thanks,
Maxime
Maxime Coquelin
2016-09-12 13:56:43 UTC
Permalink
Post by Maxime Coquelin
Hi,
Post by Zhihong Wang
This patch set optimizes the vhost enqueue function.
It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
* Higher maximum throughput can be achieved for fast frontends like DPDK
virtio pmd.
* Better scalability can be achieved that each vhost core can support
more connections because it takes less cycles to handle each single
frontend.
1. A Windows VM compatibility fix for vhost enqueue in 16.07 release.
2. A baseline patch to rewrite the vhost logic.
3. A series of optimization patches added upon the baseline.
1. Reorder code to reduce CPU pipeline stall cycles.
2. Batch update the used ring for better efficiency.
3. Prefetch descriptor to hide cache latency.
4. Remove useless volatile attribute to allow compiler optimization.
Code reordering and batch used ring update bring most of the performance
improvements.
* virtio_dev_merge_rx for mrg_rxbuf turned on cases.
* virtio_dev_rx for mrg_rxbuf turned off cases.
The performance of the existing code is not optimal, especially when the
mrg_rxbuf feature turned on. Besides, having 2 callback paths increases
maintenance efforts.
Also, there's a compatibility issue in the existing code which causes
Windows VM to hang when the mrg_rxbuf feature turned on.
---
1. Rebase to the latest branch.
2. Rename variables to keep consistent in naming style.
3. Small changes like return value adjustment and vertical alignment.
4. Add details in commit log.
Just tried to apply your series without success.
Apparently, it is not based directly on master branch,
as it lacks some SHA-1 information.
Could you rebase it against master please?
Ok, it is in fact based on top of:
git://dpdk.org/next/dpdk-next-virtio master

For v6, if any, could you add this info to the cover letter please?

Thanks,
Maxime
Yuanhan Liu
2016-09-12 14:01:52 UTC
Permalink
Post by Maxime Coquelin
Just tried to apply your series without success.
Apparently, it is not based directly on master branch,
as it lacks some SHA-1 information.
Could you rebase it against master please?
It's rebased against the dpdk-next-virtio tree [0], where all the
virtio/vhost patches are applied first.

[0]: http://dpdk.org/browse/next/dpdk-next-virtio/

--yliu
Zhihong Wang
2016-09-20 02:00:11 UTC
Permalink
This patch set optimizes the vhost enqueue function.

It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
significantly by optimizing cache access, which means:

* Higher maximum throughput can be achieved for fast frontends like DPDK
virtio pmd.

* Better scalability can be achieved that each vhost core can support
more connections because it takes less cycles to handle each single
frontend.

This patch set contains:

1. A Windows VM compatibility fix for vhost enqueue in 16.07 release.

2. A baseline patch to rewrite the vhost logic.

3. A series of optimization patches added upon the baseline.

The main optimization techniques are:

1. Reorder code to reduce CPU pipeline stall cycles.

2. Batch update the used ring for better efficiency.

3. Prefetch descriptor to hide cache latency.

4. Remove useless volatile attribute to allow compiler optimization.

Code reordering and batch used ring update bring most of the performance
improvements.

In the existing code there're 2 callbacks for vhost enqueue:

* virtio_dev_merge_rx for mrg_rxbuf turned on cases.

* virtio_dev_rx for mrg_rxbuf turned off cases.

The performance of the existing code is not optimal, especially when the
mrg_rxbuf feature turned on. Besides, having 2 callback paths increases
maintenance efforts.

Also, there's a compatibility issue in the existing code which causes
Windows VM to hang when the mrg_rxbuf feature turned on.

---
Changes in v6:

1. Merge duplicated code.

2. Introduce a function for used ring write.

3. Add necessary comments.

---
Changes in v5:

1. Rebase to dpdk-next-virtio master.

2. Rename variables to keep consistent in naming style.

3. Small changes like return value adjustment and vertical alignment.

4. Add details in commit log.

---
Changes in v4:

1. Fix a Windows VM compatibility issue.

2. Free shadow used ring in the right place.

3. Add failure check for shadow used ring malloc.

4. Refactor the code for clearer logic.

5. Add PRINT_PACKET for debugging.

---
Changes in v3:

1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

2. Rename variables to follow naming convention.

3. Rewrite enqueue and delete the obsolete in the same patch.

---
Changes in v2:

1. Split the big function into several small ones.

2. Use multiple patches to explain each optimization.

3. Add comments.

Zhihong Wang (6):
vhost: fix windows vm hang
vhost: rewrite enqueue
vhost: remove useless volatile
vhost: add desc prefetch
vhost: batch update used ring
vhost: optimize cache access

lib/librte_vhost/vhost.c | 20 +-
lib/librte_vhost/vhost.h | 6 +-
lib/librte_vhost/vhost_user.c | 31 ++-
lib/librte_vhost/virtio_net.c | 541 ++++++++++++++----------------------------
4 files changed, 225 insertions(+), 373 deletions(-)
--
2.7.4
Zhihong Wang
2016-09-20 02:00:12 UTC
Permalink
This patch fixes a Windows VM compatibility issue in DPDK 16.07 vhost code
which causes the guest to hang once any packets are enqueued when mrg_rxbuf
is turned on by setting the right id and len in the used ring.

As defined in virtio spec 0.95 and 1.0, in each used ring element, id means
index of start of used descriptor chain, and len means total length of the
descriptor chain which was written to. While in 16.07 code, index of the
last descriptor is assigned to id, and the length of the last descriptor is
assigned to len.

How to test?

1. Start testpmd in the host with a vhost port.

2. Start a Windows VM image with qemu and connect to the vhost port.

3. Start io forwarding with tx_first in host testpmd.

For 16.07 code, the Windows VM will hang once any packets are enqueued.

Cc: <***@dpdk.org>
Signed-off-by: Zhihong Wang <***@intel.com>
---
Changes in v5:

1. Add details in commit log.

lib/librte_vhost/virtio_net.c | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 8a151af..0d6e7d9 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -384,6 +384,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint16_t start_idx = vq->last_used_idx;
uint16_t cur_idx = start_idx;
uint64_t desc_addr;
+ uint32_t desc_chain_head;
+ uint32_t desc_chain_len;
uint32_t mbuf_offset, mbuf_avail;
uint32_t desc_offset, desc_avail;
uint32_t cpy_len;
@@ -412,6 +414,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,

desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
desc_offset = dev->vhost_hlen;
+ desc_chain_head = buf_vec[vec_idx].desc_idx;
+ desc_chain_len = desc_offset;

mbuf_avail = rte_pktmbuf_data_len(m);
mbuf_offset = 0;
@@ -419,19 +423,21 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
/* done with current desc buf, get the next one */
if (desc_avail == 0) {
desc_idx = buf_vec[vec_idx].desc_idx;
+ vec_idx++;

if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
/* Update used ring with desc information */
used_idx = cur_idx++ & (vq->size - 1);
- vq->used->ring[used_idx].id = desc_idx;
- vq->used->ring[used_idx].len = desc_offset;
+ vq->used->ring[used_idx].id = desc_chain_head;
+ vq->used->ring[used_idx].len = desc_chain_len;
vhost_log_used_vring(dev, vq,
offsetof(struct vring_used,
ring[used_idx]),
sizeof(vq->used->ring[used_idx]));
+ desc_chain_head = buf_vec[vec_idx].desc_idx;
+ desc_chain_len = 0;
}

- vec_idx++;
desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
if (unlikely(!desc_addr))
return 0;
@@ -463,11 +469,12 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
mbuf_offset += cpy_len;
desc_avail -= cpy_len;
desc_offset += cpy_len;
+ desc_chain_len += cpy_len;
}

used_idx = cur_idx & (vq->size - 1);
- vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
- vq->used->ring[used_idx].len = desc_offset;
+ vq->used->ring[used_idx].id = desc_chain_head;
+ vq->used->ring[used_idx].len = desc_chain_len;
vhost_log_used_vring(dev, vq,
offsetof(struct vring_used, ring[used_idx]),
sizeof(vq->used->ring[used_idx]));
--
2.7.4
Zhihong Wang
2016-09-20 02:00:13 UTC
Permalink
This patch implements the vhost logic from scratch into a single function
to improve maintainability. This is the baseline version of the new code,
more optimization will be added in the following patches in this patch set.

In the existing code there're 2 callbacks for vhost enqueue:

* virtio_dev_merge_rx for mrg_rxbuf turned on cases.

* virtio_dev_rx for mrg_rxbuf turned off cases.

Having 2 callback paths increases maintenance effort. Also, the performance
of the existing code is not optimal, especially when the mrg_rxbuf feature
turned on.

Signed-off-by: Zhihong Wang <***@intel.com>
---
Changes in v6:

1. Merge duplicated code.

2. Add necessary comments.

---
Changes in v5:

1. Rebase to dpdk-next-virtio master.

2. Rename variables to keep consistent in naming style.

3. Small changes like return value adjustment and vertical alignment.

---
Changes in v4:

1. Refactor the code for clearer logic.

2. Add PRINT_PACKET for debugging.

---
Changes in v3:

1. Rewrite enqueue and delete the obsolete in the same patch.

lib/librte_vhost/virtio_net.c | 508 +++++++++++-------------------------------
1 file changed, 134 insertions(+), 374 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 0d6e7d9..0ada32b 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
}

-static void
+static inline void __attribute__((always_inline))
virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
{
if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -112,6 +112,10 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
cksum));
break;
}
+ } else {
+ net_hdr->flags = 0;
+ net_hdr->csum_start = 0;
+ net_hdr->csum_offset = 0;
}

if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
@@ -122,439 +126,195 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
net_hdr->gso_size = m_buf->tso_segsz;
net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
+ m_buf->l4_len;
+ } else {
+ net_hdr->gso_type = 0;
+ net_hdr->hdr_len = 0;
+ net_hdr->gso_size = 0;
}
}

-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
- struct virtio_net_hdr_mrg_rxbuf hdr)
+static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t desc_chain_head, uint32_t desc_chain_len)
{
- if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
- *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
- else
- *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+ uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
+
+ vq->used->ring[used_idx].id = desc_chain_head;
+ vq->used->ring[used_idx].len = desc_chain_len;
+ vq->last_used_idx++;
+ vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
+ ring[used_idx]),
+ sizeof(vq->used->ring[used_idx]));
}

static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
- struct rte_mbuf *m, uint16_t desc_idx)
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint16_t avail_idx, struct rte_mbuf *mbuf,
+ uint32_t is_mrg_rxbuf)
{
- uint32_t desc_avail, desc_offset;
- uint32_t mbuf_avail, mbuf_offset;
- uint32_t cpy_len;
+ struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
struct vring_desc *desc;
uint64_t desc_addr;
- struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
+ uint32_t desc_chain_head;
+ uint32_t desc_chain_len;
+ uint32_t desc_current;
+ uint32_t desc_offset;
+ uint32_t mbuf_len;
+ uint32_t mbuf_avail;
+ uint32_t cpy_len;
+ uint32_t num_buffers = 0;

- desc = &vq->desc[desc_idx];
+ /* start with the first mbuf of the packet */
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_avail = mbuf_len;
+
+ /* get the current desc */
+ desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc = &vq->desc[desc_current];
desc_addr = gpa_to_vva(dev, desc->addr);
- /*
- * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
- * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
- * otherwise stores offset on the stack instead of in a register.
- */
- if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
- return -1;
+ if (unlikely(!desc_addr))
+ goto error;

- rte_prefetch0((void *)(uintptr_t)desc_addr);
+ /* handle virtio header */
+ virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
+ virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+ if (is_mrg_rxbuf)
+ virtio_hdr->num_buffers = 1;

- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
desc_offset = dev->vhost_hlen;
- desc_avail = desc->len - dev->vhost_hlen;
-
- mbuf_avail = rte_pktmbuf_data_len(m);
- mbuf_offset = 0;
- while (mbuf_avail != 0 || m->next != NULL) {
- /* done with current mbuf, fetch next */
- if (mbuf_avail == 0) {
- m = m->next;
-
- mbuf_offset = 0;
- mbuf_avail = rte_pktmbuf_data_len(m);
+ desc_chain_len = desc_offset;
+ desc_addr += desc_offset;
+
+ /* start copy from mbuf to desc */
+ while (mbuf_avail || mbuf->next) {
+ /* get the next mbuf if the current done */
+ if (!mbuf_avail) {
+ mbuf = mbuf->next;
+ mbuf_len = rte_pktmbuf_data_len(mbuf);
+ mbuf_avail = mbuf_len;
}

- /* done with current desc buf, fetch next */
- if (desc_avail == 0) {
- if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
- /* Room in vring buffer is not enough */
- return -1;
- }
- if (unlikely(desc->next >= vq->size))
- return -1;
+ /* get the next desc if the current done */
+ if (desc->len <= desc_offset) {
+ if (desc->flags & VRING_DESC_F_NEXT) {
+ /* go on with the current desc chain */
+ desc_current = desc->next;
+ } else if (is_mrg_rxbuf) {
+ /* start with the next desc chain */
+ update_used_ring(dev, vq, desc_chain_head,
+ desc_chain_len);
+ num_buffers++;
+ virtio_hdr->num_buffers++;
+ if (avail_idx == vq->last_used_idx)
+ goto error;
+
+ desc_current =
+ vq->avail->ring[(vq->last_used_idx) &
+ (vq->size - 1)];
+ desc_chain_head = desc_current;
+ desc_chain_len = 0;
+ } else
+ goto error;

- desc = &vq->desc[desc->next];
+ desc_offset = 0;
+ desc = &vq->desc[desc_current];
desc_addr = gpa_to_vva(dev, desc->addr);
if (unlikely(!desc_addr))
- return -1;
-
- desc_offset = 0;
- desc_avail = desc->len;
+ goto error;
}

- cpy_len = RTE_MIN(desc_avail, mbuf_avail);
- rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
- rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
- cpy_len);
+ /* copy mbuf data */
+ cpy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+ rte_memcpy((void *)(uintptr_t)desc_addr,
+ rte_pktmbuf_mtod_offset(mbuf, void *,
+ mbuf_len - mbuf_avail),
+ cpy_len);
vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
- PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
- cpy_len, 0);
-
- mbuf_avail -= cpy_len;
- mbuf_offset += cpy_len;
- desc_avail -= cpy_len;
- desc_offset += cpy_len;
- }
-
- return 0;
-}
-
-/**
- * This function adds buffers to the virtio devices RX virtqueue. Buffers can
- * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that are succesfully
- * added to the RX queue. This function works when the mbuf is scattered, but
- * it doesn't support the mergeable feature.
- */
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
- struct rte_mbuf **pkts, uint32_t count)
-{
- struct vhost_virtqueue *vq;
- uint16_t avail_idx, free_entries, start_idx;
- uint16_t desc_indexes[MAX_PKT_BURST];
- uint16_t used_idx;
- uint32_t i;
-
- LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
- if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
- RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
- dev->vid, __func__, queue_id);
- return 0;
+ PRINT_PACKET(dev, (uintptr_t)desc_addr, cpy_len, 0);
+ mbuf_avail -= cpy_len;
+ desc_addr += cpy_len;
+ desc_offset += cpy_len;
+ desc_chain_len += cpy_len;
}

- vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
- return 0;
+ update_used_ring(dev, vq, desc_chain_head, desc_chain_len);

- avail_idx = *((volatile uint16_t *)&vq->avail->idx);
- start_idx = vq->last_used_idx;
- free_entries = avail_idx - start_idx;
- count = RTE_MIN(count, free_entries);
- count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
- if (count == 0)
- return 0;
-
- LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
- dev->vid, start_idx, start_idx + count);
-
- /* Retrieve all of the desc indexes first to avoid caching issues. */
- rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
- for (i = 0; i < count; i++) {
- used_idx = (start_idx + i) & (vq->size - 1);
- desc_indexes[i] = vq->avail->ring[used_idx];
- vq->used->ring[used_idx].id = desc_indexes[i];
- vq->used->ring[used_idx].len = pkts[i]->pkt_len +
- dev->vhost_hlen;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
-
- rte_prefetch0(&vq->desc[desc_indexes[0]]);
- for (i = 0; i < count; i++) {
- uint16_t desc_idx = desc_indexes[i];
- int err;
+ return 0;

- err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
- if (unlikely(err)) {
- used_idx = (start_idx + i) & (vq->size - 1);
- vq->used->ring[used_idx].len = dev->vhost_hlen;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
+error:
+ /* rollback on any error if last_used_idx update on-the-fly */
+ vq->last_used_idx -= num_buffers;

- if (i + 1 < count)
- rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
- }
+ return -1;
+}

+static inline void __attribute__((always_inline))
+notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+ /* flush changes before updating used->idx */
rte_smp_wmb();
-
- *(volatile uint16_t *)&vq->used->idx += count;
- vq->last_used_idx += count;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
-
- /* flush used->idx update before we read avail->flags. */
+ *(volatile uint16_t *)&vq->used->idx = vq->last_used_idx;
+ vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+ sizeof(vq->used->idx));
+ /* flush used->idx update before reading avail->flags */
rte_mb();
-
- /* Kick the guest if necessary. */
+ /* kick the guest if necessary */
if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
&& (vq->callfd >= 0))
eventfd_write(vq->callfd, (eventfd_t)1);
- return count;
-}
-
-static inline int
-fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
- uint32_t *allocated, uint32_t *vec_idx,
- struct buf_vector *buf_vec)
-{
- uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
- uint32_t vec_id = *vec_idx;
- uint32_t len = *allocated;
-
- while (1) {
- if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
- return -1;
-
- len += vq->desc[idx].len;
- buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
- buf_vec[vec_id].buf_len = vq->desc[idx].len;
- buf_vec[vec_id].desc_idx = idx;
- vec_id++;
-
- if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
- break;
-
- idx = vq->desc[idx].next;
- }
-
- *allocated = len;
- *vec_idx = vec_id;
-
- return 0;
}

-/*
- * Returns -1 on fail, 0 on success
- */
-static inline int
-reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
- uint16_t *end, struct buf_vector *buf_vec)
+uint16_t
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+ struct rte_mbuf **pkts, uint16_t count)
{
- uint16_t cur_idx;
+ struct vhost_virtqueue *vq;
+ struct virtio_net *dev;
+ uint32_t is_mrg_rxbuf = 0;
+ uint32_t pkt_idx = 0;
+ uint32_t pkt_left = count;
uint16_t avail_idx;
- uint32_t allocated = 0;
- uint32_t vec_idx = 0;
- uint16_t tries = 0;
-
- cur_idx = vq->last_used_idx;
-
- while (1) {
- avail_idx = *((volatile uint16_t *)&vq->avail->idx);
- if (unlikely(cur_idx == avail_idx))
- return -1;
-
- if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
- &vec_idx, buf_vec) < 0))
- return -1;
-
- cur_idx++;
- tries++;
-
- if (allocated >= size)
- break;
-
- /*
- * if we tried all available ring items, and still
- * can't get enough buf, it means something abnormal
- * happened.
- */
- if (unlikely(tries >= vq->size))
- return -1;
- }

- *end = cur_idx;
- return 0;
-}
-
-static inline uint32_t __attribute__((always_inline))
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
- uint16_t end_idx, struct rte_mbuf *m,
- struct buf_vector *buf_vec)
-{
- struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
- uint32_t vec_idx = 0;
- uint16_t start_idx = vq->last_used_idx;
- uint16_t cur_idx = start_idx;
- uint64_t desc_addr;
- uint32_t desc_chain_head;
- uint32_t desc_chain_len;
- uint32_t mbuf_offset, mbuf_avail;
- uint32_t desc_offset, desc_avail;
- uint32_t cpy_len;
- uint16_t desc_idx, used_idx;
-
- if (unlikely(m == NULL))
+ if (unlikely(!pkt_left))
return 0;

- LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
- dev->vid, cur_idx, end_idx);
+ pkt_left = RTE_MIN((uint32_t)MAX_PKT_BURST, pkt_left);

- desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
- if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
+ dev = get_device(vid);
+ if (unlikely(!dev))
return 0;

- rte_prefetch0((void *)(uintptr_t)desc_addr);
-
- virtio_hdr.num_buffers = end_idx - start_idx;
- LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
- dev->vid, virtio_hdr.num_buffers);
-
- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
- vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
- PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
- desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
- desc_offset = dev->vhost_hlen;
- desc_chain_head = buf_vec[vec_idx].desc_idx;
- desc_chain_len = desc_offset;
-
- mbuf_avail = rte_pktmbuf_data_len(m);
- mbuf_offset = 0;
- while (mbuf_avail != 0 || m->next != NULL) {
- /* done with current desc buf, get the next one */
- if (desc_avail == 0) {
- desc_idx = buf_vec[vec_idx].desc_idx;
- vec_idx++;
-
- if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
- /* Update used ring with desc information */
- used_idx = cur_idx++ & (vq->size - 1);
- vq->used->ring[used_idx].id = desc_chain_head;
- vq->used->ring[used_idx].len = desc_chain_len;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used,
- ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- desc_chain_head = buf_vec[vec_idx].desc_idx;
- desc_chain_len = 0;
- }
-
- desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
- if (unlikely(!desc_addr))
- return 0;
-
- /* Prefetch buffer address. */
- rte_prefetch0((void *)(uintptr_t)desc_addr);
- desc_offset = 0;
- desc_avail = buf_vec[vec_idx].buf_len;
- }
-
- /* done with current mbuf, get the next one */
- if (mbuf_avail == 0) {
- m = m->next;
-
- mbuf_offset = 0;
- mbuf_avail = rte_pktmbuf_data_len(m);
- }
-
- cpy_len = RTE_MIN(desc_avail, mbuf_avail);
- rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
- rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
- cpy_len);
- vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
- cpy_len);
- PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
- cpy_len, 0);
-
- mbuf_avail -= cpy_len;
- mbuf_offset += cpy_len;
- desc_avail -= cpy_len;
- desc_offset += cpy_len;
- desc_chain_len += cpy_len;
- }
-
- used_idx = cur_idx & (vq->size - 1);
- vq->used->ring[used_idx].id = desc_chain_head;
- vq->used->ring[used_idx].len = desc_chain_len;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
-
- return end_idx - start_idx;
-}
-
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
- struct rte_mbuf **pkts, uint32_t count)
-{
- struct vhost_virtqueue *vq;
- uint32_t pkt_idx = 0, nr_used = 0;
- uint16_t end;
- struct buf_vector buf_vec[BUF_VECTOR_MAX];
-
- LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
- if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
- RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
- dev->vid, __func__, queue_id);
+ if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb)))
return 0;
- }

vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
+ if (unlikely(!vq->enabled))
return 0;

- count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
- if (count == 0)
- return 0;
-
- for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
- uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
+ if (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))
+ is_mrg_rxbuf = 1;

- if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
- &end, buf_vec) < 0)) {
- LOG_DEBUG(VHOST_DATA,
- "(%d) failed to get enough desc from vring\n",
- dev->vid);
+ /* start enqueuing packets 1 by 1 */
+ avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+ while (pkt_left && avail_idx != vq->last_used_idx) {
+ if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
+ is_mrg_rxbuf))
break;
- }
-
- nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
- pkts[pkt_idx], buf_vec);
- rte_smp_wmb();

- *(volatile uint16_t *)&vq->used->idx += nr_used;
- vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
- vq->last_used_idx += nr_used;
+ pkt_idx++;
+ pkt_left--;
}

- if (likely(pkt_idx)) {
- /* flush used->idx update before we read avail->flags. */
- rte_mb();
-
- /* Kick the guest if necessary. */
- if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
- && (vq->callfd >= 0))
- eventfd_write(vq->callfd, (eventfd_t)1);
- }
+ /* update used idx and kick the guest if necessary */
+ if (pkt_idx)
+ notify_guest(dev, vq);

return pkt_idx;
}

-uint16_t
-rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
- struct rte_mbuf **pkts, uint16_t count)
-{
- struct virtio_net *dev = get_device(vid);
-
- if (!dev)
- return 0;
-
- if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
- return virtio_dev_merge_rx(dev, queue_id, pkts, count);
- else
- return virtio_dev_rx(dev, queue_id, pkts, count);
-}
-
static void
parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
{
--
2.7.4
Jianbo Liu
2016-09-22 09:58:28 UTC
Permalink
Post by Zhihong Wang
This patch implements the vhost logic from scratch into a single function
to improve maintainability. This is the baseline version of the new code;
more optimizations will be added in the following patches in this patch set.
Currently there are 2 callbacks for vhost enqueue:
* virtio_dev_merge_rx for mrg_rxbuf turned on cases.
* virtio_dev_rx for mrg_rxbuf turned off cases.
Having 2 callback paths increases maintenance effort. Also, the performance
of the existing code is not optimal, especially when the mrg_rxbuf feature is
turned on.
---
.....
Post by Zhihong Wang
-/*
- * Returns -1 on fail, 0 on success
- */
-static inline int
-reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
- uint16_t *end, struct buf_vector *buf_vec)
+uint16_t
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+ struct rte_mbuf **pkts, uint16_t count)
{
- uint16_t cur_idx;
+ struct vhost_virtqueue *vq;
+ struct virtio_net *dev;
+ uint32_t is_mrg_rxbuf = 0;
+ uint32_t pkt_idx = 0;
+ uint32_t pkt_left = count;
Is pkt_left really needed?
Post by Zhihong Wang
uint16_t avail_idx;
- uint32_t allocated = 0;
- uint32_t vec_idx = 0;
- uint16_t tries = 0;
....
Wang, Zhihong
2016-09-22 10:13:11 UTC
Permalink
-----Original Message-----
Sent: Thursday, September 22, 2016 5:58 PM
Subject: Re: [dpdk-dev] [PATCH v6 2/6] vhost: rewrite enqueue
Post by Zhihong Wang
This patch implements the vhost logic from scratch into a single function
to improve maintainability. This is the baseline version of the new code,
more optimization will be added in the following patches in this patch set.
* virtio_dev_merge_rx for mrg_rxbuf turned on cases.
* virtio_dev_rx for mrg_rxbuf turned off cases.
Having 2 callback paths increases maintenance effort. Also, the
performance
Post by Zhihong Wang
of the existing code is not optimal, especially when the mrg_rxbuf feature
turned on.
---
.....
Post by Zhihong Wang
-/*
- * Returns -1 on fail, 0 on success
- */
-static inline int
-reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
- uint16_t *end, struct buf_vector *buf_vec)
+uint16_t
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+ struct rte_mbuf **pkts, uint16_t count)
{
- uint16_t cur_idx;
+ struct vhost_virtqueue *vq;
+ struct virtio_net *dev;
+ uint32_t is_mrg_rxbuf = 0;
+ uint32_t pkt_idx = 0;
+ uint32_t pkt_left = count;
Is pkt_left really needed?
It's a matter of coding style since there's no underlying difference.
I prefer this way personally.
Post by Zhihong Wang
uint16_t avail_idx;
- uint32_t allocated = 0;
- uint32_t
Zhihong Wang
2016-09-20 02:00:14 UTC
Permalink
This patch removes the useless volatile attribute to allow compiler
optimization.

Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/vhost.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index c2dfc3c..9707dfc 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -71,7 +71,7 @@ struct vhost_virtqueue {
uint32_t size;

/* Last index used on the available ring */
- volatile uint16_t last_used_idx;
+ uint16_t last_used_idx;
#define VIRTIO_INVALID_EVENTFD (-1)
#define VIRTIO_UNINITIALIZED_EVENTFD (-2)
--
2.7.4
Zhihong Wang
2016-09-20 02:00:15 UTC
Permalink
This patch adds descriptor prefetch to hide cache access latency.

Signed-off-by: Zhihong Wang <***@intel.com>
---
lib/librte_vhost/virtio_net.c | 6 ++++++
1 file changed, 6 insertions(+)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 0ada32b..f32a143 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -300,6 +300,12 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
/* start enqueuing packets 1 by 1 */
avail_idx = *((volatile uint16_t *)&vq->avail->idx);
while (pkt_left && avail_idx != vq->last_used_idx) {
+ /* prefetch the next desc */
+ if (pkt_left > 1 && avail_idx != vq->last_used_idx + 1)
+ rte_prefetch0(&vq->desc[vq->avail->ring[
+ (vq->last_used_idx + 1) &
+ (vq->size - 1)]]);
+
if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
is_mrg_rxbuf))
break;
--
2.7.4
Zhihong Wang
2016-09-20 02:00:16 UTC
Permalink
This patch enables batch update of the used ring for better efficiency.

Signed-off-by: Zhihong Wang <***@intel.com>
---
Changes in v6:

1. Introduce a function for used ring write.

---
Changes in v4:

1. Free shadow used ring in the right place.

2. Add failure check for shadow used ring malloc.

lib/librte_vhost/vhost.c | 20 +++++++++++++++--
lib/librte_vhost/vhost.h | 4 ++++
lib/librte_vhost/vhost_user.c | 31 +++++++++++++++++++++-----
lib/librte_vhost/virtio_net.c | 52 ++++++++++++++++++++++++++++++++++---------
4 files changed, 89 insertions(+), 18 deletions(-)

diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 46095c3..cb31cdd 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -119,10 +119,26 @@ cleanup_device(struct virtio_net *dev, int destroy)
static void
free_device(struct virtio_net *dev)
{
+ struct vhost_virtqueue *vq_0;
+ struct vhost_virtqueue *vq_1;
uint32_t i;

- for (i = 0; i < dev->virt_qp_nb; i++)
- rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+ for (i = 0; i < dev->virt_qp_nb; i++) {
+ vq_0 = dev->virtqueue[i * VIRTIO_QNUM];
+ if (vq_0->shadow_used_ring) {
+ rte_free(vq_0->shadow_used_ring);
+ vq_0->shadow_used_ring = NULL;
+ }
+
+ vq_1 = dev->virtqueue[i * VIRTIO_QNUM + 1];
+ if (vq_1->shadow_used_ring) {
+ rte_free(vq_1->shadow_used_ring);
+ vq_1->shadow_used_ring = NULL;
+ }
+
+ /* malloc together, free together */
+ rte_free(vq_0);
+ }

rte_free(dev);
}
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 9707dfc..381dc27 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -85,6 +85,10 @@ struct vhost_virtqueue {

/* Physical address of used ring, for logging */
uint64_t log_guest_addr;
+
+ /* Shadow used ring for performance */
+ struct vring_used_elem *shadow_used_ring;
+ uint32_t shadow_used_idx;
} __rte_cache_aligned;

/* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index eee99e9..d7cf1ed 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -193,7 +193,21 @@ static int
vhost_user_set_vring_num(struct virtio_net *dev,
struct vhost_vring_state *state)
{
- dev->virtqueue[state->index]->size = state->num;
+ struct vhost_virtqueue *vq;
+
+ vq = dev->virtqueue[state->index];
+ vq->size = state->num;
+ if (!vq->shadow_used_ring) {
+ vq->shadow_used_ring = rte_malloc(NULL,
+ vq->size * sizeof(struct vring_used_elem),
+ RTE_CACHE_LINE_SIZE);
+ if (!vq->shadow_used_ring) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to allocate memory"
+ " for shadow used ring.\n");
+ return -1;
+ }
+ }

return 0;
}
@@ -611,14 +625,21 @@ static int
vhost_user_get_vring_base(struct virtio_net *dev,
struct vhost_vring_state *state)
{
+ struct vhost_virtqueue *vq;
+
/* We have to stop the queue (virtio) if it is running. */
if (dev->flags & VIRTIO_DEV_RUNNING) {
dev->flags &= ~VIRTIO_DEV_RUNNING;
notify_ops->destroy_device(dev->vid);
}

+ vq = dev->virtqueue[state->index];
/* Here we are safe to get the last used index */
- state->num = dev->virtqueue[state->index]->last_used_idx;
+ state->num = vq->last_used_idx;
+ if (vq->shadow_used_ring) {
+ rte_free(vq->shadow_used_ring);
+ vq->shadow_used_ring = NULL;
+ }

RTE_LOG(INFO, VHOST_CONFIG,
"vring base idx:%d file:%d\n", state->index, state->num);
@@ -627,10 +648,10 @@ vhost_user_get_vring_base(struct virtio_net *dev,
* sent and only sent in vhost_vring_stop.
* TODO: cleanup the vring, it isn't usable since here.
*/
- if (dev->virtqueue[state->index]->kickfd >= 0)
- close(dev->virtqueue[state->index]->kickfd);
+ if (vq->kickfd >= 0)
+ close(vq->kickfd);

- dev->virtqueue[state->index]->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+ vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;

return 0;
}
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index f32a143..8f2882b 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -134,17 +134,40 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
}

static inline void __attribute__((always_inline))
-update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
- uint32_t desc_chain_head, uint32_t desc_chain_len)
+update_used_ring(struct vhost_virtqueue *vq, uint32_t desc_chain_head,
+ uint32_t desc_chain_len)
{
- uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
-
- vq->used->ring[used_idx].id = desc_chain_head;
- vq->used->ring[used_idx].len = desc_chain_len;
+ vq->shadow_used_ring[vq->shadow_used_idx].id = desc_chain_head;
+ vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
+ vq->shadow_used_idx++;
vq->last_used_idx++;
- vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
- ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
+}
+
+static inline void __attribute__((always_inline))
+write_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t used_idx, uint32_t used_idx_shadow, uint32_t size)
+{
+ rte_memcpy(&vq->used->ring[used_idx],
+ &vq->shadow_used_ring[used_idx_shadow],
+ size * sizeof(struct vring_used_elem));
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used, ring[used_idx]),
+ size * sizeof(struct vring_used_elem));
+}
+
+static inline void __attribute__((always_inline))
+flush_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t used_idx)
+{
+ if (used_idx + vq->shadow_used_idx < vq->size) {
+ write_used_ring(dev, vq, used_idx, 0, vq->shadow_used_idx);
+ } else {
+ uint32_t size_0 = vq->size - used_idx;
+ uint32_t size_1 = vq->shadow_used_idx - size_0;
+
+ write_used_ring(dev, vq, used_idx, 0, size_0);
+ write_used_ring(dev, vq, 0, size_0, size_1);
+ }
}

static inline int __attribute__((always_inline))
@@ -204,7 +227,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
desc_current = desc->next;
} else if (is_mrg_rxbuf) {
/* start with the next desc chain */
- update_used_ring(dev, vq, desc_chain_head,
+ update_used_ring(vq, desc_chain_head,
desc_chain_len);
num_buffers++;
virtio_hdr->num_buffers++;
@@ -240,7 +263,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
desc_chain_len += cpy_len;
}

- update_used_ring(dev, vq, desc_chain_head, desc_chain_len);
+ update_used_ring(vq, desc_chain_head, desc_chain_len);

return 0;

@@ -273,6 +296,7 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
{
struct vhost_virtqueue *vq;
struct virtio_net *dev;
+ uint32_t used_idx;
uint32_t is_mrg_rxbuf = 0;
uint32_t pkt_idx = 0;
uint32_t pkt_left = count;
@@ -298,6 +322,8 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
is_mrg_rxbuf = 1;

/* start enqueuing packets 1 by 1 */
+ vq->shadow_used_idx = 0;
+ used_idx = vq->last_used_idx & (vq->size - 1);
avail_idx = *((volatile uint16_t *)&vq->avail->idx);
while (pkt_left && avail_idx != vq->last_used_idx) {
/* prefetch the next desc */
@@ -314,6 +340,10 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
pkt_left--;
}

+ /* batch update used ring for better performance */
+ if (likely(vq->shadow_used_idx > 0))
+ flush_used_ring(dev, vq, used_idx);
+
/* update used idx and kick the guest if necessary */
if (pkt_idx)
notify_guest(dev, vq);
--
2.7.4
Zhihong Wang
2016-09-20 02:00:17 UTC
Permalink
This patch reorders the code to delay virtio header write to improve
cache access efficiency for cases where the mrg_rxbuf feature is turned
on. CPU pipeline stall cycles can be significantly reduced.

Virtio header write and mbuf data copy are both remote store operations
which take a long time to finish. It's a good idea to put them together
to remove bubbles in between, to let as many remote store instructions
as possible go into the store buffer at the same time to hide latency, and
to let the H/W prefetcher go to work as early as possible.

On a Haswell machine, about 100 cycles can be saved per packet by this
patch alone. Taking 64B packets traffic for example, this means about 60%
efficiency improvement for the enqueue operation.

Signed-off-by: Zhihong Wang <***@intel.com>
---
Changes in v3:

1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

2. Rename variables to follow naming convention.

lib/librte_vhost/virtio_net.c | 20 ++++++++++++++------
1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 8f2882b..11a2c1a 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -185,6 +185,7 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
uint32_t mbuf_len;
uint32_t mbuf_avail;
uint32_t cpy_len;
+ uint32_t copy_virtio_hdr;
uint32_t num_buffers = 0;

/* start with the first mbuf of the packet */
@@ -199,12 +200,12 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
if (unlikely(!desc_addr))
goto error;

- /* handle virtio header */
+ /*
+ * handle virtio header, the actual write operation is delayed
+ * for cache optimization, to reduce CPU pipeline stall cycles.
+ */
virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
- virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
- if (is_mrg_rxbuf)
- virtio_hdr->num_buffers = 1;
-
+ copy_virtio_hdr = 1;
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
desc_offset = dev->vhost_hlen;
@@ -249,8 +250,15 @@ enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
goto error;
}

- /* copy mbuf data */
+ /* copy virtio header and mbuf data */
cpy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+ if (copy_virtio_hdr) {
+ copy_virtio_hdr = 0;
+ virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+ if (is_mrg_rxbuf)
+ virtio_hdr->num_buffers = num_buffers + 1;
+ }
+
rte_memcpy((void *)(uintptr_t)desc_addr,
rte_pktmbuf_mtod_offset(mbuf, void *,
mbuf_len - mbuf_avail),
--
2.7.4
Maxime Coquelin
2016-09-21 04:32:54 UTC
Permalink
Post by Zhihong Wang
This patch reorders the code to delay virtio header write to improve
cache access efficiency for cases where the mrg_rxbuf feature is turned
on. CPU pipeline stall cycles can be significantly reduced.
Virtio header write and mbuf data copy are all remote store operations
which take a long time to finish. It's a good idea to put them together
to remove bubbles in between, to let as many remote store instructions
as possible go into the store buffer at the same time to hide latency, and
to let the H/W prefetcher go to work as early as possible.
On a Haswell machine, about 100 cycles can be saved per packet by this
patch alone. Taking 64B packet traffic as an example, this means about 60%
efficiency improvement for the enqueue operation.
Thanks for the detailed information, I appreciate it.

Maxime
Yuanhan Liu
2016-09-21 02:26:56 UTC
Permalink
Hi Maxime,

Do you have more comments about this set? If no, I think I could merge
it shortly.

Thanks.

--yliu
Post by Zhihong Wang
This patch set optimizes the vhost enqueue function.
It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
* Higher maximum throughput can be achieved for fast frontends like DPDK
virtio pmd.
* Better scalability can be achieved, in that each vhost core can support
more connections because it takes fewer cycles to handle each single
frontend.
1. A Windows VM compatibility fix for vhost enqueue in 16.07 release.
2. A baseline patch to rewrite the vhost logic.
3. A series of optimization patches added upon the baseline.
1. Reorder code to reduce CPU pipeline stall cycles.
2. Batch update the used ring for better efficiency.
3. Prefetch descriptor to hide cache latency.
4. Remove useless volatile attribute to allow compiler optimization.
Code reordering and batch used ring update bring most of the performance
improvements.
* virtio_dev_merge_rx for mrg_rxbuf turned on cases.
* virtio_dev_rx for mrg_rxbuf turned off cases.
The performance of the existing code is not optimal, especially when the
mrg_rxbuf feature is turned on. Besides, having 2 callback paths increases
maintenance efforts.
Also, there's a compatibility issue in the existing code which causes
a Windows VM to hang when the mrg_rxbuf feature is turned on.
---
1. Merge duplicated code.
2. Introduce a function for used ring write.
3. Add necessary comments.
---
1. Rebase to dpdk-next-virtio master.
2. Rename variables to keep consistent in naming style.
3. Small changes like return value adjustment and vertical alignment.
4. Add details in commit log.
---
1. Fix a Windows VM compatibility issue.
2. Free shadow used ring in the right place.
3. Add failure check for shadow used ring malloc.
4. Refactor the code for clearer logic.
5. Add PRINT_PACKET for debugging.
---
1. Remove unnecessary memset which causes frontend stall on SNB & IVB.
2. Rename variables to follow naming convention.
3. Rewrite enqueue and delete the obsolete in the same patch.
---
1. Split the big function into several small ones.
2. Use multiple patches to explain each optimization.
3. Add comments.
vhost: fix windows vm hang
vhost: rewrite enqueue
vhost: remove useless volatile
vhost: add desc prefetch
vhost: batch update used ring
vhost: optimize cache access
lib/librte_vhost/vhost.c | 20 +-
lib/librte_vhost/vhost.h | 6 +-
lib/librte_vhost/vhost_user.c | 31 ++-
lib/librte_vhost/virtio_net.c | 541 ++++++++++++++----------------------------
4 files changed, 225 insertions(+), 373 deletions(-)
--
2.7.4
Maxime Coquelin
2016-09-21 04:39:50 UTC
Permalink
Hi Yuanhan,
Post by Wang, Zhihong
Hi Maxime,
Do you have more comments about this set? If no, I think I could merge
it shortly.
No more comments, this is good to me.

Feel free to add:
Reviewed-by: Maxime Coquelin <***@redhat.com>

Thanks,
Maxime
Post by Wang, Zhihong
Thanks.
--yliu
Post by Zhihong Wang
This patch set optimizes the vhost enqueue function.
It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
* Higher maximum throughput can be achieved for fast frontends like DPDK
virtio pmd.
* Better scalability can be achieved, in that each vhost core can support
more connections because it takes fewer cycles to handle each single
frontend.
1. A Windows VM compatibility fix for vhost enqueue in 16.07 release.
2. A baseline patch to rewrite the vhost logic.
3. A series of optimization patches added upon the baseline.
1. Reorder code to reduce CPU pipeline stall cycles.
2. Batch update the used ring for better efficiency.
3. Prefetch descriptor to hide cache latency.
4. Remove useless volatile attribute to allow compiler optimization.
Code reordering and batch used ring update bring most of the performance
improvements.
* virtio_dev_merge_rx for mrg_rxbuf turned on cases.
* virtio_dev_rx for mrg_rxbuf turned off cases.
The performance of the existing code is not optimal, especially when the
mrg_rxbuf feature is turned on. Besides, having 2 callback paths increases
maintenance efforts.
Also, there's a compatibility issue in the existing code which causes
a Windows VM to hang when the mrg_rxbuf feature is turned on.
---
1. Merge duplicated code.
2. Introduce a function for used ring write.
3. Add necessary comments.
---
1. Rebase to dpdk-next-virtio master.
2. Rename variables to keep consistent in naming style.
3. Small changes like return value adjustment and vertical alignment.
4. Add details in commit log.
---
1. Fix a Windows VM compatibility issue.
2. Free shadow used ring in the right place.
3. Add failure check for shadow used ring malloc.
4. Refactor the code for clearer logic.
5. Add PRINT_PACKET for debugging.
---
1. Remove unnecessary memset which causes frontend stall on SNB & IVB.
2. Rename variables to follow naming convention.
3. Rewrite enqueue and delete the obsolete in the same patch.
---
1. Split the big function into several small ones.
2. Use multiple patches to explain each optimization.
3. Add comments.
vhost: fix windows vm hang
vhost: rewrite enqueue
vhost: remove useless volatile
vhost: add desc prefetch
vhost: batch update used ring
vhost: optimize cache access
lib/librte_vhost/vhost.c | 20 +-
lib/librte_vhost/vhost.h | 6 +-
lib/librte_vhost/vhost_user.c | 31 ++-
lib/librte_vhost/virtio_net.c | 541 ++++++++++++++----------------------------
4 files changed, 225 insertions(+), 373 deletions(-)
--
2.7.4
Continue reading on narkive:
Loading...