Discussion:
[dpdk-dev] [RFC 1/8] mbuf: make segment prefree function public
(too old to reply)
Olivier Matz
2017-01-24 15:19:26 UTC
Permalink
Document the function and make it public, since it is used in several
places in the drivers. The old one is marked as deprecated.

Signed-off-by: Olivier Matz <***@6wind.com>
---
drivers/net/enic/enic_rxtx.c | 2 +-
drivers/net/fm10k/fm10k_rxtx.c | 6 +++---
drivers/net/fm10k/fm10k_rxtx_vec.c | 6 +++---
drivers/net/i40e/i40e_rxtx_vec_common.h | 6 +++---
drivers/net/ixgbe/ixgbe_rxtx.c | 2 +-
drivers/net/ixgbe/ixgbe_rxtx_vec_common.h | 6 +++---
drivers/net/virtio/virtio_rxtx_simple.h | 6 +++---
lib/librte_mbuf/rte_mbuf.h | 30 +++++++++++++++++++++++++++---
8 files changed, 44 insertions(+), 20 deletions(-)

diff --git a/drivers/net/enic/enic_rxtx.c b/drivers/net/enic/enic_rxtx.c
index 26b83ae..f8c8ad0 100644
--- a/drivers/net/enic/enic_rxtx.c
+++ b/drivers/net/enic/enic_rxtx.c
@@ -473,7 +473,7 @@ static inline void enic_free_wq_bufs(struct vnic_wq *wq, u16 completed_index)
pool = ((struct rte_mbuf *)buf->mb)->pool;
for (i = 0; i < nb_to_free; i++) {
buf = &wq->bufs[tail_idx];
- m = __rte_pktmbuf_prefree_seg((struct rte_mbuf *)(buf->mb));
+ m = rte_pktmbuf_prefree_seg((struct rte_mbuf *)(buf->mb));
buf->mb = NULL;

if (unlikely(m == NULL)) {
diff --git a/drivers/net/fm10k/fm10k_rxtx.c b/drivers/net/fm10k/fm10k_rxtx.c
index 144e5e6..c9bb04a 100644
--- a/drivers/net/fm10k/fm10k_rxtx.c
+++ b/drivers/net/fm10k/fm10k_rxtx.c
@@ -434,12 +434,12 @@ static inline void tx_free_bulk_mbuf(struct rte_mbuf **txep, int num)
if (unlikely(num == 0))
return;

- m = __rte_pktmbuf_prefree_seg(txep[0]);
+ m = rte_pktmbuf_prefree_seg(txep[0]);
if (likely(m != NULL)) {
free[0] = m;
nb_free = 1;
for (i = 1; i < num; i++) {
- m = __rte_pktmbuf_prefree_seg(txep[i]);
+ m = rte_pktmbuf_prefree_seg(txep[i]);
if (likely(m != NULL)) {
if (likely(m->pool == free[0]->pool))
free[nb_free++] = m;
@@ -455,7 +455,7 @@ static inline void tx_free_bulk_mbuf(struct rte_mbuf **txep, int num)
rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
} else {
for (i = 1; i < num; i++) {
- m = __rte_pktmbuf_prefree_seg(txep[i]);
+ m = rte_pktmbuf_prefree_seg(txep[i]);
if (m != NULL)
rte_mempool_put(m->pool, m);
txep[i] = NULL;
diff --git a/drivers/net/fm10k/fm10k_rxtx_vec.c b/drivers/net/fm10k/fm10k_rxtx_vec.c
index 27f3e43..825e3c1 100644
--- a/drivers/net/fm10k/fm10k_rxtx_vec.c
+++ b/drivers/net/fm10k/fm10k_rxtx_vec.c
@@ -754,12 +754,12 @@ fm10k_tx_free_bufs(struct fm10k_tx_queue *txq)
* next_dd - (rs_thresh-1)
*/
txep = &txq->sw_ring[txq->next_dd - (n - 1)];
- m = __rte_pktmbuf_prefree_seg(txep[0]);
+ m = rte_pktmbuf_prefree_seg(txep[0]);
if (likely(m != NULL)) {
free[0] = m;
nb_free = 1;
for (i = 1; i < n; i++) {
- m = __rte_pktmbuf_prefree_seg(txep[i]);
+ m = rte_pktmbuf_prefree_seg(txep[i]);
if (likely(m != NULL)) {
if (likely(m->pool == free[0]->pool))
free[nb_free++] = m;
@@ -774,7 +774,7 @@ fm10k_tx_free_bufs(struct fm10k_tx_queue *txq)
rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
} else {
for (i = 1; i < n; i++) {
- m = __rte_pktmbuf_prefree_seg(txep[i]);
+ m = rte_pktmbuf_prefree_seg(txep[i]);
if (m != NULL)
rte_mempool_put(m->pool, m);
}
diff --git a/drivers/net/i40e/i40e_rxtx_vec_common.h b/drivers/net/i40e/i40e_rxtx_vec_common.h
index 3745558..76031fe 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_common.h
+++ b/drivers/net/i40e/i40e_rxtx_vec_common.h
@@ -123,12 +123,12 @@ i40e_tx_free_bufs(struct i40e_tx_queue *txq)
* tx_next_dd - (tx_rs_thresh-1)
*/
txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];
- m = __rte_pktmbuf_prefree_seg(txep[0].mbuf);
+ m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
if (likely(m != NULL)) {
free[0] = m;
nb_free = 1;
for (i = 1; i < n; i++) {
- m = __rte_pktmbuf_prefree_seg(txep[i].mbuf);
+ m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
if (likely(m != NULL)) {
if (likely(m->pool == free[0]->pool)) {
free[nb_free++] = m;
@@ -144,7 +144,7 @@ i40e_tx_free_bufs(struct i40e_tx_queue *txq)
rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
} else {
for (i = 1; i < n; i++) {
- m = __rte_pktmbuf_prefree_seg(txep[i].mbuf);
+ m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
if (m != NULL)
rte_mempool_put(m->pool, m);
}
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 36f1c02..dd53cc6 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -136,7 +136,7 @@ ixgbe_tx_free_bufs(struct ixgbe_tx_queue *txq)

for (i = 0; i < txq->tx_rs_thresh; ++i, ++txep) {
/* free buffers one at a time */
- m = __rte_pktmbuf_prefree_seg(txep->mbuf);
+ m = rte_pktmbuf_prefree_seg(txep->mbuf);
txep->mbuf = NULL;

if (unlikely(m == NULL))
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_common.h b/drivers/net/ixgbe/ixgbe_rxtx_vec_common.h
index a3473b9..a83afe5 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_common.h
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_common.h
@@ -123,12 +123,12 @@ ixgbe_tx_free_bufs(struct ixgbe_tx_queue *txq)
* tx_next_dd - (tx_rs_thresh-1)
*/
txep = &txq->sw_ring_v[txq->tx_next_dd - (n - 1)];
- m = __rte_pktmbuf_prefree_seg(txep[0].mbuf);
+ m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
if (likely(m != NULL)) {
free[0] = m;
nb_free = 1;
for (i = 1; i < n; i++) {
- m = __rte_pktmbuf_prefree_seg(txep[i].mbuf);
+ m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
if (likely(m != NULL)) {
if (likely(m->pool == free[0]->pool))
free[nb_free++] = m;
@@ -143,7 +143,7 @@ ixgbe_tx_free_bufs(struct ixgbe_tx_queue *txq)
rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
} else {
for (i = 1; i < n; i++) {
- m = __rte_pktmbuf_prefree_seg(txep[i].mbuf);
+ m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
if (m != NULL)
rte_mempool_put(m->pool, m);
}
diff --git a/drivers/net/virtio/virtio_rxtx_simple.h b/drivers/net/virtio/virtio_rxtx_simple.h
index b08f859..f531c54 100644
--- a/drivers/net/virtio/virtio_rxtx_simple.h
+++ b/drivers/net/virtio/virtio_rxtx_simple.h
@@ -98,13 +98,13 @@ virtio_xmit_cleanup(struct virtqueue *vq)
desc_idx = (uint16_t)(vq->vq_used_cons_idx &
((vq->vq_nentries >> 1) - 1));
m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
- m = __rte_pktmbuf_prefree_seg(m);
+ m = rte_pktmbuf_prefree_seg(m);
if (likely(m != NULL)) {
free[0] = m;
nb_free = 1;
for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
- m = __rte_pktmbuf_prefree_seg(m);
+ m = rte_pktmbuf_prefree_seg(m);
if (likely(m != NULL)) {
if (likely(m->pool == free[0]->pool))
free[nb_free++] = m;
@@ -123,7 +123,7 @@ virtio_xmit_cleanup(struct virtqueue *vq)
} else {
for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
- m = __rte_pktmbuf_prefree_seg(m);
+ m = rte_pktmbuf_prefree_seg(m);
if (m != NULL)
rte_mempool_put(m->pool, m);
}
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index bfce9f4..73b79c0 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -1212,8 +1212,23 @@ static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
__rte_mbuf_raw_free(md);
}

-static inline struct rte_mbuf* __attribute__((always_inline))
-__rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
+/**
+ * Decrease reference counter and unlink a mbuf segment
+ *
+ * This function does the same than a free, except that it does not
+ * return the segment to its pool.
+ * It decreases the reference counter, and if it reaches 0, it is
+ * detached from its parent for an indirect mbuf.
+ *
+ * @param m
+ * The mbuf to be unlinked
+ * @return
+ * - (m) if it is the last reference. It can be recycled or freed.
+ * - (NULL) if the mbuf still has remaining references on it.
+ */
+__attribute__((always_inline))
+static inline struct rte_mbuf *
+rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
{
__rte_mbuf_sanity_check(m, 0);

@@ -1226,6 +1241,14 @@ __rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
return NULL;
}

+/* deprecated, replaced by rte_pktmbuf_prefree_seg() */
+__rte_deprecated
+static inline struct rte_mbuf *
+__rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
+{
+ return rte_pktmbuf_prefree_seg(m);
+}
+
/**
* Free a segment of a packet mbuf into its original mempool.
*
@@ -1238,7 +1261,8 @@ __rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
static inline void __attribute__((always_inline))
rte_pktmbuf_free_seg(struct rte_mbuf *m)
{
- if (likely(NULL != (m = __rte_pktmbuf_prefree_seg(m)))) {
+ m = rte_pktmbuf_prefree_seg(m);
+ if (likely(m != NULL)) {
m->next = NULL;
__rte_mbuf_raw_free(m);
}
--
2.8.1
Olivier Matz
2017-01-24 15:19:27 UTC
Permalink
Rename __rte_mbuf_raw_free() to rte_mbuf_raw_free() and make
it public. The old function is kept for compatibility but is marked as
deprecated.

The next commit changes the behavior of rte_mbuf_raw_free() to
make it more consistent with rte_mbuf_raw_alloc().

Signed-off-by: Olivier Matz <***@6wind.com>
---
drivers/net/ena/ena_ethdev.c | 2 +-
drivers/net/mlx5/mlx5_rxtx.c | 6 +++---
drivers/net/mpipe/mpipe_tilegx.c | 2 +-
lib/librte_mbuf/rte_mbuf.h | 22 ++++++++++++++++------
4 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ena/ena_ethdev.c b/drivers/net/ena/ena_ethdev.c
index 8497cd7..4aac6a9 100644
--- a/drivers/net/ena/ena_ethdev.c
+++ b/drivers/net/ena/ena_ethdev.c
@@ -685,7 +685,7 @@ static void ena_rx_queue_release_bufs(struct ena_ring *ring)
ring->rx_buffer_info[ring->next_to_clean & ring_mask];

if (m)
- __rte_mbuf_raw_free(m);
+ rte_mbuf_raw_free(m);

ring->next_to_clean++;
}
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 56c7f78..a518a42 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1328,7 +1328,7 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
assert(pkt != (*rxq->elts)[idx]);
rep = NEXT(pkt);
rte_mbuf_refcnt_set(pkt, 0);
- __rte_mbuf_raw_free(pkt);
+ rte_mbuf_raw_free(pkt);
pkt = rep;
}
break;
@@ -1339,13 +1339,13 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
&rss_hash_res);
if (!len) {
rte_mbuf_refcnt_set(rep, 0);
- __rte_mbuf_raw_free(rep);
+ rte_mbuf_raw_free(rep);
break;
}
if (unlikely(len == -1)) {
/* RX error, packet is likely too large. */
rte_mbuf_refcnt_set(rep, 0);
- __rte_mbuf_raw_free(rep);
+ rte_mbuf_raw_free(rep);
++rxq->stats.idropped;
goto skip;
}
diff --git a/drivers/net/mpipe/mpipe_tilegx.c b/drivers/net/mpipe/mpipe_tilegx.c
index 7bbd168..eedc0b3 100644
--- a/drivers/net/mpipe/mpipe_tilegx.c
+++ b/drivers/net/mpipe/mpipe_tilegx.c
@@ -549,7 +549,7 @@ mpipe_recv_flush_stack(struct mpipe_dev_priv *priv)
mbuf->data_len = 0;
mbuf->pkt_len = 0;

- __rte_mbuf_raw_free(mbuf);
+ rte_mbuf_raw_free(mbuf);
}
}

diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 73b79c0..8ff2290 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -788,20 +788,30 @@ static inline struct rte_mbuf *rte_mbuf_raw_alloc(struct rte_mempool *mp)
}

/**
- * @internal Put mbuf back into its original mempool.
- * The use of that function is reserved for RTE internal needs.
- * Please use rte_pktmbuf_free().
+ * Put mbuf back into its original mempool.
+ *
+ * The caller must ensure that the mbuf is direct and that the
+ * reference counter is 0.
*
* @param m
* The mbuf to be freed.
*/
static inline void __attribute__((always_inline))
-__rte_mbuf_raw_free(struct rte_mbuf *m)
+rte_mbuf_raw_free(struct rte_mbuf *m)
{
+ RTE_ASSERT(RTE_MBUF_DIRECT(m));
RTE_ASSERT(rte_mbuf_refcnt_read(m) == 0);
rte_mempool_put(m->pool, m);
}

+/* compat with older versions */
+__rte_deprecated
+static inline void __attribute__((always_inline))
+__rte_mbuf_raw_free(struct rte_mbuf *m)
+{
+ rte_mbuf_raw_free(m);
+}
+
/* Operations on ctrl mbuf */

/**
@@ -1209,7 +1219,7 @@ static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
m->ol_flags = 0;

if (rte_mbuf_refcnt_update(md, -1) == 0)
- __rte_mbuf_raw_free(md);
+ rte_mbuf_raw_free(md);
}

/**
@@ -1264,7 +1274,7 @@ rte_pktmbuf_free_seg(struct rte_mbuf *m)
m = rte_pktmbuf_prefree_seg(m);
if (likely(m != NULL)) {
m->next = NULL;
- __rte_mbuf_raw_free(m);
+ rte_mbuf_raw_free(m);
}
}
--
2.8.1
Olivier Matz
2017-01-24 15:19:28 UTC
Permalink
Set the value of m->refcnt to 1, m->nb_segs to 1 and m->next
to NULL when the mbuf is stored inside the mempool (unused).
This is done in rte_pktmbuf_prefree_seg(), before freeing or
recycling a mbuf.

Before this patch, the value of m->refcnt was expected to be 0
while in pool.

The objectives are:

- to avoid having drivers set m->next to NULL in the early Rx path, since
this field is in the second 64B of the mbuf and accessing it could
trigger a cache miss

- to rationalize the behavior of raw_alloc/raw_free: each is now the
symmetric counterpart of the other, and refcnt is never changed in these functions.

Signed-off-by: Olivier Matz <***@6wind.com>
---
drivers/net/mlx5/mlx5_rxtx.c | 5 ++---
drivers/net/mpipe/mpipe_tilegx.c | 1 +
lib/librte_mbuf/rte_mbuf.c | 2 ++
lib/librte_mbuf/rte_mbuf.h | 45 +++++++++++++++++++++++++++++-----------
4 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index a518a42..294dfde 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1327,7 +1327,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
while (pkt != seg) {
assert(pkt != (*rxq->elts)[idx]);
rep = NEXT(pkt);
- rte_mbuf_refcnt_set(pkt, 0);
+ NEXT(pkt) = NULL;
+ NB_SEGS(pkt) = 1;
rte_mbuf_raw_free(pkt);
pkt = rep;
}
@@ -1338,13 +1339,11 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt,
&rss_hash_res);
if (!len) {
- rte_mbuf_refcnt_set(rep, 0);
rte_mbuf_raw_free(rep);
break;
}
if (unlikely(len == -1)) {
/* RX error, packet is likely too large. */
- rte_mbuf_refcnt_set(rep, 0);
rte_mbuf_raw_free(rep);
++rxq->stats.idropped;
goto skip;
diff --git a/drivers/net/mpipe/mpipe_tilegx.c b/drivers/net/mpipe/mpipe_tilegx.c
index eedc0b3..560ffe9 100644
--- a/drivers/net/mpipe/mpipe_tilegx.c
+++ b/drivers/net/mpipe/mpipe_tilegx.c
@@ -548,6 +548,7 @@ mpipe_recv_flush_stack(struct mpipe_dev_priv *priv)
mbuf->packet_type = 0;
mbuf->data_len = 0;
mbuf->pkt_len = 0;
+ mbuf->next = NULL;

rte_mbuf_raw_free(mbuf);
}
diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
index 72ad91e..0acc810 100644
--- a/lib/librte_mbuf/rte_mbuf.c
+++ b/lib/librte_mbuf/rte_mbuf.c
@@ -145,6 +145,8 @@ rte_pktmbuf_init(struct rte_mempool *mp,
m->pool = mp;
m->nb_segs = 1;
m->port = 0xff;
+ rte_mbuf_refcnt_set(m, 1);
+ m->next = NULL;
}

/* helper to create a mbuf pool */
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 8ff2290..bbd0700 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -766,6 +766,11 @@ rte_mbuf_sanity_check(const struct rte_mbuf *m, int is_header);
* initializing all the required fields. See rte_pktmbuf_reset().
* For standard needs, prefer rte_pktmbuf_alloc().
*
+ * The caller can expect that the following fields of the mbuf structure
+ * are initialized: buf_addr, buf_physaddr, buf_len, refcnt=1, nb_segs=1,
+ * next=NULL, pool, priv_size. The other fields must be initialized
+ * by the caller.
+ *
* @param mp
* The mempool from which mbuf is allocated.
* @return
@@ -780,8 +785,9 @@ static inline struct rte_mbuf *rte_mbuf_raw_alloc(struct rte_mempool *mp)
if (rte_mempool_get(mp, &mb) < 0)
return NULL;
m = (struct rte_mbuf *)mb;
- RTE_ASSERT(rte_mbuf_refcnt_read(m) == 0);
- rte_mbuf_refcnt_set(m, 1);
+ RTE_ASSERT(rte_mbuf_refcnt_read(m) == 1);
+ RTE_ASSERT(m->next == NULL);
+ RTE_ASSERT(m->nb_segs == 1);
__rte_mbuf_sanity_check(m, 0);

return m;
@@ -790,8 +796,13 @@ static inline struct rte_mbuf *rte_mbuf_raw_alloc(struct rte_mempool *mp)
/**
* Put mbuf back into its original mempool.
*
- * The caller must ensure that the mbuf is direct and that the
- * reference counter is 0.
+ * The caller must ensure that the mbuf is direct and properly
+ * reinitialized (refcnt=1, next=NULL, nb_segs=1), as done by
+ * rte_pktmbuf_prefree_seg().
+ *
+ * This function should be used with care, when optimization is
+ * required. For standard needs, prefer rte_pktmbuf_free() or
+ * rte_pktmbuf_free_seg().
*
* @param m
* The mbuf to be freed.
@@ -800,13 +811,16 @@ static inline void __attribute__((always_inline))
rte_mbuf_raw_free(struct rte_mbuf *m)
{
RTE_ASSERT(RTE_MBUF_DIRECT(m));
- RTE_ASSERT(rte_mbuf_refcnt_read(m) == 0);
+ RTE_ASSERT(rte_mbuf_refcnt_read(m) == 1);
+ RTE_ASSERT(m->next == NULL);
+ RTE_ASSERT(m->nb_segs == 1);
+ __rte_mbuf_sanity_check(m, 0);
rte_mempool_put(m->pool, m);
}

/* compat with older versions */
__rte_deprecated
-static inline void __attribute__((always_inline))
+static inline void
__rte_mbuf_raw_free(struct rte_mbuf *m)
{
rte_mbuf_raw_free(m);
@@ -1218,8 +1232,12 @@ static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
m->data_len = 0;
m->ol_flags = 0;

- if (rte_mbuf_refcnt_update(md, -1) == 0)
+ if (rte_mbuf_refcnt_update(md, -1) == 0) {
+ md->next = NULL;
+ md->nb_segs = 1;
+ rte_mbuf_refcnt_set(md, 1);
rte_mbuf_raw_free(md);
+ }
}

/**
@@ -1243,9 +1261,14 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
__rte_mbuf_sanity_check(m, 0);

if (likely(rte_mbuf_refcnt_update(m, -1) == 0)) {
- /* if this is an indirect mbuf, it is detached. */
- if (RTE_MBUF_INDIRECT(m))
+ if (RTE_MBUF_INDIRECT(m)) {
rte_pktmbuf_detach(m);
+ /* next, nb_segs, refcnt are reset */
+ } else {
+ m->next = NULL;
+ m->nb_segs = 1;
+ rte_mbuf_refcnt_set(m, 1);
+ }
return m;
}
return NULL;
@@ -1272,10 +1295,8 @@ static inline void __attribute__((always_inline))
rte_pktmbuf_free_seg(struct rte_mbuf *m)
{
m = rte_pktmbuf_prefree_seg(m);
- if (likely(m != NULL)) {
- m->next = NULL;
+ if (likely(m != NULL))
rte_mbuf_raw_free(m);
- }
}

/**
--
2.8.1
Bruce Richardson
2017-01-24 15:50:49 UTC
Permalink
Post by Olivier Matz
Set the value of m->refcnt to 1, m->nb_segs to 1 and m->next
to NULL when the mbuf is stored inside the mempool (unused).
This is done in rte_pktmbuf_prefree_seg(), before freeing or
recycling a mbuf.
Before this patch, the value of m->refcnt was expected to be 0
while in pool.
- to avoid drivers to set m->next to NULL in the early Rx path, since
this field is in the second 64B of the mbuf and its access could
trigger a cache miss
- rationalize the behavior of raw_alloc/raw_free: one is now the
symmetric of the other, and refcnt is never changed in these functions.
---
drivers/net/mlx5/mlx5_rxtx.c | 5 ++---
drivers/net/mpipe/mpipe_tilegx.c | 1 +
lib/librte_mbuf/rte_mbuf.c | 2 ++
lib/librte_mbuf/rte_mbuf.h | 45 +++++++++++++++++++++++++++++-----------
4 files changed, 38 insertions(+), 15 deletions(-)
<snip>
Post by Olivier Matz
/* compat with older versions */
__rte_deprecated
-static inline void __attribute__((always_inline))
+static inline void
__rte_mbuf_raw_free(struct rte_mbuf *m)
{
rte_mbuf_raw_free(m);
@@ -1218,8 +1232,12 @@ static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
m->data_len = 0;
m->ol_flags = 0;
- if (rte_mbuf_refcnt_update(md, -1) == 0)
+ if (rte_mbuf_refcnt_update(md, -1) == 0) {
Minor nit, but in the case that we only have a single reference to the
mbufs, we are always setting that to zero just to re-increment it to 1
again.
Post by Olivier Matz
+ md->next = NULL;
+ md->nb_segs = 1;
+ rte_mbuf_refcnt_set(md, 1);
rte_mbuf_raw_free(md);
+ }
}
/**
Olivier Matz
2017-02-28 14:51:11 UTC
Permalink
Hi Bruce,

On Tue, 24 Jan 2017 15:50:49 +0000, Bruce Richardson
Post by Bruce Richardson
Post by Olivier Matz
Set the value of m->refcnt to 1, m->nb_segs to 1 and m->next
to NULL when the mbuf is stored inside the mempool (unused).
This is done in rte_pktmbuf_prefree_seg(), before freeing or
recycling a mbuf.
Before this patch, the value of m->refcnt was expected to be 0
while in pool.
- to avoid drivers to set m->next to NULL in the early Rx path,
since this field is in the second 64B of the mbuf and its access
could trigger a cache miss
- rationalize the behavior of raw_alloc/raw_free: one is now the
symmetric of the other, and refcnt is never changed in these functions.
---
drivers/net/mlx5/mlx5_rxtx.c | 5 ++---
drivers/net/mpipe/mpipe_tilegx.c | 1 +
lib/librte_mbuf/rte_mbuf.c | 2 ++
lib/librte_mbuf/rte_mbuf.h | 45
+++++++++++++++++++++++++++++----------- 4 files changed, 38
insertions(+), 15 deletions(-)
<snip>
Post by Olivier Matz
/* compat with older versions */
__rte_deprecated
-static inline void __attribute__((always_inline))
+static inline void
__rte_mbuf_raw_free(struct rte_mbuf *m)
{
rte_mbuf_raw_free(m);
@@ -1218,8 +1232,12 @@ static inline void rte_pktmbuf_detach(struct
rte_mbuf *m) m->data_len = 0;
m->ol_flags = 0;
- if (rte_mbuf_refcnt_update(md, -1) == 0)
+ if (rte_mbuf_refcnt_update(md, -1) == 0) {
Minor nit, but in the case that we only have a single reference to the
mbufs, we are always setting that to zero just to re-increment it to 1
again.
Post by Olivier Matz
+ md->next = NULL;
+ md->nb_segs = 1;
+ rte_mbuf_refcnt_set(md, 1);
rte_mbuf_raw_free(md);
+ }
}
/**
I'm trying to gather the comments that have been made on this patchset.
About this one, I think it would be more complex to change the code
to avoid setting the refcnt twice:

- we would need to duplicate code from rte_mbuf_refcnt_update(), which
I think is not a very good idea, due to the big comment
- it would make the detach code less readable
- it's not even certain that it would be faster: since
rte_mbuf_refcnt_update() is inline, the compiler is probably able to
do the simplification by itself.



Olivier
Olivier Matz
2017-01-24 15:19:31 UTC
Permalink
It is now possible to reference a port identifier larger than 256
and have a mbuf chain larger than 256 segments.

Signed-off-by: Olivier Matz <***@6wind.com>
---
app/test-pmd/csumonly.c | 4 ++--
lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h | 4 ++--
lib/librte_mbuf/rte_mbuf.h | 5 ++---
3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
index 88cc842..5eaff9b 100644
--- a/app/test-pmd/csumonly.c
+++ b/app/test-pmd/csumonly.c
@@ -583,7 +583,7 @@ pkt_copy_split(const struct rte_mbuf *pkt)
rc = mbuf_copy_split(pkt, md, seglen, nb_seg);
if (rc < 0)
RTE_LOG(ERR, USER1,
- "mbuf_copy_split for %p(len=%u, nb_seg=%hhu) "
+ "mbuf_copy_split for %p(len=%u, nb_seg=%u) "
"into %u segments failed with error code: %d\n",
pkt, pkt->pkt_len, pkt->nb_segs, nb_seg, rc);

@@ -801,7 +801,7 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
char buf[256];

printf("-----------------\n");
- printf("port=%u, mbuf=%p, pkt_len=%u, nb_segs=%hhu:\n",
+ printf("port=%u, mbuf=%p, pkt_len=%u, nb_segs=%u:\n",
fs->rx_port, m, m->pkt_len, m->nb_segs);
/* dump rx parsed packet info */
rte_get_rx_ol_flag_list(rx_ol_flags, buf, sizeof(buf));
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
index f24f79f..2ac879f 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
@@ -118,8 +118,8 @@ struct rte_kni_mbuf {
uint64_t buf_physaddr;
uint16_t data_off; /**< Start address of data in segment buffer. */
char pad1[2];
- uint8_t nb_segs; /**< Number of segments. */
- char pad4[3];
+ uint16_t nb_segs; /**< Number of segments. */
+ char pad4[2];
uint64_t ol_flags; /**< Offload features. */
char pad2[4];
uint32_t pkt_len; /**< Total pkt len: sum of all segment data_len. */
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index cac31c9..de72314 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -408,9 +408,8 @@ struct rte_mbuf {
rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */
uint16_t refcnt; /**< Non-atomically accessed refcnt */
};
- uint8_t nb_segs; /**< Number of segments. */
- uint8_t port; /**< Input port. */
- uint16_t pad; /**< 2B pad for naturally aligned ol_flags */
+ uint16_t nb_segs; /**< Number of segments. */
+ uint16_t port; /**< Input port. */

uint64_t ol_flags; /**< Offload features. */
--
2.8.1
Olivier Matz
2017-01-24 15:19:32 UTC
Permalink
Move this field to the second cache line, since no driver uses it
in the Rx path. The freed space will be used by a timestamp in the next
commit.

Signed-off-by: Olivier Matz <***@6wind.com>
---
lib/librte_mbuf/rte_mbuf.h | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index de72314..39df3e1 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -465,8 +465,6 @@ struct rte_mbuf {
uint32_t usr; /**< User defined tags. See rte_distributor_process() */
} hash; /**< hash information */

- uint32_t seqn; /**< Sequence number. See also rte_reorder_insert() */
-
/** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ_STRIPPED is set. */
uint16_t vlan_tci_outer;

@@ -511,6 +509,10 @@ struct rte_mbuf {

/** Timesync flags for use with IEEE1588. */
uint16_t timesync;
+
+ /** Sequence number. See also rte_reorder_insert(). */
+ uint32_t seqn;
+
} __rte_cache_aligned;

/**
--
2.8.1
Olivier Matz
2017-01-24 15:19:33 UTC
Permalink
The field itself is not fully described yet, but this commit reserves
the room in the mbuf.

Signed-off-by: Olivier Matz <***@6wind.com>
---
lib/librte_mbuf/rte_mbuf.c | 2 ++
lib/librte_mbuf/rte_mbuf.h | 10 ++++++++++
2 files changed, 12 insertions(+)

diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
index 0acc810..f679bce 100644
--- a/lib/librte_mbuf/rte_mbuf.c
+++ b/lib/librte_mbuf/rte_mbuf.c
@@ -322,6 +322,7 @@ const char *rte_get_rx_ol_flag_name(uint64_t mask)
case PKT_RX_IEEE1588_TMST: return "PKT_RX_IEEE1588_TMST";
case PKT_RX_QINQ_STRIPPED: return "PKT_RX_QINQ_STRIPPED";
case PKT_RX_LRO: return "PKT_RX_LRO";
+ case PKT_RX_TIMESTAMP: return "PKT_RX_TIMESTAMP";
default: return NULL;
}
}
@@ -356,6 +357,7 @@ rte_get_rx_ol_flag_list(uint64_t mask, char *buf, size_t buflen)
{ PKT_RX_IEEE1588_TMST, PKT_RX_IEEE1588_TMST, NULL },
{ PKT_RX_QINQ_STRIPPED, PKT_RX_QINQ_STRIPPED, NULL },
{ PKT_RX_LRO, PKT_RX_LRO, NULL },
+ { PKT_RX_TIMESTAMP, PKT_RX_TIMESTAMP, NULL },
};
const char *name;
unsigned int i;
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 39df3e1..4818e2f 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -177,6 +177,11 @@ extern "C" {
*/
#define PKT_RX_LRO (1ULL << 16)

+/**
+ * Indicate that the timestamp field in the mbuf is valid.
+ */
+#define PKT_RX_TIMESTAMP (1ULL << 17)
+
/* add new RX flags here */

/* add new TX flags here */
@@ -469,6 +474,10 @@ struct rte_mbuf {
uint16_t vlan_tci_outer;

uint16_t buf_len; /**< Length of segment buffer. */
+
+ /** Valid if PKT_RX_TIMESTAMP is set. The unit is nanoseconds */
+ uint64_t timestamp;
+
/* second cache line - fields only used in slow path or on TX */
MARKER cacheline1 __rte_cache_min_aligned;

@@ -1197,6 +1206,7 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
mi->nb_segs = 1;
mi->ol_flags = m->ol_flags | IND_ATTACHED_MBUF;
mi->packet_type = m->packet_type;
+ mi->timestamp = m->timestamp;

__rte_mbuf_sanity_check(mi, 1);
__rte_mbuf_sanity_check(m, 0);
--
2.8.1
Olivier Matz
2017-01-24 15:19:29 UTC
Permalink
Now that m->next and m->nb_segs are expected to be set (to
NULL and 1 respectively) after a mempool_get(), we can avoid writing them
in the Rx functions of drivers.

Only some drivers are patched; this is not an exhaustive patch. It shows
how to do the same in other drivers.

Signed-off-by: Olivier Matz <***@6wind.com>
---
drivers/net/i40e/i40e_rxtx_vec_sse.c | 6 ------
drivers/net/ixgbe/ixgbe_rxtx.c | 8 --------
drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 6 ------
drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c | 6 ------
drivers/net/null/rte_eth_null.c | 2 --
drivers/net/virtio/virtio_rxtx.c | 3 ---
6 files changed, 31 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 7c84a41..33bc121 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -403,12 +403,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
/* store the resulting 32-bit value */
*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
split_packet += RTE_I40E_DESCS_PER_LOOP;
-
- /* zero-out next pointers */
- rx_pkts[pos]->next = NULL;
- rx_pkts[pos + 1]->next = NULL;
- rx_pkts[pos + 2]->next = NULL;
- rx_pkts[pos + 3]->next = NULL;
}

/* C.3 calc available number of desc */
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index dd53cc6..2c9e342 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -1548,8 +1548,6 @@ ixgbe_rx_alloc_bufs(struct ixgbe_rx_queue *rxq, bool reset_mbuf)
/* populate the static rte mbuf fields */
mb = rxep[i].mbuf;
if (reset_mbuf) {
- mb->next = NULL;
- mb->nb_segs = 1;
mb->port = rxq->port_id;
}

@@ -2157,12 +2155,6 @@ ixgbe_recv_pkts_lro(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts,
goto next_desc;
}

- /*
- * This is the last buffer of the received packet - return
- * the current cluster to the user.
- */
- rxm->next = NULL;
-
/* Initialize the first mbuf of the returned packet */
ixgbe_fill_cluster_head_buf(first_seg, &rxd, rxq, staterr);

diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
index f96cc85..63f2556 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
@@ -333,12 +333,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
*(int *)split_packet = ~stat & IXGBE_VPMD_DESC_EOP_MASK;

split_packet += RTE_IXGBE_DESCS_PER_LOOP;
-
- /* zero-out next pointers */
- rx_pkts[pos]->next = NULL;
- rx_pkts[pos + 1]->next = NULL;
- rx_pkts[pos + 2]->next = NULL;
- rx_pkts[pos + 3]->next = NULL;
}

rte_prefetch_non_temporal(rxdp + RTE_IXGBE_DESCS_PER_LOOP);
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
index abbf284..65c5da3 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
@@ -425,12 +425,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
/* store the resulting 32-bit value */
*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
split_packet += RTE_IXGBE_DESCS_PER_LOOP;
-
- /* zero-out next pointers */
- rx_pkts[pos]->next = NULL;
- rx_pkts[pos + 1]->next = NULL;
- rx_pkts[pos + 2]->next = NULL;
- rx_pkts[pos + 3]->next = NULL;
}

/* C.3 calc available number of desc */
diff --git a/drivers/net/null/rte_eth_null.c b/drivers/net/null/rte_eth_null.c
index 57203e2..7e14da0 100644
--- a/drivers/net/null/rte_eth_null.c
+++ b/drivers/net/null/rte_eth_null.c
@@ -112,8 +112,6 @@ eth_null_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
break;
bufs[i]->data_len = (uint16_t)packet_size;
bufs[i]->pkt_len = packet_size;
- bufs[i]->nb_segs = 1;
- bufs[i]->next = NULL;
bufs[i]->port = h->internals->port_id;
}

diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index b29565e..111a983 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -761,7 +761,6 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
rxm->vlan_tci = 0;

rxm->nb_segs = 1;
- rxm->next = NULL;
rxm->pkt_len = (uint32_t)(len[i] - hdr_size);
rxm->data_len = (uint16_t)(len[i] - hdr_size);

@@ -888,7 +887,6 @@ virtio_recv_mergeable_pkts(void *rx_queue,

rxm->data_off = RTE_PKTMBUF_HEADROOM;
rxm->nb_segs = seg_num;
- rxm->next = NULL;
rxm->ol_flags = 0;
rxm->vlan_tci = 0;
rxm->pkt_len = (uint32_t)(len[0] - hdr_size);
@@ -933,7 +931,6 @@ virtio_recv_mergeable_pkts(void *rx_queue,
rxm = rcv_pkts[extra_idx];

rxm->data_off = RTE_PKTMBUF_HEADROOM - hdr_size;
- rxm->next = NULL;
rxm->pkt_len = (uint32_t)(len[extra_idx]);
rxm->data_len = (uint16_t)(len[extra_idx]);
--
2.8.1
Olivier Matz
2017-01-24 15:19:30 UTC
Permalink
From: Jerin Jacob <***@caviumnetworks.com>

To avoid multiple stores on fast path, Ethernet drivers
aggregate the writes to data_off, refcnt, nb_segs and port
to an uint64_t data and write the data in one shot
with uint64_t* at &mbuf->rearm_data address.

Some of the non-IA platforms have store operation overhead
if the store address is not naturally aligned. This patch
fixes the performance issue on those targets.

Signed-off-by: Jerin Jacob <***@caviumnetworks.com>
Signed-off-by: Olivier Matz <***@6wind.com>
---
drivers/net/fm10k/fm10k_rxtx_vec.c | 3 ---
drivers/net/i40e/i40e_rxtx_vec_sse.c | 5 +----
drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 3 ---
drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c | 3 ---
lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h | 3 +--
lib/librte_mbuf/rte_mbuf.h | 6 +++---
6 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/drivers/net/fm10k/fm10k_rxtx_vec.c b/drivers/net/fm10k/fm10k_rxtx_vec.c
index 825e3c1..61a65e9 100644
--- a/drivers/net/fm10k/fm10k_rxtx_vec.c
+++ b/drivers/net/fm10k/fm10k_rxtx_vec.c
@@ -324,9 +324,6 @@ fm10k_rxq_rearm(struct fm10k_rx_queue *rxq)

/* Flush mbuf with pkt template.
* Data to be rearmed is 6 bytes long.
- * Though, RX will overwrite ol_flags that are coming next
- * anyway. So overwrite whole 8 bytes with one load:
- * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
*/
p0 = (uintptr_t)&mb0->rearm_data;
*(uint64_t *)p0 = rxq->mbuf_initializer;
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 33bc121..1a8bcdf 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -87,11 +87,8 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
mb0 = rxep[0].mbuf;
mb1 = rxep[1].mbuf;

- /* Flush mbuf with pkt template.
+ /* Flush mbuf with pkt template.
* Data to be rearmed is 6 bytes long.
- * Though, RX will overwrite ol_flags that are coming next
- * anyway. So overwrite whole 8 bytes with one load:
- * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
*/
p0 = (uintptr_t)&mb0->rearm_data;
*(uint64_t *)p0 = rxq->mbuf_initializer;
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
index 63f2556..c538796 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
@@ -85,9 +85,6 @@ ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
/*
* Flush mbuf with pkt template.
* Data to be rearmed is 6 bytes long.
- * Though, RX will overwrite ol_flags that are coming next
- * anyway. So overwrite whole 8 bytes with one load:
- * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
*/
vst1_u8((uint8_t *)&mb0->rearm_data, p);
paddr = mb0->buf_physaddr + RTE_PKTMBUF_HEADROOM;
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
index 65c5da3..62afe31 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
@@ -90,9 +90,6 @@ ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
/*
* Flush mbuf with pkt template.
* Data to be rearmed is 6 bytes long.
- * Though, RX will overwrite ol_flags that are coming next
- * anyway. So overwrite whole 8 bytes with one load:
- * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
*/
p0 = (uintptr_t)&mb0->rearm_data;
*(uint64_t *)p0 = rxq->mbuf_initializer;
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
index 09713b0..f24f79f 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
@@ -116,11 +116,10 @@ struct rte_kni_fifo {
struct rte_kni_mbuf {
void *buf_addr __attribute__((__aligned__(RTE_CACHE_LINE_SIZE)));
uint64_t buf_physaddr;
- char pad0[2];
uint16_t data_off; /**< Start address of data in segment buffer. */
char pad1[2];
uint8_t nb_segs; /**< Number of segments. */
- char pad4[1];
+ char pad4[3];
uint64_t ol_flags; /**< Offload features. */
char pad2[4];
uint32_t pkt_len; /**< Total pkt len: sum of all segment data_len. */
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index bbd0700..cac31c9 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -391,10 +391,8 @@ struct rte_mbuf {
void *buf_addr; /**< Virtual address of segment buffer. */
phys_addr_t buf_physaddr; /**< Physical address of segment buffer. */

- uint16_t buf_len; /**< Length of segment buffer. */
-
/* next 6 bytes are initialised on RX descriptor rearm */
- MARKER8 rearm_data;
+ MARKER64 rearm_data;
uint16_t data_off;

/**
@@ -412,6 +410,7 @@ struct rte_mbuf {
};
uint8_t nb_segs; /**< Number of segments. */
uint8_t port; /**< Input port. */
+ uint16_t pad; /**< 2B pad for naturally aligned ol_flags */

uint64_t ol_flags; /**< Offload features. */

@@ -472,6 +471,7 @@ struct rte_mbuf {
/** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ_STRIPPED is set. */
uint16_t vlan_tci_outer;

+ uint16_t buf_len; /**< Length of segment buffer. */
/* second cache line - fields only used in slow path or on TX */
MARKER cacheline1 __rte_cache_min_aligned;
--
2.8.1
Bruce Richardson
2017-01-24 15:59:08 UTC
Permalink
Based on discussion done in [1], this patchset reorganizes the mbuf.
Hi Olivier,

thanks for all the work on this. From a quick scan of the patches, and
the description below, it looks like a good set of changes. Comments
below to see about kick-starting some further discussion about some of
the other changes you propose.
- reorder structure to increase vector performance on some non-ia
platforms.
- add a 64bits timestamp field in the 1st cache line
- m->next, m->nb_segs, and m->refcnt are always initialized for mbufs
in the pool, avoiding the need of setting m->next (located in the
2nd cache line) in the Rx path for mono-segment packets.
- change port and nb_segs to 16 bits
- move seqn in the 2nd cache line
- move refcnt and nb_segs to the 2nd cache line: many drivers sets
them in the Rx path, so it could introduce a performance regression, or
it would require to change all the drivers, which is not an easy task.
But if we do make this change and update the drivers, some of them
should perform a little better, since they do fewer writes. I don't
think the fastest vector drivers will be affected, since they already
coalesce the writes to these fields with other writes, but others drivers
may well be improved by the change.
- remove the m->port field: too much impact on many examples and libraries,
and some people highlighted they are using it.
- moving m->next in the 1st cache line: there is not enough room, and having
it set to NULL for unused mbuf should remove the need for it.
I agree.
- merge seqn and timestamp together in a union: we could imagine use cases
where both are activated. There is no flag indicating the presence of seqn,
so it looks preferable to keep them separated for now.
What were the use-cases? If we have a timestamp, surely sequence can be
determined from that? Even if you use the TSC as a timestamp per burst,
you can still sequence the packets cheaply by just adding 1 to each
subsequent value.

/Bruce
Olivier MATZ
2017-01-24 16:16:40 UTC
Permalink
On Tue, 24 Jan 2017 15:59:08 +0000, Bruce Richardson
Post by Bruce Richardson
Based on discussion done in [1], this patchset reorganizes the mbuf.
Hi Olivier,
thanks for all the work on this. From a quick scan of the patches, and
the description below, it looks like a good set of changes. Comments
below to see about kick-starting some further discussion about some of
the other changes you propose.
- reorder structure to increase vector performance on some non-ia
platforms.
- add a 64bits timestamp field in the 1st cache line
- m->next, m->nb_segs, and m->refcnt are always initialized for
mbufs in the pool, avoiding the need of setting m->next (located in
the 2nd cache line) in the Rx path for mono-segment packets.
- change port and nb_segs to 16 bits
- move seqn in the 2nd cache line
- move refcnt and nb_segs to the 2nd cache line: many drivers sets
them in the Rx path, so it could introduce a performance
regression, or it would require to change all the drivers, which is
not an easy task.
But if we do make this change and update the drivers, some of them
should perform a little better, since they do fewer writes. I don't
think the fastest vector drivers will be affected, since they already
coalesce the writes to these fields with other writes, but others
drivers may well be improved by the change.
Yes, that's something I forgot to say in the cover letter: after this
patchset, the Rx path of drivers could be optimized a bit by removing
writes to m->next, m->nb_segs and m->refcnt. The patch 4/8 gives an
idea of what could be done.

Once most drivers are updated, we could reconsider moving nb_segs and
refcnt in the second cache line.
Post by Bruce Richardson
- remove the m->port field: too much impact on many examples and
libraries, and some people highlighted they are using it.
- moving m->next in the 1st cache line: there is not enough room,
and having it set to NULL for unused mbuf should remove the need
for it.
I agree.
- merge seqn and timestamp together in a union: we could imagine
use cases were both are activated. There is no flag indicating the
presence of seqn, so it looks preferable to keep them separated for
now.
What were the use-cases? If we have a timestamp, surely sequence can
be determined from that? Even if you use the TSC as a timestamp per
burst, you can still sequence the packets cheaply by just adding 1 to
each subsequent value.
Assuming the timestamp is in nanoseconds, it is not a sequence number,
so I'm not sure it should be hijacked for this purpose. A timestamp can
be used to reorder packets, but having a sequence number is better
because you can be sure that when you get packets 1, 3, 2, 0 that no
packet is missing between 0 and 3.

For that reason, I guess both features could be used at the same time.

Regards,
Olivier
Ananyev, Konstantin
2017-02-06 18:41:27 UTC
Permalink
Hi Olivier,
Looks good in general, some comments from me below.
Thanks
Konstantin
- reorder structure to increase vector performance on some non-ia
platforms.
- add a 64bits timestamp field in the 1st cache line
Wonder why it deserves to be in first cache line?
How it differs from seqn below (pure SW stuff right now).
- m->next, m->nb_segs, and m->refcnt are always initialized for mbufs
in the pool, avoiding the need of setting m->next (located in the
2nd cache line) in the Rx path for mono-segment packets.
- change port and nb_segs to 16 bits
Not that I am completely against it,
but changing nb_segs to 16 bits seems like an overkill to me.
I think we can keep an extra 8 bits for something more useful in the future.
- move seqn in the 2nd cache line
- move refcnt and nb_segs to the 2nd cache line: many drivers sets
them in the Rx path, so it could introduce a performance regression, or
I wonder can refcnt only be moved into the 2-nd cacheline?
As I understand thanks to other change (from above) m->refcnt
will already be initialized, so RX code don't need to touch it.
Though yes, it still would require changes in all PMDs.
it would require to change all the drivers, which is not an easy task.
- remove the m->port field: too much impact on many examples and libraries,
and some people highlighted they are using it.
Ok, but can it be moved into the second cache-line?
- moving m->next in the 1st cache line: there is not enough room, and having
it set to NULL for unused mbuf should remove the need for it.
- merge seqn and timestamp together in a union: we could imagine use cases
were both are activated. There is no flag indicating the presence of seqn,
so it looks preferable to keep them separated for now.
I made some basic performance tests (ixgbe) and see no regression, but
the patchset requires more testing.
[1] http://dpdk.org/ml/archives/dev/2016-October/049338.html
Morten Brørup
2017-02-09 16:20:13 UTC
Permalink
Post by Ananyev, Konstantin
...
- change port and nb_segs to 16 bits
Not that I am completely against it,
but changing nb_segs to 16 bits seems like an overkill to me.
I think we can keep and extra 8bits for something more useful in future.
If I recall correctly, this was discussed at DPDK Userspace: If mbuf->nb_segs is used for multicasting (or port flooding), it should have the same size as mbuf->port.

Someone please correct me if I'm mixing things up. The mbuf discussion is important!


Med venlig hilsen
Ananyev, Konstantin
2017-02-09 16:56:18 UTC
Permalink
-----Original Message-----
Sent: Thursday, February 9, 2017 4:20 PM
Subject: RE: [dpdk-dev] [RFC 0/8] mbuf: structure reorganization
Post by Ananyev, Konstantin
...
- change port and nb_segs to 16 bits
Not that I am completely against it,
but changing nb_segs to 16 bits seems like an overkill to me.
I think we can keep and extra 8bits for something more useful in future.
If I recall correctly, this was discussed at DPDK Userspace: If mbuf->nb_segs is used for multicasting (or port flooding), it should have the
same size as mbuf->port.
Someone please correct me if I'm mixing things up. The mbuf discussion is important!
I think that's for refcnt not nb_segs.
Actually a question - does anyone really use/see packets that have >= 256 segments?
Konstantin
Med venlig hilsen
Olivier Matz
2017-02-16 13:48:07 UTC
Permalink
Hi Konstantin,

Thanks for the feedback.
Comments inline.


On Mon, 6 Feb 2017 18:41:27 +0000, "Ananyev, Konstantin"
Post by Bruce Richardson
Hi Olivier,
Looks good in general, some comments from me below.
Thanks
Konstantin
- reorder structure to increase vector performance on some non-ia
platforms.
- add a 64bits timestamp field in the 1st cache line
Wonder why it deserves to be in first cache line?
How it differs from seqn below (pure SW stuff right now).
In case the timestamp is set from a NIC value, it is set in the Rx
path. So that's why I think it deserve to be located in the 1st cache
line.

As you said, the seqn is a pure sw stuff right: it is set in a lib, not
in a PMD rx path.
Post by Bruce Richardson
- m->next, m->nb_segs, and m->refcnt are always initialized for
mbufs in the pool, avoiding the need of setting m->next (located in
the 2nd cache line) in the Rx path for mono-segment packets.
- change port and nb_segs to 16 bits
Not that I am completely against it,
but changing nb_segs to 16 bits seems like an overkill to me.
I think we can keep and extra 8bits for something more useful in future.
In my case, I use the m->next field to chain more than 256 segments for
L4 socket buffers. It also updates nb_seg that can overflow. It's not
a big issue since at the end, nb_seg is decremented for each segment.
On the other hand, if I enable some sanity checks on mbufs, it
complains because the number of segments is not equal to nb_seg.

There is also another use case with fragmentation as discussed recently:
http://dpdk.org/dev/patchwork/patch/19819/

Of course, dealing with a long mbuf list is not that efficient,
but the application can maintain another structure to accelerate the
access to the middle/end of the list.

Finally, we have other ideas to get additional 8 bits if required in
the future, so I don't think it's really a problem.
Post by Bruce Richardson
- move seqn in the 2nd cache line
- move refcnt and nb_segs to the 2nd cache line: many drivers sets
them in the Rx path, so it could introduce a performance
regression, or
I wonder can refcnt only be moved into the 2-nd cacheline?
As I understand thanks to other change (from above) m->refcnt
will already be initialized, so RX code don't need to touch it.
Though yes, it still would require changes in all PMDs.
Yes, I agree, some fields could be moved in the 2nd cache line once all
PMDs stop to write them in RX path. I propose to issue some guidelines
to PMD maintainers at the same time the patchset is pushed. Then we can
consider changing it in a future version, in case we need more room in
the 1st mbuf cache line.
Post by Bruce Richardson
it would require to change all the drivers, which is not an easy task.
- remove the m->port field: too much impact on many examples and
libraries, and some people highlighted they are using it.
Ok, but can it be moved into the second cache-line?
I think no: it is set by the PMDs in RX path, it would impact
performance.
Post by Bruce Richardson
- moving m->next in the 1st cache line: there is not enough room,
and having it set to NULL for unused mbuf should remove the need
for it.
- merge seqn and timestamp together in a union: we could imagine
use cases were both are activated. There is no flag indicating the
presence of seqn, so it looks preferable to keep them separated for
now.
I made some basic performance tests (ixgbe) and see no regression,
but the patchset requires more testing.
[1] http://dpdk.org/ml/archives/dev/2016-October/049338.html
By the way, additional performance tests on this patchset from PMD
vendors would be helpful.


Olivier
Bruce Richardson
2017-02-16 15:46:19 UTC
Permalink
Post by Olivier Matz
Hi Konstantin,
Thanks for the feedback.
Comments inline.
On Mon, 6 Feb 2017 18:41:27 +0000, "Ananyev, Konstantin"
Post by Bruce Richardson
Hi Olivier,
Looks good in general, some comments from me below.
Thanks
Konstantin
- reorder structure to increase vector performance on some non-ia
platforms.
- add a 64bits timestamp field in the 1st cache line
Wonder why it deserves to be in first cache line?
How it differs from seqn below (pure SW stuff right now).
In case the timestamp is set from a NIC value, it is set in the Rx
path. So that's why I think it deserve to be located in the 1st cache
line.
As you said, the seqn is a pure sw stuff right: it is set in a lib, not
in a PMD rx path.
Post by Bruce Richardson
- m->next, m->nb_segs, and m->refcnt are always initialized for
mbufs in the pool, avoiding the need of setting m->next (located in
the 2nd cache line) in the Rx path for mono-segment packets.
- change port and nb_segs to 16 bits
Not that I am completely against it,
but changing nb_segs to 16 bits seems like an overkill to me.
I think we can keep and extra 8bits for something more useful in future.
In my case, I use the m->next field to chain more than 256 segments for
L4 socket buffers. It also updates nb_seg that can overflow. It's not
a big issue since at the end, nb_seg is decremented for each segment.
On the other hand, if I enable some sanity checks on mbufs, it
complains because the number of segments is not equal to nb_seg.
http://dpdk.org/dev/patchwork/patch/19819/
Of course, dealing with a long mbuf list is not that efficient,
but the application can maintain another structure to accelerate the
access to the middle/end of the list.
Finally, we have other ideas to get additional 8 bits if required in
the future, so I don't think it's really a problem.
Post by Bruce Richardson
- move seqn in the 2nd cache line
- move refcnt and nb_segs to the 2nd cache line: many drivers sets
them in the Rx path, so it could introduce a performance
regression, or
I wonder can refcnt only be moved into the 2-nd cacheline?
As I understand thanks to other change (from above) m->refcnt
will already be initialized, so RX code don't need to touch it.
Though yes, it still would require changes in all PMDs.
Yes, I agree, some fields could be moved in the 2nd cache line once all
PMDs stop to write them in RX path. I propose to issue some guidelines
to PMD maintainers at the same time the patchset is pushed. Then we can
consider changing it in a future version, in case we need more room in
the 1st mbuf cache line.
If we are changing things, we should really do all that now, rather than
storing up future breaks to mbuf. Worst case, we should plan for it
immediately after the release where we make these changes. Have two
releases that break mbuf immediately after each other - and flagged as
such, but keep it stable thereafter. I don't like having technical debt
on mbuf just after we supposedly "fix" it.

/Bruce
Olivier Matz
2017-02-16 16:14:10 UTC
Permalink
On Thu, 16 Feb 2017 15:46:19 +0000, Bruce Richardson
Post by Bruce Richardson
Post by Olivier Matz
Hi Konstantin,
Thanks for the feedback.
Comments inline.
On Mon, 6 Feb 2017 18:41:27 +0000, "Ananyev, Konstantin"
Post by Bruce Richardson
Hi Olivier,
Looks good in general, some comments from me below.
Thanks
Konstantin
- reorder structure to increase vector performance on some
non-ia platforms.
- add a 64bits timestamp field in the 1st cache line
Wonder why it deserves to be in first cache line?
How it differs from seqn below (pure SW stuff right now).
In case the timestamp is set from a NIC value, it is set in the Rx
path. So that's why I think it deserve to be located in the 1st
cache line.
As you said, the seqn is a pure sw stuff right: it is set in a lib,
not in a PMD rx path.
Post by Bruce Richardson
- m->next, m->nb_segs, and m->refcnt are always initialized for
mbufs in the pool, avoiding the need of setting m->next
(located in the 2nd cache line) in the Rx path for mono-segment
packets.
- change port and nb_segs to 16 bits
Not that I am completely against it,
but changing nb_segs to 16 bits seems like an overkill to me.
I think we can keep and extra 8bits for something more useful in future.
In my case, I use the m->next field to chain more than 256 segments
for L4 socket buffers. It also updates nb_seg that can overflow.
It's not a big issue since at the end, nb_seg is decremented for
each segment. On the other hand, if I enable some sanity checks on
mbufs, it complains because the number of segments is not equal to
nb_seg.
There is also another use case with fragmentation as discussed
recently: http://dpdk.org/dev/patchwork/patch/19819/
Of course, dealing with a long mbuf list is not that efficient,
but the application can maintain another structure to accelerate the
access to the middle/end of the list.
Finally, we have other ideas to get additional 8 bits if required in
the future, so I don't think it's really a problem.
Post by Bruce Richardson
- move seqn in the 2nd cache line
- move refcnt and nb_segs to the 2nd cache line: many drivers
sets them in the Rx path, so it could introduce a performance
regression, or
I wonder can refcnt only be moved into the 2-nd cacheline?
As I understand thanks to other change (from above) m->refcnt
will already be initialized, so RX code don't need to touch it.
Though yes, it still would require changes in all PMDs.
Yes, I agree, some fields could be moved in the 2nd cache line once
all PMDs stop to write them in RX path. I propose to issue some
guidelines to PMD maintainers at the same time the patchset is
pushed. Then we can consider changing it in a future version, in
case we need more room in the 1st mbuf cache line.
If we are changing things, we should really do all that now, rather
than storing up future breaks to mbuf. Worst case, we should plan for
it immediately after the release where we make these changes. Have two
releases that break mbuf immediately after each other - and flagged as
such, but keep it stable thereafter. I don't like having technical
debt on mbuf just after we supposedly "fix" it.
I think there is no need to do this change now. And I don't feel good
with the idea of having a patchset that updates all the PMDs to remove
the access to a field because it moved to the 2nd cache line
(especially thinking about vector PMDs).

That's why I think the plan could be:
- push an updated version of this patchset quickly
- advertise to PMD maintainers "you don't need to set the m->next,
m->refcnt, and m->nb_segs in the RX path, please update your drivers"
- later, if we need more room in the 1st cache line of the mbuf, we
can move refcnt and nb_seg, probably without impacting the
performance.


Olivier
Morten Brørup
2017-02-21 14:20:23 UTC
Permalink
Comments at the end.
-----Original Message-----
Sent: Thursday, February 16, 2017 5:14 PM
To: Bruce Richardson
Subject: Re: [dpdk-dev] [RFC 0/8] mbuf: structure reorganization
On Thu, 16 Feb 2017 15:46:19 +0000, Bruce Richardson
Post by Bruce Richardson
Post by Olivier Matz
Hi Konstantin,
Thanks for the feedback.
Comments inline.
On Mon, 6 Feb 2017 18:41:27 +0000, "Ananyev, Konstantin"
Post by Bruce Richardson
Hi Olivier,
Looks good in general, some comments from me below.
Thanks
Konstantin
- reorder structure to increase vector performance on some
non-ia platforms.
- add a 64bits timestamp field in the 1st cache line
Wonder why it deserves to be in first cache line?
How it differs from seqn below (pure SW stuff right now).
In case the timestamp is set from a NIC value, it is set in the Rx
path. So that's why I think it deserve to be located in the 1st
cache line.
As you said, the seqn is a pure sw stuff right: it is set in a lib,
not in a PMD rx path.
Post by Bruce Richardson
- m->next, m->nb_segs, and m->refcnt are always initialized for
mbufs in the pool, avoiding the need of setting m->next
(located
Post by Bruce Richardson
Post by Olivier Matz
Post by Bruce Richardson
in the 2nd cache line) in the Rx path for mono-segment packets.
- change port and nb_segs to 16 bits
Not that I am completely against it, but changing nb_segs to 16
bits seems like an overkill to me.
I think we can keep and extra 8bits for something more useful in future.
In my case, I use the m->next field to chain more than 256 segments
for L4 socket buffers. It also updates nb_seg that can overflow.
It's not a big issue since at the end, nb_seg is decremented for
each segment. On the other hand, if I enable some sanity checks on
mbufs, it complains because the number of segments is not equal to
nb_seg.
There is also another use case with fragmentation as discussed
recently: http://dpdk.org/dev/patchwork/patch/19819/
Of course, dealing with a long mbuf list is not that efficient, but
the application can maintain another structure to accelerate the
access to the middle/end of the list.
Finally, we have other ideas to get additional 8 bits if required
in
Post by Bruce Richardson
Post by Olivier Matz
the future, so I don't think it's really a problem.
Post by Bruce Richardson
- move seqn in the 2nd cache line
- move refcnt and nb_segs to the 2nd cache line: many drivers
sets them in the Rx path, so it could introduce a performance
regression, or
I wonder can refcnt only be moved into the 2-nd cacheline?
As I understand thanks to other change (from above) m->refcnt
will
Post by Bruce Richardson
Post by Olivier Matz
Post by Bruce Richardson
already be initialized, so RX code don't need to touch it.
Though yes, it still would require changes in all PMDs.
Yes, I agree, some fields could be moved in the 2nd cache line once
all PMDs stop to write them in RX path. I propose to issue some
guidelines to PMD maintainers at the same time the patchset is
pushed. Then we can consider changing it in a future version, in
case we need more room in the 1st mbuf cache line.
If we are changing things, we should really do all that now, rather
than storing up future breaks to mbuf. Worst case, we should plan for
it immediately after the release where we make these changes. Have
two
Post by Bruce Richardson
releases that break mbuf immediately after each other - and flagged
as
Post by Bruce Richardson
such, but keep it stable thereafter. I don't like having technical
debt on mbuf just after we supposedly "fix" it.
I think there is no need to do this change now. And I don't feel good
with the idea of having a patchset that updates all the PMDs to remove
the access to a field because it moved to the 2nd cache line
(especially thinking about vector PMDs).
- push an updated version of this patchset quickly
- advertise to PMD maintainers "you don't need to set the m->next,
m->refcnt, and m->nb_segs in the RX path, please update your drivers"
- later, if we need more room in the 1st cache line of the mbuf, we
can move refcnt and nb_seg, probably without impacting the
performance.
Olivier
I suppose you mean that PMDs don't need to /initialize/ m->next, m->refcnt and m->nb_segs.

Forgive my ignorance, and this is wild speculation, but: Would a PMD not need to set m->next and m->nb_segs if it receives a jumbogram larger than an mbuf packet buffer? And if this is a realistic use case, these fields actually do belong in the 1st cache line. PMD developers please chime in.


And I tend to agree with Bruce about making all these mbuf changes in one go, rather than postponing some of them to later. Especially because the postponement also closes and reopens the whole discussion and decision process! (Not initializing a few fields in a PMD cannot require a lot of work by the PMD developers. Moving the fields to the 2nd cache line will in the worst case degrade the performance of the non-updated PMDs.)

A two step process makes good sense for the developers of DPDK, but both steps should be taken within the same release, so they are transparent to the users of DPDK.


Me
Bruce Richardson
2017-02-21 14:28:46 UTC
Permalink
Post by Morten Brørup
Comments at the end.
-----Original Message-----
Sent: Thursday, February 16, 2017 5:14 PM
To: Bruce Richardson
Subject: Re: [dpdk-dev] [RFC 0/8] mbuf: structure reorganization
On Thu, 16 Feb 2017 15:46:19 +0000, Bruce Richardson
Post by Bruce Richardson
Post by Olivier Matz
Hi Konstantin,
Thanks for the feedback.
Comments inline.
On Mon, 6 Feb 2017 18:41:27 +0000, "Ananyev, Konstantin"
Post by Bruce Richardson
Hi Olivier,
Looks good in general, some comments from me below.
Thanks
Konstantin
- reorder structure to increase vector performance on some
non-ia platforms.
- add a 64bits timestamp field in the 1st cache line
Wonder why it deserves to be in first cache line?
How it differs from seqn below (pure SW stuff right now).
In case the timestamp is set from a NIC value, it is set in the Rx
path. So that's why I think it deserve to be located in the 1st
cache line.
As you said, the seqn is a pure sw stuff right: it is set in a lib,
not in a PMD rx path.
Post by Bruce Richardson
- m->next, m->nb_segs, and m->refcnt are always initialized for
mbufs in the pool, avoiding the need of setting m->next
(located
Post by Bruce Richardson
Post by Olivier Matz
Post by Bruce Richardson
in the 2nd cache line) in the Rx path for mono-segment packets.
- change port and nb_segs to 16 bits
Not that I am completely against it, but changing nb_segs to 16
bits seems like an overkill to me.
I think we can keep and extra 8bits for something more useful in future.
In my case, I use the m->next field to chain more than 256 segments
for L4 socket buffers. It also updates nb_seg that can overflow.
It's not a big issue since at the end, nb_seg is decremented for
each segment. On the other hand, if I enable some sanity checks on
mbufs, it complains because the number of segments is not equal to
nb_seg.
There is also another use case with fragmentation as discussed
recently: http://dpdk.org/dev/patchwork/patch/19819/
Of course, dealing with a long mbuf list is not that efficient, but
the application can maintain another structure to accelerate the
access to the middle/end of the list.
Finally, we have other ideas to get additional 8 bits if required
in
Post by Bruce Richardson
Post by Olivier Matz
the future, so I don't think it's really a problem.
Post by Bruce Richardson
- move seqn in the 2nd cache line
- move refcnt and nb_segs to the 2nd cache line: many drivers
sets them in the Rx path, so it could introduce a performance
regression, or
I wonder can refcnt only be moved into the 2-nd cacheline?
As I understand thanks to other change (from above) m->refcnt
will
Post by Bruce Richardson
Post by Olivier Matz
Post by Bruce Richardson
already be initialized, so RX code don't need to touch it.
Though yes, it still would require changes in all PMDs.
Yes, I agree, some fields could be moved in the 2nd cache line once
all PMDs stop to write them in RX path. I propose to issue some
guidelines to PMD maintainers at the same time the patchset is
pushed. Then we can consider changing it in a future version, in
case we need more room in the 1st mbuf cache line.
If we are changing things, we should really do all that now, rather
than storing up future breaks to mbuf. Worst case, we should plan for
it immediately after the release where we make these changes. Have
two
Post by Bruce Richardson
releases that break mbuf immediately after each other - and flagged
as
Post by Bruce Richardson
such, but keep it stable thereafter. I don't like having technical
debt on mbuf just after we supposedly "fix" it.
I think there is no need to do this change now. And I don't feel good
with the idea of having a patchset that updates all the PMDs to remove
the access to a field because it moved to the 2nd cache line
(especially thinking about vector PMDs).
- push an updated version of this patchset quickly
- advertise to PMD maintainers "you don't need to set the m->next,
m->refcnt, and m->nb_segs in the RX path, please update your drivers"
- later, if we need more room in the 1st cache line of the mbuf, we
can move refcnt and nb_seg, probably without impacting the
performance.
Olivier
I suppose you mean that PMDs don't need to /initialize/ m->next, m->refcnt and m->nb_segs.
Forgive my ignorance, and this is wild speculation, but: Would a PMD not need to set m->next and m->nb_segs if it receives a jumbogram larger than an mbuf packet buffer? And if this is a realistic use case, these fields actually do belong in the 1st cache line. PMD developers please chime in.
Yes, it would. However, this is not really fast-path processing. If we
assume a 2GHz CPU, for 64-byte packets, a core has 34 cycles to process
each packet to achieve 40G line rate. For a packet of size 2k - the
normal size it would need to hit to overflow a buffer, unless you are
using small buffers - the core has 827 cycles per packet. Therefore, in
the latter case, with big packets, the core can afford the hit of
accessing the second cacheline.

/Bruce
Olivier MATZ
2017-02-21 15:04:40 UTC
Permalink
Hi Morten,

On Tue, 21 Feb 2017 15:20:23 +0100, Morten Brørup
Post by Morten Brørup
Comments at the end.
-----Original Message-----
Sent: Thursday, February 16, 2017 5:14 PM
To: Bruce Richardson
Subject: Re: [dpdk-dev] [RFC 0/8] mbuf: structure reorganization
On Thu, 16 Feb 2017 15:46:19 +0000, Bruce Richardson
Post by Bruce Richardson
Post by Olivier Matz
Hi Konstantin,
Thanks for the feedback.
Comments inline.
On Mon, 6 Feb 2017 18:41:27 +0000, "Ananyev, Konstantin"
Post by Bruce Richardson
Hi Olivier,
Looks good in general, some comments from me below.
Thanks
Konstantin
- reorder structure to increase vector performance on some
non-ia platforms.
- add a 64bits timestamp field in the 1st cache line
Wonder why it deserves to be in first cache line?
How it differs from seqn below (pure SW stuff right now).
In case the timestamp is set from a NIC value, it is set in the
Rx path. So that's why I think it deserves to be located in the
1st cache line.
As you said, the seqn is a pure sw stuff right: it is set in a
lib, not in a PMD rx path.
Post by Bruce Richardson
- m->next, m->nb_segs, and m->refcnt are always initialized
for mbufs in the pool, avoiding the need of setting
m->next
(located
Post by Bruce Richardson
Post by Olivier Matz
Post by Bruce Richardson
in the 2nd cache line) in the Rx path for mono-segment packets.
- change port and nb_segs to 16 bits
Not that I am completely against it, but changing nb_segs to
16 bits seems like an overkill to me.
I think we can keep an extra 8 bits for something more useful in the future.
In my case, I use the m->next field to chain more than 256
segments for L4 socket buffers. It also updates nb_seg that can
overflow. It's not a big issue since at the end, nb_seg is
decremented for each segment. On the other hand, if I enable
some sanity checks on mbufs, it complains because the number of
segments is not equal to nb_seg.
There is also another use case with fragmentation as discussed
recently: http://dpdk.org/dev/patchwork/patch/19819/
Of course, dealing with a long mbuf list is not that efficient,
but the application can maintain another structure to
accelerate the access to the middle/end of the list.
Finally, we have other ideas to get additional 8 bits if
required
in
Post by Bruce Richardson
Post by Olivier Matz
the future, so I don't think it's really a problem.
Post by Bruce Richardson
- move seqn in the 2nd cache line
- move refcnt and nb_segs to the 2nd cache line: many
drivers sets them in the Rx path, so it could introduce a
performance regression, or
I wonder can refcnt only be moved into the 2-nd cacheline?
As I understand thanks to other change (from above)
m->refcnt
will
Post by Bruce Richardson
Post by Olivier Matz
Post by Bruce Richardson
already be initialized, so RX code don't need to touch it.
Though yes, it still would require changes in all PMDs.
Yes, I agree, some fields could be moved in the 2nd cache line
once all PMDs stop to write them in RX path. I propose to issue
some guidelines to PMD maintainers at the same time the
patchset is pushed. Then we can consider changing it in a
future version, in case we need more room in the 1st mbuf cache
line.
If we are changing things, we should really do all that now,
rather than storing up future breaks to mbuf. Worst case, we
should plan for it immediately after the release where we make
these changes. Have
two
Post by Bruce Richardson
releases that break mbuf immediately after each other - and flagged
as
Post by Bruce Richardson
such, but keep it stable thereafter. I don't like having technical
debt on mbuf just after we supposedly "fix" it.
I think there is no need to do this change now. And I don't feel
good with the idea of having a patchset that updates all the PMDs
to remove the access to a field because it moved to the 2nd cache
line (especially thinking about vector PMDs).
- push an updated version of this patchset quickly
- advertise to PMD maintainers "you don't need to set the m->next,
m->refcnt, and m->nb_segs in the RX path, please update your drivers"
- later, if we need more room in the 1st cache line of the mbuf, we
can move refcnt and nb_seg, probably without impacting the
performance.
Olivier
I suppose you mean that PMDs don't need to /initialize/ m->next, m->refcnt and m->nb_segs.
Forgive my ignorance, and this is wild speculation, but: Would a PMD
not need to set m->next and m->nb_segs if it receives a jumbogram
larger than an mbuf packet buffer? And if this is a realistic use
case, these fields actually do belong in the 1st cache line. PMD
developers please chime in.
Nothing to add to Bruce's answer :)
Post by Morten Brørup
And I tend to agree with Bruce about making all these mbuf changes in
one go, rather than postponing some of them to later. Especially
because the postponement also closes and reopens the whole discussion
and decision process! (Not initializing a few fields in a PMD cannot
require a lot of work by the PMD developers. Moving the fields to the
2nd cache line will in the worst case degrade the performance of the
non-updated PMDs.)
A two step process makes good sense for the developers of DPDK, but
both steps should be taken within the same release, so they are
transparent to the users of DPDK.
I don't think this is doable, knowing the submission deadline is in
less than 2 weeks. On my side, honestly, I don't want to dive into the
code of all the PMDs. I feel this would be more risky than letting
the PMD maintainers update their own PMD code.

Olivier
Bruce Richardson
2017-02-21 15:18:03 UTC
Permalink
Post by Olivier MATZ
Hi Morten,
On Tue, 21 Feb 2017 15:20:23 +0100, Morten Brørup
Post by Morten Brørup
Comments at the end.
-----Original Message-----
Sent: Thursday, February 16, 2017 5:14 PM
To: Bruce Richardson
Subject: Re: [dpdk-dev] [RFC 0/8] mbuf: structure reorganization
On Thu, 16 Feb 2017 15:46:19 +0000, Bruce Richardson
Post by Bruce Richardson
Post by Olivier Matz
Hi Konstantin,
Thanks for the feedback.
Comments inline.
On Mon, 6 Feb 2017 18:41:27 +0000, "Ananyev, Konstantin"
Post by Bruce Richardson
Hi Olivier,
Looks good in general, some comments from me below.
Thanks
Konstantin
- reorder structure to increase vector performance on some
non-ia platforms.
- add a 64bits timestamp field in the 1st cache line
Wonder why it deserves to be in first cache line?
How it differs from seqn below (pure SW stuff right now).
In case the timestamp is set from a NIC value, it is set in the
Rx path. So that's why I think it deserves to be located in the
1st cache line.
As you said, the seqn is a pure sw stuff right: it is set in a
lib, not in a PMD rx path.
Post by Bruce Richardson
- m->next, m->nb_segs, and m->refcnt are always initialized
for mbufs in the pool, avoiding the need of setting
m->next
(located
Post by Bruce Richardson
Post by Olivier Matz
Post by Bruce Richardson
in the 2nd cache line) in the Rx path for mono-segment packets.
- change port and nb_segs to 16 bits
Not that I am completely against it, but changing nb_segs to
16 bits seems like an overkill to me.
I think we can keep an extra 8 bits for something more useful
in the future.
In my case, I use the m->next field to chain more than 256
segments for L4 socket buffers. It also updates nb_seg that can
overflow. It's not a big issue since at the end, nb_seg is
decremented for each segment. On the other hand, if I enable
some sanity checks on mbufs, it complains because the number of
segments is not equal to nb_seg.
There is also another use case with fragmentation as discussed
recently: http://dpdk.org/dev/patchwork/patch/19819/
Of course, dealing with a long mbuf list is not that efficient,
but the application can maintain another structure to
accelerate the access to the middle/end of the list.
Finally, we have other ideas to get additional 8 bits if required
in
Post by Bruce Richardson
Post by Olivier Matz
the future, so I don't think it's really a problem.
Post by Bruce Richardson
- move seqn in the 2nd cache line
- move refcnt and nb_segs to the 2nd cache line: many
drivers sets them in the Rx path, so it could introduce a
performance regression, or
I wonder can refcnt only be moved into the 2-nd cacheline?
As I understand thanks to other change (from above)
m->refcnt
will
Post by Bruce Richardson
Post by Olivier Matz
Post by Bruce Richardson
already be initialized, so RX code don't need to touch it.
Though yes, it still would require changes in all PMDs.
Yes, I agree, some fields could be moved in the 2nd cache line
once all PMDs stop to write them in RX path. I propose to issue
some guidelines to PMD maintainers at the same time the
patchset is pushed. Then we can consider changing it in a
future version, in case we need more room in the 1st mbuf cache
line.
If we are changing things, we should really do all that now,
rather than storing up future breaks to mbuf. Worst case, we
should plan for it immediately after the release where we make
these changes. Have
two
Post by Bruce Richardson
releases that break mbuf immediately after each other - and flagged
as
Post by Bruce Richardson
such, but keep it stable thereafter. I don't like having technical
debt on mbuf just after we supposedly "fix" it.
I think there is no need to do this change now. And I don't feel
good with the idea of having a patchset that updates all the PMDs
to remove the access to a field because it moved to the 2nd cache
line (especially thinking about vector PMDs).
- push an updated version of this patchset quickly
- advertise to PMD maintainers "you don't need to set the m->next,
m->refcnt, and m->nb_segs in the RX path, please update your drivers"
- later, if we need more room in the 1st cache line of the mbuf, we
can move refcnt and nb_seg, probably without impacting the
performance.
Olivier
I suppose you mean that PMDs don't need to /initialize/ m->next,
m->refcnt and m->nb_segs.
Forgive my ignorance, and this is wild speculation, but: Would a PMD
not need to set m->next and m->nb_segs if it receives a jumbogram
larger than an mbuf packet buffer? And if this is a realistic use
case, these fields actually do belong in the 1st cache line. PMD
developers please chime in.
Nothing to add to Bruce's answer :)
Post by Morten Brørup
And I tend to agree with Bruce about making all these mbuf changes in
one go, rather than postponing some of them to later. Especially
because the postponement also closes and reopens the whole discussion
and decision process! (Not initializing a few fields in a PMD cannot
require a lot of work by the PMD developers. Moving the fields to the
2nd cache line will in the worst case degrade the performance of the
non-updated PMDs.)
A two step process makes good sense for the developers of DPDK, but
both steps should be taken within the same release, so they are
transparent to the users of DPDK.
I don't think this is doable, knowing the submission deadline is in
less than 2 weeks. On my side, honestly, I don't want to dive into the
code of all the PMDs. I feel this would be more risky than letting
the PMD maintainers update their own PMD code.
I, sadly, have to agree here. I think undertaking rework of all PMDs is
a huge job, that probably needs to be shared among the PMD authors.

Regards,
/Bruce
Morten Brørup
2017-02-21 15:18:50 UTC
Permalink
Comments inline.
-----Original Message-----
Sent: Tuesday, February 21, 2017 4:05 PM
To: Morten Brørup
Subject: Re: [dpdk-dev] [RFC 0/8] mbuf: structure reorganization
Hi Morten,
On Tue, 21 Feb 2017 15:20:23 +0100, Morten Brørup
Post by Morten Brørup
Comments at the end.
-----Original Message-----
Sent: Thursday, February 16, 2017 5:14 PM
To: Bruce Richardson
Subject: Re: [dpdk-dev] [RFC 0/8] mbuf: structure reorganization
On Thu, 16 Feb 2017 15:46:19 +0000, Bruce Richardson
Post by Bruce Richardson
Post by Olivier Matz
Hi Konstantin,
Thanks for the feedback.
Comments inline.
On Mon, 6 Feb 2017 18:41:27 +0000, "Ananyev, Konstantin"
Post by Bruce Richardson
Hi Olivier,
Looks good in general, some comments from me below.
Thanks
Konstantin
- reorder structure to increase vector performance on some
non-ia platforms.
- add a 64bits timestamp field in the 1st cache line
Wonder why it deserves to be in first cache line?
How it differs from seqn below (pure SW stuff right now).
In case the timestamp is set from a NIC value, it is set in the
Rx path. So that's why I think it deserve to be located in the
1st cache line.
As you said, the seqn is a pure sw stuff right: it is set in a
lib, not in a PMD rx path.
Post by Bruce Richardson
- m->next, m->nb_segs, and m->refcnt are always initialized
for mbufs in the pool, avoiding the need of setting
m->next
(located
Post by Bruce Richardson
Post by Olivier Matz
Post by Bruce Richardson
in the 2nd cache line) in the Rx path for mono-segment packets.
- change port and nb_segs to 16 bits
Not that I am completely against it, but changing nb_segs to
16 bits seems like an overkill to me.
I think we can keep an extra 8 bits for something more useful
in the future.
In my case, I use the m->next field to chain more than 256
segments for L4 socket buffers. It also updates nb_seg that can
overflow. It's not a big issue since at the end, nb_seg is
decremented for each segment. On the other hand, if I enable
some sanity checks on mbufs, it complains because the number of
segments is not equal to nb_seg.
There is also another use case with fragmentation as discussed
recently: http://dpdk.org/dev/patchwork/patch/19819/
Of course, dealing with a long mbuf list is not that efficient,
but the application can maintain another structure to
accelerate
Post by Morten Brørup
Post by Bruce Richardson
Post by Olivier Matz
the access to the middle/end of the list.
Finally, we have other ideas to get additional 8 bits if
required
in
Post by Bruce Richardson
Post by Olivier Matz
the future, so I don't think it's really a problem.
Post by Bruce Richardson
- move seqn in the 2nd cache line
- move refcnt and nb_segs to the 2nd cache line: many
drivers sets them in the Rx path, so it could introduce a
performance regression, or
I wonder can refcnt only be moved into the 2-nd cacheline?
As I understand thanks to other change (from above)
m->refcnt
will
Post by Bruce Richardson
Post by Olivier Matz
Post by Bruce Richardson
already be initialized, so RX code don't need to touch it.
Though yes, it still would require changes in all PMDs.
Yes, I agree, some fields could be moved in the 2nd cache line
once all PMDs stop to write them in RX path. I propose to issue
some guidelines to PMD maintainers at the same time the
patchset
Post by Morten Brørup
Post by Bruce Richardson
Post by Olivier Matz
is pushed. Then we can consider changing it in a future
version,
Post by Morten Brørup
Post by Bruce Richardson
Post by Olivier Matz
in case we need more room in the 1st mbuf cache line.
If we are changing things, we should really do all that now,
rather than storing up future breaks to mbuf. Worst case, we
should plan for it immediately after the release where we make
these changes. Have
two
Post by Bruce Richardson
releases that break mbuf immediately after each other - and flagged
as
Post by Bruce Richardson
such, but keep it stable thereafter. I don't like having
technical
Post by Morten Brørup
Post by Bruce Richardson
debt on mbuf just after we supposedly "fix" it.
I think there is no need to do this change now. And I don't feel
good with the idea of having a patchset that updates all the PMDs
to
Post by Morten Brørup
remove the access to a field because it moved to the 2nd cache line
(especially thinking about vector PMDs).
- push an updated version of this patchset quickly
- advertise to PMD maintainers "you don't need to set the m->next,
m->refcnt, and m->nb_segs in the RX path, please update your drivers"
- later, if we need more room in the 1st cache line of the mbuf, we
can move refcnt and nb_seg, probably without impacting the
performance.
Olivier
I suppose you mean that PMDs don't need to /initialize/ m->next,
m->refcnt and m->nb_segs.
Forgive my ignorance, and this is wild speculation, but: Would a PMD
not need to set m->next and m->nb_segs if it receives a jumbogram
larger than an mbuf packet buffer? And if this is a realistic use
case, these fields actually do belong in the 1st cache line. PMD
developers please chime in.
Nothing to add to Bruce's answer :)
ACK that!
Post by Morten Brørup
And I tend to agree with Bruce about making all these mbuf changes in
one go, rather than postponing some of them to later. Especially
because the postponement also closes and reopens the whole discussion
and decision process! (Not initializing a few fields in a PMD cannot
require a lot of work by the PMD developers. Moving the fields to the
2nd cache line will in the worst case degrade the performance of the
non-updated PMDs.)
A two step process makes good sense for the developers of DPDK, but
both steps should be taken within the same release, so they are
transparent to the users of DPDK.
I don't think this is doable, knowing the submission deadline is in
less than 2 weeks. On my side, honestly, I don't want to dive into the
code of all the PMDs. I feel this would be more risky than letting the
PMD maintainers update their own PMD code.
Olivier
I was assuming that the work of updating the PMD code according to the new mbuf code would be left to the PMD developers. I.e. you take the first step (of updating the mbuf completely), and the PMD developers/maintainers take the second step.

Although I am not doing any of the actual work here, it seems like a relatively small task for each PMD maintainer to update his PMD accordingly, so perhaps you should ask them directly about the deadline issue. From a project release perspective, completing the mbuf structure reorganization in one release seems better than postponing some of the intended modifications for later.

Med venlig hilsen
Chilikin, Andrey
2017-02-19 19:04:58 UTC
Permalink
While doing this field reshuffling, any chance of putting uint16_t vlan_tci_outer beside uint16_t vlan_tci? It would allow treating QinQ tags as a single 32-bit tag, if needed, and using it for other tags/labels like MPLS, GRE and NSH, which could be 20, 24 or 32 bits wide.

/Andrey
-----Original Message-----
Sent: Tuesday, January 24, 2017 3:19 PM
Subject: [dpdk-dev] [RFC 0/8] mbuf: structure reorganization
Based on discussion done in [1], this patchset reorganizes the mbuf.
- reorder structure to increase vector performance on some non-ia
platforms.
- add a 64bits timestamp field in the 1st cache line
- m->next, m->nb_segs, and m->refcnt are always initialized for mbufs
in the pool, avoiding the need of setting m->next (located in the
2nd cache line) in the Rx path for mono-segment packets.
- change port and nb_segs to 16 bits
- move seqn in the 2nd cache line
- move refcnt and nb_segs to the 2nd cache line: many drivers sets
them in the Rx path, so it could introduce a performance regression, or
it would require to change all the drivers, which is not an easy task.
- remove the m->port field: too much impact on many examples and libraries,
and some people highlighted they are using it.
- moving m->next in the 1st cache line: there is not enough room, and having
it set to NULL for unused mbuf should remove the need for it.
- merge seqn and timestamp together in a union: we could imagine use cases
were both are activated. There is no flag indicating the presence of seqn,
so it looks preferable to keep them separated for now.
I made some basic performance tests (ixgbe) and see no regression, but the
patchset requires more testing.
[1] http://dpdk.org/ml/archives/dev/2016-October/049338.html
mbuf: make rearm data address naturally aligned
mbuf: make segment prefree function public
mbuf: make raw free function public
mbuf: set mbuf fields while in pool
net: don't touch mbuf next or nb segs on Rx
mbuf: use 2 bytes for port and nb segments
mbuf: move sequence number in second cache line
mbuf: add a timestamp field
app/test-pmd/csumonly.c | 4 +-
drivers/net/ena/ena_ethdev.c | 2 +-
drivers/net/enic/enic_rxtx.c | 2 +-
drivers/net/fm10k/fm10k_rxtx.c | 6 +-
drivers/net/fm10k/fm10k_rxtx_vec.c | 9 +-
drivers/net/i40e/i40e_rxtx_vec_common.h | 6 +-
drivers/net/i40e/i40e_rxtx_vec_sse.c | 11 +-
drivers/net/ixgbe/ixgbe_rxtx.c | 10 +-
drivers/net/ixgbe/ixgbe_rxtx_vec_common.h | 6 +-
drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 9 --
drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c | 9 --
drivers/net/mlx5/mlx5_rxtx.c | 11 +-
drivers/net/mpipe/mpipe_tilegx.c | 3 +-
drivers/net/null/rte_eth_null.c | 2 -
drivers/net/virtio/virtio_rxtx.c | 3 -
drivers/net/virtio/virtio_rxtx_simple.h | 6 +-
.../linuxapp/eal/include/exec-env/rte_kni_common.h | 5 +-
lib/librte_mbuf/rte_mbuf.c | 4 +
lib/librte_mbuf/rte_mbuf.h | 114 ++++++++++++++++-----
19 files changed, 124 insertions(+), 98 deletions(-)
--
2.8.1
Olivier MATZ
2017-02-21 09:53:54 UTC
Permalink
Hi Andrey,

On Sun, 19 Feb 2017 19:04:58 +0000, "Chilikin, Andrey"
Post by Chilikin, Andrey
While doing this fields reshuffling, any chance to put uint16_t
vlan_tci_outer beside uint16_t vlan_tci? It will allow to treat QinQ
tags as a single 32-bit tag, if needed, and use it for other
tag/labels like MPLS, GRE, NSH which could be 20, 24, 32 bits wide.
Merging the VLAN fields is a good idea, and it looks like it's feasible, so
I'll add it in the next version. About using the same field for other
tags/labels, I'm a bit more reserved as of now, but it's another
debate :)

Thanks,
Olivier
Jan Blunck
2017-02-16 17:26:39 UTC
Permalink
Post by Olivier Matz
On Mon, 6 Feb 2017 18:41:27 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
- reorder structure to increase vector performance on some non-ia
platforms.
- add a 64bits timestamp field in the 1st cache line
Wonder why it deserves to be in first cache line?
How it differs from seqn below (pure SW stuff right now).
In case the timestamp is set from a NIC value, it is set in the Rx
path. So that's why I think it deserve to be located in the 1st cache
line.
As you said, the seqn is a pure sw stuff right: it is set in a lib, not
in a PMD rx path.
If we talk about setting the timestamp value in the RX path this
implicitly means software timestamps. Hardware timestamping usually
works by letting the hardware inject sync events for coarse time
tracking and additionally injecting fine granular per-packet ticks at
a specific offset in the packet. Out of performance reasons I don't
think it makes sense to extract this during the burst and write it
into the mbuf again.

The problem with timestamps is to get the abstraction right wrt the
correction factors and the size of the tick vs. the timestamp in the
events injected. From my perspective it would be better to extract the
handling of timestamp data into a library with PMD specific
implementation of the conversions. That way the normalized timestamp
values can get extracted if they are present. The mbuf itself would
only indicate the presence of timestamp metadata in that case.
Olivier Matz
2017-02-17 10:51:53 UTC
Permalink
Hi Jan,
On Thu, Feb 16, 2017 at 2:48 PM, Olivier Matz
Post by Olivier Matz
On Mon, 6 Feb 2017 18:41:27 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
- reorder structure to increase vector performance on some non-ia
platforms.
- add a 64bits timestamp field in the 1st cache line
Wonder why it deserves to be in first cache line?
How it differs from seqn below (pure SW stuff right now).
In case the timestamp is set from a NIC value, it is set in the Rx
path. So that's why I think it deserve to be located in the 1st
cache line.
As you said, the seqn is a pure sw stuff right: it is set in a lib,
not in a PMD rx path.
If we talk about setting the timestamp value in the RX path this
implicitly means software timestamps. Hardware timestamping usually
works by letting the hardware inject sync events for coarse time
tracking and additionally injecting fine granular per-packet ticks at
a specific offset in the packet. Out of performance reasons I don't
think it makes sense to extract this during the burst and write it
into the mbuf again.
From what I understand, at least it does not work like this for
Mellanox NICs: the timestamp is metadata attached to an Rx packet. But
maybe they (and other NIC vendors interested in the feature) can
confirm or deny.
The problem with timestamps is to get the abstraction right wrt the
correction factors and the size of the tick vs. the timestamp in the
events injected. From my perspective it would be better to extract the
handling of timestamp data into a library with PMD specific
implementation of the conversions. That way the normalized timestamp
values can get extracted if they are present. The mbuf itself would
only indicate the presence of timestamp metadata in that case.
I agree however that we need to properly define the meaning of this
field. My idea is:

- the timestamp is in nanosecond
- the reference is always the same for a given path: if the timestamp is
set in a PMD, all the packets for this PMD will have the same
reference, but for 2 different PMDs (or a sw lib), the reference
would not be the same.

I think it's enough for many use cases.
We can later add helpers to compare timestamps with different
references.


Regards,
Olivier
Nélio Laranjeiro
2017-02-17 12:49:54 UTC
Permalink
Hi Olivier, Jan,
Post by Olivier Matz
Hi Jan,
On Thu, Feb 16, 2017 at 2:48 PM, Olivier Matz
Post by Olivier Matz
On Mon, 6 Feb 2017 18:41:27 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
- reorder structure to increase vector performance on some non-ia
platforms.
- add a 64bits timestamp field in the 1st cache line
Wonder why it deserves to be in first cache line?
How it differs from seqn below (pure SW stuff right now).
In case the timestamp is set from a NIC value, it is set in the Rx
path. So that's why I think it deserve to be located in the 1st
cache line.
As you said, the seqn is a pure sw stuff right: it is set in a lib,
not in a PMD rx path.
If we talk about setting the timestamp value in the RX path this
implicitly means software timestamps. Hardware timestamping usually
works by letting the hardware inject sync events for coarse time
tracking and additionally injecting fine granular per-packet ticks at
a specific offset in the packet. Out of performance reasons I don't
think it makes sense to extract this during the burst and write it
into the mbuf again.
From what I understand, at least it does not work like this for
Mellanox NICs: the timestamp is metadata attached to an Rx packet. But
maybe they (and other NIC vendors interested in the feature) can
confirm or deny.
Olivier is right: this timestamp information is returned by the hardware
like the other fields describing the Rx packet (length, RSS hash, checksum,
...). The PMD only copies it into the mbuf.
Post by Olivier Matz
The problem with timestamps is to get the abstraction right wrt the
correction factors and the size of the tick vs. the timestamp in the
events injected. From my perspective it would be better to extract the
handling of timestamp data into a library with PMD specific
implementation of the conversions. That way the normalized timestamp
values can get extracted if they are present. The mbuf itself would
only indicate the presence of timestamp metadata in that case.
I agree however that we need to properly define the meaning of this
- the timestamp is in nanosecond
- the reference is always the same for a given path: if the timestamp is
set in a PMD, all the packets for this PMD will have the same
reference, but for 2 different PMDs (or a sw lib), the reference
would not be the same.
I think it's enough for many use cases.
We can later add helpers to compare timestamps with different
references.
Regards,
Olivier
Regards,
--
Nélio Laranjeiro
6WIND
Jan Blunck
2017-02-17 13:51:57 UTC
Permalink
On Fri, Feb 17, 2017 at 1:49 PM, Nélio Laranjeiro
Post by Nélio Laranjeiro
Hi Olivier, Jan,
Post by Olivier Matz
Hi Jan,
Post by Jan Blunck
If we talk about setting the timestamp value in the RX path this
implicitly means software timestamps. Hardware timestamping usually
works by letting the hardware inject sync events for coarse time
tracking and additionally injecting fine granular per-packet ticks at
a specific offset in the packet. Out of performance reasons I don't
think it makes sense to extract this during the burst and write it
into the mbuf again.
From what I understand, at least it does not work like this for
Mellanox NICs: the timestamp is metadata attached to an Rx packet. But
maybe they (and other NIC vendors interested in the feature) can
confirm or deny.
Olivier is right: this timestamp information is returned by the hardware
like the other fields describing the Rx packet (length, RSS hash, checksum,
...). The PMD only copies it into the mbuf.
Indeed, for Mellanox the timestamp is stored in the CQ entry.
Solarflares write it relative to the packet header.
Andrew Rybchenko
2017-02-18 05:48:48 UTC
Permalink
Post by Jan Blunck
On Fri, Feb 17, 2017 at 1:49 PM, Nélio Laranjeiro
Post by Nélio Laranjeiro
Hi Olivier, Jan,
Post by Olivier Matz
Hi Jan,
Post by Jan Blunck
If we talk about setting the timestamp value in the RX path this
implicitly means software timestamps. Hardware timestamping usually
works by letting the hardware inject sync events for coarse time
tracking and additionally injecting fine granular per-packet ticks at
a specific offset in the packet. Out of performance reasons I don't
think it makes sense to extract this during the burst and write it
into the mbuf again.
From what I understand, at least it does not work like this for
Mellanox NICs: the timestamp is metadata attached to an Rx packet. But
maybe they (and other NIC vendors interested in the feature) can
confirm or deny.
Olivier is right: this timestamp information is returned by the hardware
like the other fields describing the Rx packet (length, RSS hash, checksum,
...). The PMD only copies it into the mbuf.
Indeed, for Mellanox the timestamp is stored in the CQ entry.
Solarflares write it relative to the packet header.
Confirmed. We have pseudo-header just before the packet itself and
timestamp is put to pseudo-header by the HW.
Jan Blunck
2017-02-17 13:38:32 UTC
Permalink
Post by Olivier Matz
Hi Jan,
On Thu, Feb 16, 2017 at 2:48 PM, Olivier Matz
Post by Olivier Matz
On Mon, 6 Feb 2017 18:41:27 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
- reorder structure to increase vector performance on some non-ia
platforms.
- add a 64bits timestamp field in the 1st cache line
Wonder why it deserves to be in first cache line?
How it differs from seqn below (pure SW stuff right now).
In case the timestamp is set from a NIC value, it is set in the Rx
path. So that's why I think it deserve to be located in the 1st
cache line.
As you said, the seqn is a pure sw stuff right: it is set in a lib,
not in a PMD rx path.
If we talk about setting the timestamp value in the RX path this
implicitly means software timestamps. Hardware timestamping usually
works by letting the hardware inject sync events for coarse time
tracking and additionally injecting fine granular per-packet ticks at
a specific offset in the packet. Out of performance reasons I don't
think it makes sense to extract this during the burst and write it
into the mbuf again.
From what I understand, at least it does not work like this for
Mellanox NICs: the timestamp is metadata attached to an Rx packet. But
maybe they (and other NIC vendors interested in the feature) can
confirm or deny.
Mellanox NICs use a 48bit cycle counter split into a high and low
part. To convert the cycle values into a timestamp you need to
initialize and maintain a timecounter that converts the cycle count
to e.g. nanoseconds. IIRC Mellanox doesn't generate explicit clock events
but the cycle counter is large enough so that the user can easily
maintain the timecounter by manually updating it.
Post by Olivier Matz
The problem with timestamps is to get the abstraction right wrt the
correction factors and the size of the tick vs. the timestamp in the
events injected. From my perspective it would be better to extract the
handling of timestamp data into a library with PMD specific
implementation of the conversions. That way the normalized timestamp
values can get extracted if they are present. The mbuf itself would
only indicate the presence of timestamp metadata in that case.
I agree however that we need to properly define the meaning of this
- the timestamp is in nanosecond
- the reference is always the same for a given path: if the timestamp is
set in a PMD, all the packets for this PMD will have the same
reference, but for 2 different PMDs (or a sw lib), the reference
would not be the same.
I think it's enough for many use cases.
We can later add helpers to compare timestamps with different
references.
My point is that I still doubt that it belongs into the first
cacheline. It requires accessing other structures for converting into
nanoseconds anyway. Optimally I would like to see this happening on
access instead but if that isn't achievable at least in a second step.
Olivier Matz
2017-02-17 14:17:08 UTC
Permalink
Hi Jan,
On Fri, Feb 17, 2017 at 11:51 AM, Olivier Matz
Post by Olivier Matz
Hi Jan,
On Thu, 16 Feb 2017 18:26:39 +0100, Jan Blunck
On Thu, Feb 16, 2017 at 2:48 PM, Olivier Matz
Post by Olivier Matz
On Mon, 6 Feb 2017 18:41:27 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
- reorder structure to increase vector performance on some
non-ia platforms.
- add a 64bits timestamp field in the 1st cache line
Wonder why it deserves to be in first cache line?
How it differs from seqn below (pure SW stuff right now).
In case the timestamp is set from a NIC value, it is set in the
Rx path. So that's why I think it deserve to be located in the
1st cache line.
As you said, the seqn is a pure sw stuff right: it is set in a
lib, not in a PMD rx path.
If we talk about setting the timestamp value in the RX path this
implicitly means software timestamps. Hardware timestamping usually
works by letting the hardware inject sync events for coarse time
tracking and additionally injecting fine granular per-packet ticks
at a specific offset in the packet. Out of performance reasons I
don't think it makes sense to extract this during the burst and
write it into the mbuf again.
From what I understand, at least it does not work like this for
mellanox NICs: timestamp is a metadata attached to a rx packet. But
maybe they (and other NIC vendors interested in the feature) can
confirm or not.
Mellanox NICs use a 48bit cycle counter split into a high and low
part. To convert the cycle values into a timestamp you need to
initialize and maintain a timecounter that shifts the cycle count
e.g. nanosecs. IIRC Mellanox doesn't generate explicit clock events
but the cycle counter is large enough so that the user can easily
maintain the timecounter by manually updating it.
Post by Olivier Matz
The problem with timestamps is to get the abstraction right wrt the
correction factors and the size of the tick vs. the timestamp in
the events injected. From my perspective it would be better to
extract the handling of timestamp data into a library with PMD
specific implementation of the conversions. That way the
normalized timestamp values can get extracted if they are present.
The mbuf itself would only indicate the presence of timestamp
metadata in that case.
I agree however that we need to properly define the meaning of this
- the timestamp is in nanosecond
- the reference is always the same for a given path: if the
timestamp is set in a PMD, all the packets for this PMD will have
the same reference, but for 2 different PMDs (or a sw lib), the
reference would not be the same.
I think it's enough for many use cases.
We can later add helpers to compare timestamps with different
references.
My point is that I still doubt that it belongs into the first
cacheline. It requires accessing other structures for converting into
nanoseconds anyway. Optimally I would like to see this happening on
access instead but if that isn't achievable at least in a second step.
Sorry, I don't really get your point. My comprehension of the timestamp
usage in a PMD is as following:

rx_burst(struct rxq *rxq, ...)
{
unsigned long factor = rxq->timestamp_factor;
unsigned port = rxq->port;

for each hw_desc {
m = rte_pktmbuf_alloc(rxq->pool);
m->len = hw_desc->len;
m->port = port;
m->ol_flags =
...
m->timestamp = hw_desc->timestamp * factor;
}
...
}

In that case, I think it deserves to be in the 1st cache line.


Olivier
Ananyev, Konstantin
2017-02-17 18:42:01 UTC
Permalink
Hi guys,
-----Original Message-----
Sent: Friday, February 17, 2017 2:17 PM
Subject: Re: [dpdk-dev] [RFC 0/8] mbuf: structure reorganization
Hi Jan,
On Fri, Feb 17, 2017 at 11:51 AM, Olivier Matz
Post by Olivier Matz
Hi Jan,
On Thu, 16 Feb 2017 18:26:39 +0100, Jan Blunck
On Thu, Feb 16, 2017 at 2:48 PM, Olivier Matz
Post by Olivier Matz
On Mon, 6 Feb 2017 18:41:27 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
- reorder structure to increase vector performance on some
non-ia platforms.
- add a 64bits timestamp field in the 1st cache line
Wonder why it deserves to be in first cache line?
How it differs from seqn below (pure SW stuff right now).
In case the timestamp is set from a NIC value, it is set in the
Rx path. So that's why I think it deserve to be located in the
1st cache line.
As you said, the seqn is a pure sw stuff right: it is set in a
lib, not in a PMD rx path.
If we talk about setting the timestamp value in the RX path this
implicitly means software timestamps. Hardware timestamping usually
works by letting the hardware inject sync events for coarse time
tracking and additionally injecting fine granular per-packet ticks
at a specific offset in the packet. Out of performance reasons I
don't think it makes sense to extract this during the burst and
write it into the mbuf again.
From what I understand, at least it does not work like this for
mellanox NICs: timestamp is a metadata attached to a rx packet. But
maybe they (and other NIC vendors interested in the feature) can
confirm or not.
Mellanox NICs use a 48bit cycle counter split into a high and low
part. To convert the cycle values into a timestamp you need to
initialize and maintain a timecounter that shifts the cycle count
e.g. nanosecs. IIRC Mellanox doesn't generate explicit clock events
but the cycle counter is large enough so that the user can easily
maintain the timecounter by manually updating it.
Post by Olivier Matz
The problem with timestamps is to get the abstraction right wrt the
correction factors and the size of the tick vs. the timestamp in
the events injected. From my perspective it would be better to
extract the handling of timestamp data into a library with PMD
specific implementation of the conversions. That way the
normalized timestamp values can get extracted if they are present.
The mbuf itself would only indicate the presence of timestamp
metadata in that case.
I agree however that we need to properly define the meaning of this
- the timestamp is in nanosecond
- the reference is always the same for a given path: if the
timestamp is set in a PMD, all the packets for this PMD will have
the same reference, but for 2 different PMDs (or a sw lib), the
reference would not be the same.
I think it's enough for many use cases.
We can later add helpers to compare timestamps with different
references.
My point is that I still doubt that it belongs into the first
cacheline. It requires accessing other structures for converting into
nanoseconds anyway. Optimally I would like to see this happening on
access instead but if that isn't achievable at least in a second step.
Sorry, I don't really get your point. My comprehension of the timestamp
rx_burst(struct rxq *rxq, ...)
{
unsigned long factor = rxq->timestamp_factor;
unsigned port = rxq->port;
for each hw_desc {
m = rte_pktmbuf_alloc(rxq->pool);
m->len = hw_desc->len;
m->port = port;
m->ol_flags =
...
m->timestamp = hw_desc->timestamp * factor;
}
...
}
In that case, I think it deserves to be in the 1st cache line.
So you are saying that:
- for some HW that DPDK supports (mlx?) timestamp information
Is available in HW RX descriptor
- and as soon timestamp field will be available in mbuf, you plan
to populate it using this HW RXD field.
Is that so?
Konstantin
Olivier MATZ
2017-02-21 09:53:49 UTC
Permalink
Hi Konstantin,

On Fri, 17 Feb 2017 18:42:01 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
Hi guys,
Post by Olivier Matz
Post by Jan Blunck
My point is that I still doubt that it belongs into the first
cacheline. It requires accessing other structures for converting
into nanoseconds anyway. Optimally I would like to see this
happening on access instead but if that isn't achievable at least
in a second step.
Sorry, I don't really get your point. My comprehension of the
rx_burst(struct rxq *rxq, ...)
{
unsigned long factor = rxq->timestamp_factor;
unsigned port = rxq->port;
for each hw_desc {
m = rte_pktmbuf_alloc(rxq->pool);
m->len = hw_desc->len;
m->port = port;
m->ol_flags =
...
m->timestamp = hw_desc->timestamp * factor;
}
...
}
In that case, I think it deserves to be in the 1st cache line.
- for some HW that DPDK supports (mlx?) timestamp information
Is available in HW RX descriptor
- and as soon timestamp field will be available in mbuf, you plan
to populate it using this HW RXD field.
Is that so?
Yes, that's what I'm seeing in mellanox's patchset:
http://dpdk.org/ml/archives/dev/2016-October/048810.html

Do you know if Intel has plans to support some sort of timestamp using
this timestamp field?


Thanks,
Olivier
Ananyev, Konstantin
2017-02-21 10:28:00 UTC
Permalink
Hi Olivier,
-----Original Message-----
Sent: Tuesday, February 21, 2017 9:54 AM
Subject: Re: [dpdk-dev] [RFC 0/8] mbuf: structure reorganization
Hi Konstantin,
On Fri, 17 Feb 2017 18:42:01 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
Hi guys,
Post by Olivier Matz
Post by Jan Blunck
My point is that I still doubt that it belongs into the first
cacheline. It requires accessing other structures for converting
into nanoseconds anyway. Optimally I would like to see this
happening on access instead but if that isn't achievable at least
in a second step.
Sorry, I don't really get your point. My comprehension of the
rx_burst(struct rxq *rxq, ...)
{
unsigned long factor = rxq->timestamp_factor;
unsigned port = rxq->port;
for each hw_desc {
m = rte_pktmbuf_alloc(rxq->pool);
m->len = hw_desc->len;
m->port = port;
m->ol_flags =
...
m->timestamp = hw_desc->timestamp * factor;
}
...
}
In that case, I think it deserves to be in the 1st cache line.
- for some HW that DPDK supports (mlx?) timestamp information
Is available in HW RX descriptor
- and as soon timestamp field will be available in mbuf, you plan
to populate it using this HW RXD field.
Is that so?
http://dpdk.org/ml/archives/dev/2016-October/048810.html
Have to admit that I saw that patch before, but missed the fact that
timestamp value is taken from HW RXD.
Ok, then I suppose I don't' have any good reason to object against
putting it in the first cache-line.
Do you know if Intel has plans to support some sort of timestamp using
this timestamp field?
As I remember, metrics library uses it, but right now it is filled by SW.
Konstantin
Jan Blunck
2017-02-20 09:27:40 UTC
Permalink
Post by Olivier Matz
Sorry, I don't really get your point. My comprehension of the timestamp
rx_burst(struct rxq *rxq, ...)
{
unsigned long factor = rxq->timestamp_factor;
unsigned port = rxq->port;
for each hw_desc {
m = rte_pktmbuf_alloc(rxq->pool);
m->len = hw_desc->len;
m->port = port;
m->ol_flags =
...
m->timestamp = hw_desc->timestamp * factor;
}
...
}
In that case, I think it deserves to be in the 1st cache line.
Timestamps are non-functional data. I believe they don't deserve to be
data normalized.

It would be beneficial to normalize the access of non-functional data.
That would require some kind of extension or feature flags, e.g. like
some uverb structures support extensions. This would allow the NICs
that support timestamps to do the minimal amount of work during burst
and defer data normalization to the time of access. That would also
save everyone from wasting 64 bits in the first cacheline in case
timestamps are not supported or unused.
Olivier MATZ
2017-02-21 09:54:00 UTC
Permalink
Hi Jan,
On Fri, Feb 17, 2017 at 3:17 PM, Olivier Matz
Post by Olivier Matz
Sorry, I don't really get your point. My comprehension of the
rx_burst(struct rxq *rxq, ...)
{
unsigned long factor = rxq->timestamp_factor;
unsigned port = rxq->port;
for each hw_desc {
m = rte_pktmbuf_alloc(rxq->pool);
m->len = hw_desc->len;
m->port = port;
m->ol_flags =
...
m->timestamp = hw_desc->timestamp * factor;
}
...
}
In that case, I think it deserves to be in the 1st cache line.
Timestamps are non-functional data. I believe they don't deserve to be
data normalized.
I don't really see why timestamp is non-functional, compared to rss
hash, flow director id, packet type, seqn, ...

I think the goal of DPDK is to provide a generic/normalized API for
such features (like for offload), so it can be used by an application
whatever the underlying driver.
It would be beneficial to normalize the access of non-functional data.
That would require some kind of extension or feature flags, e.g. like
some uverb structures support extensions. This would allow the NICs
that support timestamps to do the minimal amount of work during burst
and defer data normalization to the time of access. That would also
save everyone from wasting 64 bits in the first cacheline in case
timestamps are not supported or unused.
I agree that we could start to think about "extensible" mbufs. In few
words, I think we could have:
- only general fields are static
- the application registers room in mbuf structure for all features it
will use
It would rationalize the room used in mbuf, at the cost of a more
complex/slow access to the data in PMDs and application, since the
offset won't be static.

But to me, this is another work, and a much harder one since it will
impact a lot of PMD code (some are using vector instructions and are
quite hard to change). Moreover, we first have to prove that it would
perform better than what we have now, especially knowing that the size of the
cache line will probably change to 128B for many architectures.

Now back on the patch, yes, it's true we are consuming 64bits in the
first cache line. But what are the other options? Having it in the
second cache line would allow freeing this space for future use. But don't
you think we could optimize current use, and rework the structure if
new needs -- that are more important than timestamp for the majority--
appear?


Thanks,
Olivier
Jan Blunck
2017-02-21 16:12:12 UTC
Permalink
Post by Olivier Matz
Hi Jan,
On Fri, Feb 17, 2017 at 3:17 PM, Olivier Matz
Post by Olivier Matz
Sorry, I don't really get your point. My comprehension of the
rx_burst(struct rxq *rxq, ...)
{
unsigned long factor = rxq->timestamp_factor;
unsigned port = rxq->port;
for each hw_desc {
m = rte_pktmbuf_alloc(rxq->pool);
m->len = hw_desc->len;
m->port = port;
m->ol_flags =
...
m->timestamp = hw_desc->timestamp * factor;
}
...
}
In that case, I think it deserves to be in the 1st cache line.
Timestamps are non-functional data. I believe they don't deserve to be
data normalized.
I don't really see why timestamp is non-functional, compared to rss
hash, flow director id, packet type, seqn, ...
One doesn't drop, forward, reorder or decrypt packets based on the
timestamp value.

Common use cases for timestamps are analytics applications or circuit
breakers where you want to ensure that your algos don't act upon stale
data. This is more application layer / slow path work.
Post by Olivier Matz
I think the goal of DPDK is to provide a generic/normalized API for
such features (like for offload), so it can be used by an application
whatever the underlying driver.
I believe there is a difference between dataplane-relevant
functionality and non-functional features. If you structurally
normalize the latter you will end up making everyone pay for a feature
that is only relevant to a particular group of users. In that case I
would at least expect that the support for timestamps can get selected
at compile time.
Post by Olivier Matz
It would be beneficial to normalize the access of non-functional data.
That would require some kind of extension or feature flags, e.g. like
some uverb structures support extensions. This would allow the NICs
that support timestamps to do the minimal amount of work during burst
and defer data normalization to the time of access. That would also
save everyone from wasting 64 bits in the first cacheline in case
timestamps are not supported or unused.
I agree that we could start to think about "extensible" mbufs. In few
- only general fields are static
- the application registers room in mbuf structure for all features it
will use
It would rationalize the room used in mbuf, at the cost of a more
complex/slow access to the data in PMDs and application, since the
offset won't be static.
Access through PMD specific function pointers should be relatively
fast on access. Modern architecture optimize that use case well
enough.
Post by Olivier Matz
But to me, this is another work, and a much harder one since it will
impact a lot of PMD code (some are using vector instructions and are
quite hard to change). Moreover, we first have to prove that it would
perform better than what we have now, especially knowing that the size of the
cache line will probably change to 128B for many architectures.
Now back on the patch, yes, it's true we are consuming 64bits in the
first cache line. But what are the other options? Having it in the
second cache line would allow freeing this space for future use. But don't
you think we could optimize current use, and rework the structure if
new needs -- that are more important than timestamp for the majority--
appear?
What is you plan for devices that already put the timestamp relative
to the packet data? Do you still want to duplicate and normalize it
into the rte_mbuf field?

What is recommended to do for PMDs that don't support HW timestamps?
Do they still fill this field?
Bruce Richardson
2017-02-21 16:38:09 UTC
Permalink
Post by Jan Blunck
Post by Olivier Matz
Hi Jan,
On Fri, Feb 17, 2017 at 3:17 PM, Olivier Matz
Post by Olivier Matz
Sorry, I don't really get your point. My comprehension of the
rx_burst(struct rxq *rxq, ...)
{
unsigned long factor = rxq->timestamp_factor;
unsigned port = rxq->port;
for each hw_desc {
m = rte_pktmbuf_alloc(rxq->pool);
m->len = hw_desc->len;
m->port = port;
m->ol_flags =
...
m->timestamp = hw_desc->timestamp * factor;
}
...
}
In that case, I think it deserves to be in the 1st cache line.
Timestamps are non-functional data. I believe they don't deserve to be
data normalized.
I don't really see why timestamp is non-functional, compared to rss
hash, flow director id, packet type, seqn, ...
One doesn't drop, forward, reorder or decrypt packets based on the
timestamp value.
Common use cases for timestamps are analytics applications or circuit
breakers where you want to ensure that your algos don't act upon stale
data. This is more application layer / slow path work.
Post by Olivier Matz
I think the goal of DPDK is to provide a generic/normalized API for
such features (like for offload), so it can be used by an application
whatever the underlying driver.
I believe there is a difference between dataplane-relevant
functionality and non-functional features. If you structurally
normalize the latter you will end up making everyone pay for a feature
that is only relevant to a particular group of users. In that case I
would at least expect that the support for timestamps can get selected
at compile time.
Post by Olivier Matz
It would be beneficial to normalize the access of non-functional data.
That would require some kind of extension or feature flags, e.g. like
some uverb structures support extensions. This would allow the NICs
that support timestamps to do the minimal amount of work during burst
and defer data normalization to the time of access. That would also
save everyone from wasting 64 bits in the first cacheline in case
timestamps are not supported or unused.
I agree that we could start to think about "extensible" mbufs. In few
- only general fields are static
- the application registers room in mbuf structure for all features it
will use
It would rationalize the room used in mbuf, at the cost of a more
complex/slow access to the data in PMDs and application, since the
offset won't be static.
Access through PMD specific function pointers should be relatively
fast on access. Modern architecture optimize that use case well
enough.
The cost of doing a function call per packet to access data starts to
add up very, very fast. For the app, once the data is written to the
mbuf, it should be in the L1 cache, giving very fast access to it in a
few cycles. However, if a function call has to be made in order to do
the read, that makes the read of that field many times more expensive.

/Bruce
Jan Blunck
2017-02-21 17:04:48 UTC
Permalink
On Tue, Feb 21, 2017 at 5:38 PM, Bruce Richardson
Post by Bruce Richardson
Post by Jan Blunck
Access through PMD specific function pointers should be relatively
fast on access. Modern architecture optimize that use case well
enough.
The cost of doing a function call per packet to access data starts to
add up very, very fast. For the app, once the data is written to the
mbuf, it should be in the L1 cache, giving very fast access to it in a
few cycles. However, if a function call has to be made in order to do
the read, that makes the read of that field many times more expensive.
Exactly. Right now the timestamp normalization is done before writing
to each mbuf. Timestamps are usually read at most once ... if at all.
If you look at the analysis use cases they are read to be written to
persistent storage. My impression is we optimize this on the wrong
end.
Ananyev, Konstantin
2017-02-21 17:26:14 UTC
Permalink
Hi Jan,
-----Original Message-----
Sent: Tuesday, February 21, 2017 5:05 PM
Subject: Re: [dpdk-dev] [RFC 0/8] mbuf: structure reorganization
On Tue, Feb 21, 2017 at 5:38 PM, Bruce Richardson
Post by Bruce Richardson
Post by Jan Blunck
Access through PMD specific function pointers should be relatively
fast on access. Modern architecture optimize that use case well
enough.
The cost of doing a function call per packet to access data starts to
add up very, very fast. For the app, once the data is written to the
mbuf, it should be in the L1 cache, giving very fast access to it in a
few cycles. However, if a function call has to be made in order to do
the read, that makes the read of that field many times more expensive.
Exactly. Right now the timestamp normalization is done before writing
to each mbuf. Timestamps are usually read at most once ... if at all.
Well we don't know for sure right?
Someone can argue that there are plenty of scenarios when
other fields might also never be used/updated (rss, vlan, etc).

So, are you suggesting to do normalization later?
If so, then what would be the benefit (data still need to be in mbuf)?
If you look at the analysis use cases they are read to be written to
persistent storage.
Probably, or some statistic calculations, I guess.
Or might be someone would use it to reorder packets before sending
them out based on the timestamp, or might be something else.
It is really hard to predict what use cases would come up
Jan Blunck
2017-02-21 19:17:42 UTC
Permalink
On Tue, Feb 21, 2017 at 6:26 PM, Ananyev, Konstantin
Post by Olivier Matz
Hi Jan,
-----Original Message-----
Sent: Tuesday, February 21, 2017 5:05 PM
Subject: Re: [dpdk-dev] [RFC 0/8] mbuf: structure reorganization
On Tue, Feb 21, 2017 at 5:38 PM, Bruce Richardson
Post by Bruce Richardson
Post by Jan Blunck
Access through PMD specific function pointers should be relatively
fast on access. Modern architecture optimize that use case well
enough.
The cost of doing a function call per packet to access data starts to
add up very, very fast. For the app, once the data is written to the
mbuf, it should be in the L1 cache, giving very fast access to it in a
few cycles. However, if a function call has to be made in order to do
the read, that makes the read of that field many times more expensive.
Exactly. Right now the timestamp normalization is done before writing
to each mbuf. Timestamps are usually read at most once ... if at all.
Well we don't know for sure right?
Someone can argue that there are plenty of scenarios when
other fields might also never be used/updated (rss, vlan, etc).
So, are you suggesting to do normalization later?
If so, then what would be the benefit (data still need to be in mbuf)?
Yes, postponing normalization prevents you from doing unnecessary work
upfront. AFAIK not all NICs store timestamp data OOB, e.g. in CQ.
Post by Olivier Matz
If you look at the analysis use cases they are read to be written to
persistent storage.
Probably, or some statistic calculations, I guess.
Or might be someone would use it to reorder packets before sending
them out based on the timestamp, or might be something else.
Those timestamps are generated at arrival in the hardware. So that
order would be relative to the point of reception and in case you want
to resync flows that doesn't help much.

For statistics calculation and the resync use case you also might want
the raw timestamp (not normalized to nsec) and offload the
normalization out of band to reduce overall latency.
Post by Olivier Matz
It is really hard to predict what use cases would come up (at least for me).
Indeed. I also just have the limited view of what I used timestamps
for so far. Who knows what use cases other people come up with in the
future.
Ananyev, Konstantin
2017-02-21 20:30:57 UTC
Permalink
-----Original Message-----
Sent: Tuesday, February 21, 2017 7:18 PM
Subject: Re: [dpdk-dev] [RFC 0/8] mbuf: structure reorganization
On Tue, Feb 21, 2017 at 6:26 PM, Ananyev, Konstantin
Post by Olivier Matz
Hi Jan,
-----Original Message-----
Sent: Tuesday, February 21, 2017 5:05 PM
Subject: Re: [dpdk-dev] [RFC 0/8] mbuf: structure reorganization
On Tue, Feb 21, 2017 at 5:38 PM, Bruce Richardson
Post by Bruce Richardson
Post by Jan Blunck
Access through PMD specific function pointers should be relatively
fast on access. Modern architecture optimize that use case well
enough.
The cost of doing a function call per packet to access data starts to
add up very, very fast. For the app, once the data is written to the
mbuf, it should be in the L1 cache, giving very fast access to it in a
few cycles. However, if a function call has to be made in order to do
the read, that makes the read of that field many times more expensive.
Exactly. Right now the timestamp normalization is done before writing
to each mbuf. Timestamps are usually read at most once ... if at all.
Well we don't know for sure right?
Someone can argue that there are plenty of scenarios when
other fields might also never be used/updated (rss, vlan, etc).
So, are you suggesting to do normalization later?
If so, then what would be the benefit (data still need to be in mbuf)?
Yes, postponing normalization prevents you from doing unnecessary work
upfront. AFAIK not all NICs store timestamp data OOB, e.g. in CQ.
Yes, postponing normalization might help a bit (though I don't think much)
in terms of calculations performed inside PMD.
But we still need 8B inside mbuf to store the timestamp value,
either normalized or raw one.
So to clarify where is the disagreement:
1. timestamp position:
mbufs 1-st cacheline vs 2-nd cacheline
2. timestamp normalization point
inside PMD RX vs somewhere later as user needs it (extra function in dev_ops?).

So, is it 1) or 2) or both?
Konstantin
Post by Olivier Matz
If you look at the analysis use cases they are read to be written to
persistent storage.
Probably, or some statistic calculations, I guess.
Or might be someone would use it to reorder packets before sending
them out based on the timestamp, or might be something else.
Those timestamps are generated at arrival in the hardware. So that
order would be relative to the point of reception and in case you want
to resync flows that doesn't help much.
For statistics calculation and the resync use case you also might want
the raw timestamp (not normalized to nsec) and offload the
normalization out of band to reduce overall latency.
Post by Olivier Matz
It really hard to predict what use cases would come up (at least for me).
Indeed. I also just have the limited view of what I used timestamps
for so far. Who knows what use cases other people
Morten Brørup
2017-02-21 21:51:00 UTC
Permalink
Regarding m->timestamp I have previously argued for keeping it NIC specific, and not normalizing it. But I have changed my mind: Normalizing it gives the user the ability to transparently swap out a NIC from one vendor with one from another vendor. And with a hardware timestamp from the NIC, the normalization only involves multiplying by a constant factor, as Olivier pointed out previously. So if the resolution is high enough, a normalized value is preferable. 1 ns is roughly 10 bytes at 100 Gbit/s, so I suppose a resolution of 1 ns suffices.

But how about NICs without hardware timestamps...

1. Are their PMDs supposed to set the timestamp or not, and are they supposed to ensure that two packets received by the same port do not carry the same timestamp?

2. And if the CPU clock frequency is not constant, normalizing a software generated timestamp is not just a matter of multiplying the CPU's cycle counter with a constant factor - which could be important if the timestamps are used for some sort of metrics analysis. (I have no knowledge about such use cases, I'm just mentioning potential pitfalls here.)

I guess a lot of NICs aren't configured to provide packet timestamps, so in order to avoid code duplication in a bunch of PMDs, a software timestamping library (or common set of helper functions) might be handy for the PMDs.


Furthermore, the timers on separate NICs will be unsynchronized anyway (regardless if the timestamps are generated by hardware or software), so the timestamps are out of order when considering multiple ingress ports anyway.

Generally, I support the idea of making the somewhat exotic features compile time optional. In that context, it is a question of defining what is common, and what is exotic. But +1 to Jan's suggestion about making it compile time optional for the PMDs to set the m->timestamp, since they are probably not used by typical data plane packet forwarding applications, and they cost a few instruction cycles for each packet. Even though this cost is small, adding a more such exotic features with small individual costs will eventually make their total cost significant.


Me
Olivier Matz
2017-02-24 14:11:54 UTC
Permalink
On Tue, 21 Feb 2017 22:51:00 +0100, Morten Brørup
Post by Morten Brørup
Regarding m->timestamp I have previously argued for keeping it NIC
Normalizing it gives the user the ability to transparently swap
out a NIC from one vendor with one from another vendor. And with a
hardware timestamp from the NIC, the normalization only involves
multiplying by a constant factor, as Olivier pointed out previously.
So if the resolution is high enough, a normalized value is
preferable. 1 ns is roughly 10 bytes at 100 Gbit/s, so I suppose a
resolution of 1 ns suffices.
But how about NICs without hardware timestamps...
1. Are their PMDs supposed to set the timestamp or not, and are they
supposed to ensure that two packets received by the same port do not
carry the same timestamp?
The timestamp would only be set if the user asks for it through an
ethdev API. For NICs that do not support timestamps, the PMD can either
not support it, or implement it in software. I don't think they should
ensure that two packets do not carry the same timestamp. It depends on
the unit, and on the precision of the measure.
Post by Morten Brørup
2. And if the CPU clock frequency is not constant, normalizing a
software generated timestamp is not just a matter of multiplying the
CPU's cycle counter with a constant factor - which could be important
if the timestamps are used for some sort of metrics analysis. (I have
no knowledge about such use cases, I'm just mentioning potential
pitfalls here.)
Since timestamp is a time reference, its unit has to be a constant
clock. On the CPUs I know, even when the internal frequency is not
constant, there is a also time reference with a constant clock (ex:
the tsc on Intel).
Post by Morten Brørup
I guess a lot of NICs aren't configured to provide packet timestamps,
so in order to avoid code duplication in a bunch of PMDs, a software
timestamping library (or common set of helper functions) might be
handy for the PMDs.
Yes
Post by Morten Brørup
Furthermore, the timers on separate NICs will be unsynchronized
anyway (regardless if the timestamps are generated by hardware or
software), so the timestamps are out of order when considering
multiple ingress ports anyway.
Generally, I support the idea of making the somewhat exotic features
compile time optional. In that context, it is a question of defining
what is common, and what is exotic. But +1 to Jan's suggestion about
making it compile time optional for the PMDs to set the m->timestamp,
since they are probably not used by typical data plane packet
forwarding applications, and they cost a few instruction cycles for
each packet. Even though this cost is small, adding a more such
exotic features with small individual costs will eventually make
their total cost significant.
I don't agree. Having compile-time options is something we should try
to avoid (knowing the DPDK is also a set of libraries provided by
distros). If the timestamp can be enabled/disabled at port
initialization, I think the cost of the feature will be negligible (it
can be one test for a bulk of packets).


Olivier
Olivier Matz
2017-02-24 14:00:53 UTC
Permalink
Hi,

On Tue, 21 Feb 2017 20:30:57 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
-----Original Message-----
Blunck Sent: Tuesday, February 21, 2017 7:18 PM
0/8] mbuf: structure reorganization
On Tue, Feb 21, 2017 at 6:26 PM, Ananyev, Konstantin
Post by Olivier Matz
Hi Jan,
-----Original Message-----
Jan Blunck Sent: Tuesday, February 21, 2017 5:05 PM
[dpdk-dev] [RFC 0/8] mbuf: structure reorganization
On Tue, Feb 21, 2017 at 5:38 PM, Bruce Richardson
Post by Bruce Richardson
Post by Jan Blunck
Access through PMD specific function pointers should be
relatively fast on access. Modern architecture optimize that
use case well enough.
The cost of doing a function call per packet to access data
starts to add up very, very fast. For the app, once the data
is written to the mbuf, it should be in the L1 cache, giving
very fast access to it in a few cycles. However, if a function
call has to be made in order to do the read, that makes the
read of that field many times more expensive.
Exactly. Right now the timestamp normalization is done before
writing to each mbuf. Timestamps are usually read at most
once ... if at all.
Well we don't know for sure right?
Someone can argue that there are plenty of scenarios when
other fields might also never be used/updated (rss, vlan, etc).
So, are you suggesting to do normalization later?
If so, then what would be the benefit (data still need to be in mbuf)?
Yes, postponing normalization prevents you from doing unnecessary
work upfront. AFAIK not all NICs store timestamp data OOB, e.g. in
CQ.
Yes, postponing normalization might help a bit (though I don't think
much) in terms of calculations performed inside PMD.
But we still need 8B inside mbuf to store the timestamp value,
either normalized or raw one.
mbufs 1-st cacheline vs 2-nd cacheline
In my opinion, if we have the room in the first cache line, we should
put it there. The only argument I see against is "we may find something
more important in the future, and we won't have room for it in the
first cache line". I don't feel we should penalize today's use cases for
hypothetic future use cases.
Post by Ananyev, Konstantin
2. timestamp normalization point
inside PMD RX vs somewhere later as user needs it (extra
function in dev_ops?).
This point could be changed. My initial proposition tries to provide a
generic API for timestamp. Let me remind it here:

a- the timestamp is in nanosecond
b- the reference is always the same for a given path: if the timestamp
is set in a PMD, all the packets for this PMD will have the same
reference, but for 2 different PMDs (or a sw lib), the reference
would not be the same.

We may remove a-, and just have:
- the reference and the unit are always the same for a given path: if
the timestamp is set in a PMD, all the packets for this PMD will have
the same reference and unit, but for 2 different PMDs (or a sw lib),
they would not be the same.

In both cases, we would need a conversion code (maybe in a library) if
the application wants to work with timestamps from several sources. The
second solution removes the normalization code in the PMD when not
needed, it is probably better.


About having the timestamp in the packet data, I don't think it is
a good solution for a generic API in DPDK. The timestamp is a metadata,
it has to go in the mbuf metadata. The packet data should not be
modified when the timestamp is enabled.

But this would not prevent to have driver-specific features to do that.
In that case, the application will be aware that it is using this
specific driver and that it will receive a timestamp in the packet data.

To summarize, the generic API could be:
- an ethdev API to enable the timestamp in a PMD for received packets
- a mbuf flag "timestamp present"
- a mbuf 64b field to store the timestamp value

Additionally, a driver-specific API can be added for a given PMD.
Example:
- the generic timestamp ethdev is disabled (or not supported)
- a driver-specific feature "put timestamp in packet" is enabled
It would have no additional cost compared to what we have today, since
the timestamp in mbuf is not read/written.



Olivier
Bruce Richardson
2017-02-24 14:21:16 UTC
Permalink
Post by Olivier Matz
Hi,
On Tue, 21 Feb 2017 20:30:57 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
-----Original Message-----
Blunck Sent: Tuesday, February 21, 2017 7:18 PM
0/8] mbuf: structure reorganization
On Tue, Feb 21, 2017 at 6:26 PM, Ananyev, Konstantin
Post by Olivier Matz
Hi Jan,
-----Original Message-----
Jan Blunck Sent: Tuesday, February 21, 2017 5:05 PM
[dpdk-dev] [RFC 0/8] mbuf: structure reorganization
On Tue, Feb 21, 2017 at 5:38 PM, Bruce Richardson
Post by Bruce Richardson
Post by Jan Blunck
Access through PMD specific function pointers should be
relatively fast on access. Modern architecture optimize that
use case well enough.
The cost of doing a function call per packet to access data
starts to add up very, very fast. For the app, once the data
is written to the mbuf, it should be in the L1 cache, giving
very fast access to it in a few cycles. However, if a function
call has to be made in order to do the read, that makes the
read of that field many times more expensive.
Exactly. Right now the timestamp normalization is done before
writing to each mbuf. Timestamps are usually read at most
once ... if at all.
Well we don't know for sure right?
Someone can argue that there are plenty of scenarios when
other fields might also never be used/updated (rss, vlan, etc).
So, are you suggesting to do normalization later?
If so, then what would be the benefit (data still need to be in mbuf)?
Yes, postponing normalization prevents you from doing unnecessary
work upfront. AFAIK not all NICs store timestamp data OOB, e.g. in
CQ.
Yes, postponing normalization might help a bit (though I don't think
much) in terms of calculations performed inside PMD.
But we still need 8B inside mbuf to store the timestamp value,
either normalized or raw one.
mbufs 1-st cacheline vs 2-nd cacheline
In my opinion, if we have the room in the first cache line, we should
put it there. The only argument I see against is "we may find something
more important in the future, and we won't have room for it in the
first cache line". I don't feel we should penalize today's use cases for
hypothetic future use cases.
Post by Ananyev, Konstantin
2. timestamp normalization point
inside PMD RX vs somewhere later as user needs it (extra function in dev_ops?).
This point could be changed. My initial proposition tries to provide a
a- the timestamp is in nanosecond
b- the reference is always the same for a given path: if the timestamp
is set in a PMD, all the packets for this PMD will have the same
reference, but for 2 different PMDs (or a sw lib), the reference
would not be the same.
- the reference and the unit are always the same for a given path: if
the timestamp is set in a PMD, all the packets for this PMD will have
the same reference and unit, but for 2 different PMDs (or a sw lib),
they would not be the same.
In both cases, we would need a conversion code (maybe in a library) if
the application wants to work with timestamps from several sources. The
second solution removes the normalization code in the PMD when not
needed, it is probably better.
About having the timestamp in the packet data, I don't think it is
a good solution for a generic API in DPDK. The timestamp is a metadata,
it has to go in the mbuf metadata. The packet data should not be
modified when the timestamp is enabled.
But this would not prevent to have driver-specific features to do that.
In that case, the application will be aware that it is using this
specific driver and that it will receive a timestamp in the packet data.
- an ethdev API to enable the timestamp in a PMD for received packets
- a mbuf flag "timestamp present"
- a mbuf 64b field to store the timestamp value
Additionally, a driver-specific API can be added for a given PMD.
- the generic timestamp ethdev is disabled (or not supported)
- a driver-specific feature "put timestamp in packet" is enabled
It would have no additional cost compared to what we have today, since
the timestamp in mbuf is not read/written.
All seems reasonable to me.
/Bruce
Jan Blunck
2017-02-28 08:55:57 UTC
Permalink
Post by Olivier Matz
In my opinion, if we have the room in the first cache line, we should
put it there. The only argument I see against is "we may find something
more important in the future, and we won't have room for it in the
first cache line". I don't feel we should penalize today's use cases for
hypothetic future use cases.
Post by Ananyev, Konstantin
2. timestamp normalization point
inside PMD RX vs somewhere later as user needs it (extra
function in dev_ops?).
This point could be changed. My initial proposition tries to provide a
a- the timestamp is in nanosecond
b- the reference is always the same for a given path: if the timestamp
is set in a PMD, all the packets for this PMD will have the same
reference, but for 2 different PMDs (or a sw lib), the reference
would not be the same.
- the reference and the unit are always the same for a given path: if
the timestamp is set in a PMD, all the packets for this PMD will have
the same reference and unit, but for 2 different PMDs (or a sw lib),
they would not be the same.
In both cases, we would need a conversion code (maybe in a library) if
the application wants to work with timestamps from several sources. The
second solution removes the normalization code in the PMD when not
needed, it is probably better.
I agree.
Post by Olivier Matz
About having the timestamp in the packet data, I don't think it is
a good solution for a generic API in DPDK. The timestamp is a metadata,
it has to go in the mbuf metadata. The packet data should not be
modified when the timestamp is enabled.
Good NICs already do that based on the packet type (e.g. NTP/PTP packets).
Post by Olivier Matz
But this would not prevent to have driver-specific features to do that.
In that case, the application will be aware that it is using this
specific driver and that it will receive a timestamp in the packet data.
- an ethdev API to enable the timestamp in a PMD for received packets
- a mbuf flag "timestamp present"
- a mbuf 64b field to store the timestamp value
Additionally, a driver-specific API can be added for a given PMD.
- the generic timestamp ethdev is disabled (or not supported)
- a driver-specific feature "put timestamp in packet" is enabled
It would have no additional cost compared to what we have today, since
the timestamp in mbuf is not read/written.
Thanks for the writeup. This sounds like a reasonable approach to me.

Do you still want to call the 64bit field "timestamp" or rename it to
something neutral and document that it is used together with the mbuf
flags?
Ananyev, Konstantin
2017-02-28 09:05:07 UTC
Permalink
Hi everyone,
Post by Bruce Richardson
Post by Olivier Matz
In my opinion, if we have the room in the first cache line, we should
put it there. The only argument I see against is "we may find something
more important in the future, and we won't have room for it in the
first cache line". I don't feel we should penalize today's use cases for
hypothetic future use cases.
Post by Ananyev, Konstantin
2. timestamp normalization point
inside PMD RX vs somewhere later as user needs it (extra function in dev_ops?).
This point could be changed. My initial proposition tries to provide a
a- the timestamp is in nanosecond
b- the reference is always the same for a given path: if the timestamp
is set in a PMD, all the packets for this PMD will have the same
reference, but for 2 different PMDs (or a sw lib), the reference
would not be the same.
- the reference and the unit are always the same for a given path: if
the timestamp is set in a PMD, all the packets for this PMD will have
the same reference and unit, but for 2 different PMDs (or a sw lib),
they would not be the same.
In both cases, we would need a conversion code (maybe in a library) if
the application wants to work with timestamps from several sources. The
second solution removes the normalization code in the PMD when not
needed, it is probably better.
I agree.
One question - does that mean that application would need to
keep a track from what PMD each particular packet came to do the normalization?
Konstantin
Post by Bruce Richardson
Post by Olivier Matz
About having the timestamp in the packet data, I don't think it is
a good solution for a generic API in DPDK. The timestamp is a metadata,
it has to go in the mbuf metadata. The packet data should not be
modified when the timestamp is enabled.
Good NICs already do that based on the packet type (e.g. NTP/PTP packets).
Post by Olivier Matz
But this would not prevent to have driver-specific features to do that.
In that case, the application will be aware that it is using this
specific driver and that it will receive a timestamp in the packet data.
- an ethdev API to enable the timestamp in a PMD for received packets
- a mbuf flag "timestamp present"
- a mbuf 64b field to store the timestamp value
Additionally, a driver-specific API can be added for a given PMD.
- the generic timestamp ethdev is disabled (or not supported)
- a driver-specific feature "put timestamp in packet" is enabled
It would have no additional cost compared to what we have today, since
the timestamp in mbuf is not read/written.
Thanks for the writeup. This sounds like a reasonable approach to me.
Do you still want to call the 64bit field "timestamp" or rename it to
something neutral and document that it is used together with the mbuf flags?
Olivier Matz
2017-02-28 09:23:59 UTC
Permalink
Hi,

On Tue, 28 Feb 2017 09:05:07 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
Hi everyone,
Post by Bruce Richardson
Post by Olivier Matz
In my opinion, if we have the room in the first cache line, we
should put it there. The only argument I see against is "we may
find something more important in the future, and we won't have
room for it in the first cache line". I don't feel we should
penalize today's use cases for hypothetic future use cases.
Post by Ananyev, Konstantin
2. timestamp normalization point
inside PMD RX vs somewhere later as user needs it (extra
function in dev_ops?).
This point could be changed. My initial proposition tries to
a- the timestamp is in nanosecond
b- the reference is always the same for a given path: if the
timestamp is set in a PMD, all the packets for this PMD will have
the same reference, but for 2 different PMDs (or a sw lib), the
reference would not be the same.
- the reference and the unit are always the same for a given
path: if the timestamp is set in a PMD, all the packets for this
PMD will have the same reference and unit, but for 2 different
PMDs (or a sw lib), they would not be the same.
In both cases, we would need a conversion code (maybe in a
library) if the application wants to work with timestamps from
several sources. The second solution removes the normalization
code in the PMD when not needed, it is probably better.
I agree.
One question - does that mean that application would need to
keep a track from what PMD each particular packet came to do the
normalization? Konstantin
I'd say yes. It does not look very difficult to do, since the mbuf
contains the input port id.
Post by Ananyev, Konstantin
Post by Bruce Richardson
Post by Olivier Matz
About having the timestamp in the packet data, I don't think it is
a good solution for a generic API in DPDK. The timestamp is a
metadata, it has to go in the mbuf metadata. The packet data
should not be modified when the timestamp is enabled.
Good NICs already do that based on the packet type (e.g. NTP/PTP packets).
Post by Olivier Matz
But this would not prevent to have driver-specific features to do
that. In that case, the application will be aware that it is
using this specific driver and that it will receive a timestamp
in the packet data.
- an ethdev API to enable the timestamp in a PMD for received packets
- a mbuf flag "timestamp present"
- a mbuf 64b field to store the timestamp value
Additionally, a driver-specific API can be added for a given PMD.
- the generic timestamp ethdev is disabled (or not supported)
- a driver-specific feature "put timestamp in packet" is enabled
It would have no additional cost compared to what we have today,
since the timestamp in mbuf is not read/written.
Thanks for the writeup. This sounds like a reasonable approach to me.
Do you still want to call the 64bit field "timestamp" or rename it
to something neutral and document that it is used together with the
mbuf flags?
I think timestamp is a good name. In the current RFC patchset, we have
this comment:

/** Valid if PKT_RX_TIMESTAMP is set. The unit is nanoseconds */
uint64_t timestamp;

We could change it to something like:

/** Valid if PKT_RX_TIMESTAMP is set. The unit and time
* reference are not normalized but are always the same
* for a given port.
*/
uint64_t timestamp;


Regards,
Olivier
Jan Blunck
2017-02-28 09:33:25 UTC
Permalink
Post by Olivier Matz
Post by Jan Blunck
Do you still want to call the 64bit field "timestamp" or rename it
to something neutral and document that it is used together with the
mbuf flags?
I think timestamp is a good name. In the current RFC patchset, we have
/** Valid if PKT_RX_TIMESTAMP is set. The unit is nanoseconds */
uint64_t timestamp;
/** Valid if PKT_RX_TIMESTAMP is set. The unit and time
* reference are not normalized but are always the same
* for a given port.
*/
uint64_t timestamp;
Looks good to me.

Thanks,
Jan
Ananyev, Konstantin
2017-02-28 10:29:41 UTC
Permalink
Post by Olivier Matz
Hi,
On Tue, 28 Feb 2017 09:05:07 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
Hi everyone,
Post by Bruce Richardson
Post by Olivier Matz
In my opinion, if we have the room in the first cache line, we
should put it there. The only argument I see against is "we may
find something more important in the future, and we won't have
room for it in the first cache line". I don't feel we should
penalize today's use cases for hypothetic future use cases.
Post by Ananyev, Konstantin
2. timestamp normalization point
inside PMD RX vs somewhere later as user needs it (extra
function in dev_ops?).
This point could be changed. My initial proposition tries to
a- the timestamp is in nanosecond
b- the reference is always the same for a given path: if the
timestamp is set in a PMD, all the packets for this PMD will have
the same reference, but for 2 different PMDs (or a sw lib), the
reference would not be the same.
- the reference and the unit are always the same for a given
path: if the timestamp is set in a PMD, all the packets for this
PMD will have the same reference and unit, but for 2 different
PMDs (or a sw lib), they would not be the same.
In both cases, we would need a conversion code (maybe in a
library) if the application wants to work with timestamps from
several sources. The second solution removes the normalization
code in the PMD when not needed, it is probably better.
I agree.
One question - does that mean that application would need to
keep a track from what PMD each particular packet came to do the
normalization? Konstantin
I'd say yes. It does not look very difficult to do, since the mbuf
contains the input port id.
I understand that we can use mbuf->port here, but it means that we'll
introduce new implicit dependency between timestamp and port values.
From my point that introduces new implications:
1. all PMDs that do set a timestamp would also have to set port value too.
Probably not a big deal as most of PMDs do set port value anyway right now,
but it means it would be hard to get rid/change mbuf->port in future.
2. Applications would not be allowed to change mbuf->port value before normalization is done
(from what I heard some apps do update mbuf->port to store routing decisions).
BTW, how the app would keep track which mbufs were already normalized, and which were not?
3. In theory with eth_dev_detach() - mbuf->port value might be not valid at the point when application
would decide to do normalization.

So to me all that approach with delayed normalization seems unnecessary overcomplicated.
Original one suggested by Olivier, when normalization is done in PMD at RX look
much cleaner and more manageable.
Konstantin
Olivier Matz
2017-02-28 10:50:43 UTC
Permalink
On Tue, 28 Feb 2017 10:29:41 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
Post by Olivier Matz
Hi,
On Tue, 28 Feb 2017 09:05:07 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
Hi everyone,
Post by Bruce Richardson
Post by Olivier Matz
In my opinion, if we have the room in the first cache line, we
should put it there. The only argument I see against is "we
may find something more important in the future, and we won't
have room for it in the first cache line". I don't feel we
should penalize today's use cases for hypothetic future use
cases.
Post by Ananyev, Konstantin
2. timestamp normalization point
inside PMD RX vs somewhere later as user needs it (extra
function in dev_ops?).
This point could be changed. My initial proposition tries to
a- the timestamp is in nanosecond
b- the reference is always the same for a given path: if the
timestamp is set in a PMD, all the packets for this PMD will
have the same reference, but for 2 different PMDs (or a sw
lib), the reference would not be the same.
- the reference and the unit are always the same for a given
path: if the timestamp is set in a PMD, all the packets for
this PMD will have the same reference and unit, but for 2
different PMDs (or a sw lib), they would not be the same.
In both cases, we would need a conversion code (maybe in a
library) if the application wants to work with timestamps from
several sources. The second solution removes the normalization
code in the PMD when not needed, it is probably better.
I agree.
One question - does that mean that application would need to
keep a track from what PMD each particular packet came to do the
normalization? Konstantin
I'd say yes. It does not look very difficult to do, since the mbuf
contains the input port id.
I understand that we can use mbuf->port here, but it means that we'll
introduce new implicit dependency between timestamp and port values.
1. all PMDs that do set a timestamp would also have to set port value too.
Probably not a big deal as most of PMDs do set port value anyway right now,
but it means it would be hard to get rid/change mbuf->port in future.
Currently, all PMDs must set m->port.
If in the future we remove m->port, the applications that use it will need
to store the value in a mbuf metadata, or pass it as arguments through function
calls.
Post by Ananyev, Konstantin
2. Applications would not allowed to change mbuf->port value before normalization is done
(from what I heard some apps do update mbuf->port to store routing decisions).
BTW, how the app would keep track which mbufs were already normalized, and which were not?
I don't think it should be allowed to change m->port value. Applications that
are doing this are responsible for what they change.
Post by Ananyev, Konstantin
3. In theory with eth_dev_detach() - mbuf->port value might be not valid at the point when application
would decide to do normalization.
So to me all that approach with delayed normalization seems unnecessary overcomplicated.
Original one suggested by Olivier, when normalization is done in PMD at RX look
much cleaner and more manageable.
Detaching a device requires a synchronization between control and data plane,
and not only for this use case. In the first solution, the normalization is
partial: unit is nanosecond, but the time reference is different.

So, after the discussion, I'm more convinced by the second solution.


Regards,
Olivier
Ananyev, Konstantin
2017-02-28 11:48:20 UTC
Permalink
Post by Olivier Matz
On Tue, 28 Feb 2017 10:29:41 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
Post by Olivier Matz
Hi,
On Tue, 28 Feb 2017 09:05:07 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
Hi everyone,
Post by Bruce Richardson
Post by Olivier Matz
In my opinion, if we have the room in the first cache line, we
should put it there. The only argument I see against is "we
may find something more important in the future, and we won't
have room for it in the first cache line". I don't feel we
should penalize today's use cases for hypothetic future use
cases.
Post by Ananyev, Konstantin
2. timestamp normalization point
inside PMD RX vs somewhere later as user needs it (extra
function in dev_ops?).
This point could be changed. My initial proposition tries to
a- the timestamp is in nanosecond
b- the reference is always the same for a given path: if the
timestamp is set in a PMD, all the packets for this PMD will
have the same reference, but for 2 different PMDs (or a sw
lib), the reference would not be the same.
- the reference and the unit are always the same for a given
path: if the timestamp is set in a PMD, all the packets for
this PMD will have the same reference and unit, but for 2
different PMDs (or a sw lib), they would not be the same.
In both cases, we would need a conversion code (maybe in a
library) if the application wants to work with timestamps from
several sources. The second solution removes the normalization
code in the PMD when not needed, it is probably better.
I agree.
One question - does that mean that application would need to
keep a track from what PMD each particular packet came to do the
normalization? Konstantin
I'd say yes. It does not look very difficult to do, since the mbuf
contains the input port id.
I understand that we can use mbuf->port here, but it means that we'll
introduce new implicit dependency between timestamp and port values.
1. all PMDs that do set a timestamp would also have to set port value too.
Probably not a big deal as most of PMDs do set port value anyway right now,
but it means it would be hard to get rid/change mbuf->port in future.
Currently, all PMDs must set m->port.
If in the future we remove m->port, the applications that use it will need
to store the value in a mbuf metadata, or pass it as arguments through function
calls.
Post by Ananyev, Konstantin
2. Applications would not allowed to change mbuf->port value before normalization is done
(from what I heard some apps do update mbuf->port to store routing decisions).
BTW, how the app would keep track which mbufs were already normalized, and which were not?
I don't think it should be allowed to change m->port value.
As far as I know it is allowed right now.
PMD RX routine sets mbuf->port, after that application is free to use it
in a way it likes.
What we are introducing here is basically a new dependency between 2
mbuf fields and new restriction.

Another thing that doesn't look very convenient to me here -
We can have 2 different values of timestamp (both normalized and not)
and there is no clear way for the application to know which one is in use right now.
So each app writer would have to come-up with his own solution.
Post by Olivier Matz
Applications that
are doing this are responsible of what they change.
Post by Ananyev, Konstantin
3. In theory with eth_dev_detach() - mbuf->port value might be not valid at the point when application
would decide to do normalization.
So to me all that approach with delayed normalization seems unnecessary overcomplicated.
Original one suggested by Olivier, when normalization is done in PMD at RX look
much cleaner and more manageable.
Detaching a device requires a synchronization between control and data plane,
and not only for this use case.
Of course it does.
But right now it is possible to do:

eth_rx_burst(port=0, ..., &mbuf, 1);
eth_dev_detach(port=0, ...);
...
/*process previously received mbuf */

With what you are proposing it would not always be possible any more.
Post by Olivier Matz
In the first solution, the normalization is
partial: unit is nanosecond, but the time reference is different.
Not sure I get you here...
Konstantin
Post by Olivier Matz
So, after the discussion, I'm more convinced by the second solution.
Regards,
Olivier
Olivier Matz
2017-02-28 12:28:24 UTC
Permalink
On Tue, 28 Feb 2017 11:48:20 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
Post by Olivier Matz
On Tue, 28 Feb 2017 10:29:41 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
Post by Olivier Matz
Hi,
On Tue, 28 Feb 2017 09:05:07 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
Hi everyone,
Post by Bruce Richardson
Post by Olivier Matz
In my opinion, if we have the room in the first cache
line, we should put it there. The only argument I see
against is "we may find something more important in the
future, and we won't have room for it in the first cache
line". I don't feel we should penalize today's use cases
for hypothetic future use cases.
Post by Ananyev, Konstantin
2. timestamp normalization point
inside PMD RX vs somewhere later as user needs it
(extra function in dev_ops?).
This point could be changed. My initial proposition tries
to provide a generic API for timestamp. Let me remind it
a- the timestamp is in nanosecond
b- the reference is always the same for a given path: if
the timestamp is set in a PMD, all the packets for this
PMD will have the same reference, but for 2 different
PMDs (or a sw lib), the reference would not be the same.
- the reference and the unit are always the same for a
given path: if the timestamp is set in a PMD, all the
packets for this PMD will have the same reference and
unit, but for 2 different PMDs (or a sw lib), they would
not be the same.
In both cases, we would need a conversion code (maybe in a
library) if the application wants to work with timestamps
from several sources. The second solution removes the
normalization code in the PMD when not needed, it is
probably better.
I agree.
One question - does that mean that application would need to
keep a track from what PMD each particular packet came to do
the normalization? Konstantin
I'd say yes. It does not look very difficult to do, since the
mbuf contains the input port id.
I understand that we can use mbuf->port here, but it means that
we'll introduce new implicit dependency between timestamp and
1. all PMDs that do set a timestamp would also have to set port
value too. Probably not a big deal as most of PMDs do set port
value anyway right now, but it means it would be hard to get
rid/change mbuf->port in future.
Currently, all PMDs must set m->port.
If in the future we remove m->port, the applications that use it
will need to store the value in a mbuf metadata, or pass it as
arguments through function calls.
Post by Ananyev, Konstantin
2. Applications would not allowed to change mbuf->port value
before normalization is done (from what I heard some apps do
update mbuf->port to store routing decisions). BTW, how the app
would keep track which mbufs were already normalized, and which
were not?
I don't think it should be allowed to change m->port value.
As far as I know it is allowed right now.
PMD RX routine sets mbuf->port, after that application is free to use
it in a way it likes.
The description of m->port is "Input port". If the application stores
something other than the input port, it is its responsibility if it
breaks something else. Like changing any other field to put something
that does not match the description.
Post by Ananyev, Konstantin
What we are introducing here is basically a new dependency between 2
mbuf fields and new restriction.
On the other hand, there is no strong dependency: the API to do the
normalization can take the port as a parameter.
Post by Ananyev, Konstantin
Another thing that doesn't look very convenient to me here -
We can have 2 different values of timestamp (both normalized and not)
and there is no clear way for the application to know which one is in
use right now. So each app writer would have to come-up with his own
solution.
It depends:
- the solution you describe is to have the application storing the
normalized value in its private metadata.
- another solution would be to store the normalized value in
m->timestamp. In this case, we would need a flag to tell if the
timestamp value is normalized.

The problem pointed out by Jan is that doing the timestamp
normalization may take some CPU cycles, even if a small part of packets
requires it.
Post by Ananyev, Konstantin
Post by Olivier Matz
Applications that
are doing this are responsible of what they change.
Post by Ananyev, Konstantin
3. In theory with eth_dev_detach() - mbuf->port value might be
not valid at the point when application would decide to do
normalization.
So to me all that approach with delayed normalization seems
unnecessary overcomplicated. Original one suggested by Olivier,
when normalization is done in PMD at RX look much cleaner and
more manageable.
Detaching a device requires a synchronization between control and
data plane, and not only for this use case.
Of course it does.
eth_rx_burst(port=0, ..., &mbuf, 1);
eth_dev_detach(port=0, ...);
...
/*process previously received mbuf */
With what you are proposing it would be not always possible any more.
With your example, it does not work even without the timestamp feature,
since the mbuf input port would reference an invalid port. This port
is usually used in the application to do a lookup for a port structure,
so it is expected that the entry is valid. It would be even worse if you
do a detach + attach.

So, I think it is already the responsibility of the application to do
the sync (flush retrieved packets before detaching a port).
Post by Ananyev, Konstantin
Post by Olivier Matz
In the first solution, the normalization is
partial: unit is nanosecond, but the time reference is different.
Not sure I get you here...
In the first solution I described, each PMD had to convert its unit
into nanosecond. This is easy because we assume the PMD knows the
value of its clock. But to get a fully normalized value, it also has to
use the same time reference, so we would also need to manage an offset
(we need a new API to give this value to the PMD).

I have another fear related to hardware clocks: if clocks are not
synchronized between PMDs, the simple operation "t * ratio - offset"
won't work. That's why I think we could delegate this job in a specific
library that would manage this.

Having a non-normalized timestamp as of today would allow applications
to take advantage of it for many use cases, even without the
normalization library that could come later (and that may probably
be more complex than expected).


Olivier
Ananyev, Konstantin
2017-02-28 22:53:55 UTC
Permalink
-----Original Message-----
Sent: Tuesday, February 28, 2017 12:28 PM
Subject: Re: [dpdk-dev] [RFC 0/8] mbuf: structure reorganization
On Tue, 28 Feb 2017 11:48:20 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
Post by Olivier Matz
On Tue, 28 Feb 2017 10:29:41 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
Post by Olivier Matz
Hi,
On Tue, 28 Feb 2017 09:05:07 +0000, "Ananyev, Konstantin"
Post by Ananyev, Konstantin
Hi everyone,
Post by Bruce Richardson
Post by Olivier Matz
In my opinion, if we have the room in the first cache
line, we should put it there. The only argument I see
against is "we may find something more important in the
future, and we won't have room for it in the first cache
line". I don't feel we should penalize today's use cases
for hypothetic future use cases.
Post by Ananyev, Konstantin
2. timestamp normalization point
inside PMD RX vs somewhere later as user needs it
(extra function in dev_ops?).
This point could be changed. My initial proposition tries
to provide a generic API for timestamp. Let me remind it
a- the timestamp is in nanosecond
b- the reference is always the same for a given path: if
the timestamp is set in a PMD, all the packets for this
PMD will have the same reference, but for 2 different
PMDs (or a sw lib), the reference would not be the same.
- the reference and the unit are always the same for a
given path: if the timestamp is set in a PMD, all the
packets for this PMD will have the same reference and
unit, but for 2 different PMDs (or a sw lib), they would
not be the same.
In both cases, we would need a conversion code (maybe in a
library) if the application wants to work with timestamps
from several sources. The second solution removes the
normalization code in the PMD when not needed, it is
probably better.
I agree.
One question - does that mean that application would need to
keep a track from what PMD each particular packet came to do
the normalization? Konstantin
I'd say yes. It does not look very difficult to do, since the
mbuf contains the input port id.
I understand that we can use mbuf->port here, but it means that
we'll introduce new implicit dependency between timestamp and
1. all PMDs that do set a timestamp would also have to set port
value too. Probably not a big deal as most of PMDs do set port
value anyway right now, but it means it would be hard to get
rid/change mbuf->port in future.
Currently, all PMDs must set m->port.
If in the future we remove m->port, the applications that use it
will need to store the value in a mbuf metadata, or pass it as
arguments through function calls.
Post by Ananyev, Konstantin
2. Applications would not be allowed to change mbuf->port value
before normalization is done (from what I heard some apps do
update mbuf->port to store routing decisions). BTW, how the app
would keep track which mbufs were already normalized, and which
were not?
I don't think it should be allowed to change m->port value.
As far as I know it is allowed right now.
PMD RX routine sets mbuf->port, after that application is free to use
it in a way it likes.
The descriptor or m->port is "Input port". If the application stores
something other than the input port, it is its responsibility if it
breaks something else. Like changing any other field to put something
that does not match the description.
Post by Ananyev, Konstantin
What we are introducing here is basically a new dependency between 2
mbuf fields and new restriction.
On the other hand, there is no strong dependency: the API to do the
normalization can take the port as a parameter.
Ok, that would be much better - the dependency is still there,
but at least we don't force it.
Post by Ananyev, Konstantin
Another thing that doesn't look very convenient to me here -
We can have 2 different values of timestamp (both normalized and not)
and there is no clear way for the application to know which one is in
use right now. So each app writer would have to come-up with his own
solution.
- the solution you describe is to have the application storing the
normalized value in its private metadata.
- another solution would be to store the normalized value in
m->timestamp. In this case, we would need a flag to tell if the
timestamp value is normalized.
My first thought also was about second flag to specify was timestamp
already normalized or not.
Though I still in doubt - is it all really worth it: extra ol_flag, new function in eth_dev API.
My feeling is that we are trying to overcomplicate things.
The problem pointed out by Jan is that doing the timestamp
normalization may take some CPU cycles, even if a small part of packets
requires it.
I understand that point, but from what I've seen with real example:
http://dpdk.org/ml/archives/dev/2016-October/048810.html
the amount of calculations at RX is pretty small.
I don't think it would affect performance in a noticeable way
(though I don't have any numbers here to prove it).
From the other side, if the user doesn't want a timestamp he can always disable
that feature and save cycles, right?

BTW, you and Jan both mention that not every packet would need a timestamp.
Instead we need sort of a timestamp for the group of packets?
Is that really the only foreseen usage model?
If so, then why not to have a special function that would extract 'latest' timestamp
from the dev?
Or even have tx_burst_extra() that would return the latest timestamp (extra parameter or so).
Then there is no need to put timestamp into mbuf at all.
Post by Ananyev, Konstantin
Post by Olivier Matz
Applications that
are doing this are responsible of what they change.
Post by Ananyev, Konstantin
3. In theory with eth_dev_detach() - mbuf->port value might be
not valid at the point when application would decide to do
normalization.
So to me all that approach with delayed normalization seems
unnecessary overcomplicated. Original one suggested by Olivier,
when normalization is done in PMD at RX look much cleaner and
more manageable.
Detaching a device requires a synchronization between control and
data plane, and not only for this use case.
Of course it does.
eth_rx_burst(port=0, ..., &mbuf, 1);
eth_dev_detach(port=0, ...);
...
/*process previously received mbuf */
With what you are proposing it would be not always possible any more.
With your example, it does not work even without the timestamp feature,
since the mbuf input port would reference an invalid port.
This port is usually used in the application to do a lookup for a port structure,
so it is expected that the entry is valid. It would be even worse if you
do a detach + attach.
I am not talking about the mbuf->port value usage.
Right now user can access/interpret all metadata fields set by PMD RX routines
(vlan, rss hash, ol_flags, ptype, etc.) without needing to access the device data or
call device functions.
With that change it wouldn't be the case anymore.
So, I think it is already the responsibility of the application to do
the sync (flush retrieved packets before detaching a port).
The packets are not in RX or TX queue of detaching device any more.
I received a packet, after that I expect to have all its data and metadata inside mbuf.
So I can store mbufs somewhere and process them much later.
Or might be I would like to pass it to the secondary process for logging/analyzing, etc.
Post by Ananyev, Konstantin
Post by Olivier Matz
In the first solution, the normalization is
partial: unit is nanosecond, but the time reference is different.
Not sure I get you here...
In the first solution I described, each PMD had to convert its unit
into nanosecond. This is easy because we assume the PMD knows the
value of its clock. But to get a fully normalized value, it also has to
use the same time reference, so we would also need to manage an offset
(we need a new API to give this value to the PMD).
Yes, I suppose we do need a start timestamp and some sort of factor() to convert
HW value, something like:

mbuf->timestamp = rxq->start_timestamp + factor(hw_timestamp);

Right?
Why would passing start_timestamp at the configure() phase be a problem?
I have another fear related to hardware clocks: if clocks are not
synchronized between PMDs, the simple operation "t * ratio - offset"
won't work. That's why I think we could delegate this job in a specific
library that would manage this.
But then that library would need to account for all PMDs inside the system,
and be aware of each HW clock skew, etc.
Again, that doesn't sound like a simple task to me.
Having a non-normalized timestamp as of today would allow applications
to take advantage of it for many use cases, even without the
normalization library that could come later (and that may probably
be more complex than expected).
Olivier
Olivier Matz
2017-03-02 16:46:23 UTC
Permalink
Hi Konstantin,
Post by Ananyev, Konstantin
Post by Olivier Matz
Post by Ananyev, Konstantin
Another thing that doesn't look very convenient to me here -
We can have 2 different values of timestamp (both normalized and not)
and there is no clear way for the application to know which one is in
use right now. So each app writer would have to come-up with his own
solution.
- the solution you describe is to have the application storing the
normalized value in its private metadata.
- another solution would be to store the normalized value in
m->timestamp. In this case, we would need a flag to tell if the
timestamp value is normalized.
My first thought also was about second flag to specify was timestamp
already normalized or not.
Though I still in doubt - is it all really worth it: extra ol_flag, new function in eth_dev API.
My feeling that we trying to overcomplicate things.
I don't see what is so complicated. The idea is just to let the
application do the normalization if it is required.

If the time is normalized in nanosecond in the PMD, we would still
need to normalized the time reference (the 0). And for that we'd need
a call to a synchronization code as well.
Post by Ananyev, Konstantin
Post by Olivier Matz
The problem pointed out by Jan is that doing the timestamp
normalization may take some CPU cycles, even if a small part of packets
requires it.
http://dpdk.org/ml/archives/dev/2016-October/048810.html
the amount of calculations at RX is pretty small.
I don't think it would affect performance in a noticeable way
(though I don't have any numbers here to prove it).
I think we can consider by default that adding code in the data path
impacts performance.
Post by Ananyev, Konstantin
From other side, if user doesn't want a timestamp he can always disable
that feature and save cycles, right?
BTW, you and Jan both mention that not every packet would need a timestamp.
Instead we need sort of a timestamp for the group of packets?
I think that for many applications the timestamp should be as precise
as possible for each packet.
Post by Ananyev, Konstantin
Is that really the only foreseen usage model?
No, but it could be one.
Post by Ananyev, Konstantin
If so, then why not to have a special function that would extract 'latest' timestamp
from the dev?
Or even have tx_burst_extra() that would return a latest timestamp (extra parameter or so).
Then there is no need to put timestamp into mbuf at all.
Doing that will give a poor precision for the timestamp.
Post by Ananyev, Konstantin
Post by Olivier Matz
Post by Ananyev, Konstantin
Post by Olivier Matz
Applications that
are doing this are responsible of what they change.
Post by Ananyev, Konstantin
3. In theory with eth_dev_detach() - mbuf->port value might be
not valid at the point when application would decide to do
normalization.
So to me all that approach with delayed normalization seems
unnecessary overcomplicated. Original one suggested by Olivier,
when normalization is done in PMD at RX look much cleaner and
more manageable.
Detaching a device requires a synchronization between control and
data plane, and not only for this use case.
Of course it does.
eth_rx_burst(port=0, ..., &mbuf, 1);
eth_dev_detach(port=0, ...);
...
/*process previously received mbuf */
With what you are proposing it would be not always possible any more.
With your example, it does not work even without the timestamp feature,
since the mbuf input port would reference an invalid port.
This port is usually used in the application to do a lookup for a port structure,
so it is expected that the entry is valid. It would be even worse if you
do a detach + attach.
I am not talking about the mbuf->port value usage.
Right now user can access/interpret all metadata fields set by PMD RX routines
(vlan, rss hash, ol_flags, ptype, etc.) without needing to access the device data or
call device functions.
With that change it wouldn't be the case anymore.
That's the same for some other functions. If in my application I want
to call eth_rx_queue_count(m->port), I will have the same problem.

I think we also have something quite similar in examples/ptpclient:

rte_eth_rx_burst(portid, 0, &m, 1);
...
parse_ptp_frames(portid, m);
...
ptp_data.portid = portid;
...
rte_eth_timesync_read_tx_timestamp(ptp_data->portid, ...)


So, really, I think it's an application issue: when the app deletes
a port, it should ask itself if there are remaining references to
it (m->port).
Post by Ananyev, Konstantin
Post by Olivier Matz
So, I think it is already the responsibility of the application to do
the sync (flush retrieved packets before detaching a port).
The packets are not in RX or TX queue of detaching device any more.
I received a packet, after that I expect to have all its data and metadata inside mbuf.
So I can store mbufs somewhere and process them much later.
Or might be I would like to pass it to the secondary process for logging/analyzing, etc.
Yes, but that's still an app problem for me.
Post by Ananyev, Konstantin
Post by Olivier Matz
Post by Ananyev, Konstantin
Post by Olivier Matz
In the first solution, the normalization is
partial: unit is nanosecond, but the time reference is different.
Not sure I get you here...
In the first solution I described, each PMD had to convert its unit
into nanosecond. This is easy because we assume the PMD knows the
value of its clock. But to get a fully normalized value, it also has to
use the same time reference, so we would also need to manage an offset
(we need a new API to give this value to the PMD).
Yes, I suppose we do need an start timestamp and sort of factor() to convert
mbuf->timestamp = rxq->start_timestamp + factor(hw_timestamp);
Right?
Why passing start_timestamp at the configure() phase will be a problem?
Post by Olivier Matz
I have another fear related to hardware clocks: if clocks are not
synchronized between PMDs, the simple operation "t * ratio - offset"
won't work. That's why I think we could delegate this job in a specific
library that would manage this.
But then that library would need to account for all PMDs inside the system,
and be aware of each HW clock skew, etc.
Again, that doesn't sound like a simple task to me.
Exactly, that's also why I want to let the specialists take care of
it. Having non-normalized timestamps now allow to do the job later
when required, while allowing basic usages as required by metrics
libraries and mlx pmd.



Olivier
Ananyev, Konstantin
2017-03-08 11:11:23 UTC
Permalink
Hi Olivier,
Post by Olivier Matz
Hi Konstantin,
Post by Ananyev, Konstantin
Post by Olivier Matz
Post by Ananyev, Konstantin
Another thing that doesn't look very convenient to me here -
We can have 2 different values of timestamp (both normalized and not)
and there is no clear way for the application to know which one is in
use right now. So each app writer would have to come-up with his own
solution.
- the solution you describe is to have the application storing the
normalized value in its private metadata.
- another solution would be to store the normalized value in
m->timestamp. In this case, we would need a flag to tell if the
timestamp value is normalized.
My first thought also was about second flag to specify was timestamp
already normalized or not.
Though I still in doubt - is it all really worth it: extra ol_flag, new function in eth_dev API.
My feeling that we trying to overcomplicate things.
I don't see what is so complicated. The idea is just to let the
application do the normalization if it is required.
I meant 2 ol_flags and special function just to treat properly one of the mbuf field
seems too much.
Though on second thought maybe 2 ol_flags is not a bad idea -
it gives the PMD writer the freedom to choose to provide a normalized or raw value
on return from rx_burst().
Post by Olivier Matz
If the time is normalized in nanosecond in the PMD, we would still
need to normalized the time reference (the 0). And for that we'd need
a call to a synchronization code as well.
Post by Ananyev, Konstantin
Post by Olivier Matz
The problem pointed out by Jan is that doing the timestamp
normalization may take some CPU cycles, even if a small part of packets
requires it.
http://dpdk.org/ml/archives/dev/2016-October/048810.html
the amount of calculations at RX is pretty small.
I don't think it would affect performance in a noticeable way
(though I don't have any numbers here to prove it).
I think we can consider by default that adding code in the data path
impacts performance.
Post by Ananyev, Konstantin
From other side, if user doesn't want a timestamp he can always disable
that feature and save cycles, right?
BTW, you and Jan both mention that not every packet would need a timestamp.
Instead we need sort of a timestamp for the group of packets?
I think that for many applications the timestamp should be as precise
as possible for each packet.
Post by Ananyev, Konstantin
Is that really the only foreseen usage model?
No, but it could be one.
Post by Ananyev, Konstantin
If so, then why not to have a special function that would extract 'latest' timestamp
from the dev?
Or even have tx_burst_extra() that would return a latest timestamp (extra parameter or so).
Then there is no need to put timestamp into mbuf at all.
Doing that will give a poor precision for the timestamp.
Post by Ananyev, Konstantin
Post by Olivier Matz
Post by Ananyev, Konstantin
Post by Olivier Matz
Applications that
are doing this are responsible of what they change.
Post by Ananyev, Konstantin
3. In theory with eth_dev_detach() - mbuf->port value might be
not valid at the point when application would decide to do
normalization.
So to me all that approach with delayed normalization seems
unnecessary overcomplicated. Original one suggested by Olivier,
when normalization is done in PMD at RX look much cleaner and
more manageable.
Detaching a device requires a synchronization between control and
data plane, and not only for this use case.
Of course it does.
eth_rx_burst(port=0, ..., &mbuf, 1);
eth_dev_detach(port=0, ...);
...
/*process previously received mbuf */
With what you are proposing it would be not always possible any more.
With your example, it does not work even without the timestamp feature,
since the mbuf input port would reference an invalid port.
This port is usually used in the application to do a lookup for a port structure,
so it is expected that the entry is valid. It would be even worse if you
do a detach + attach.
I am not talking about the mbuf->port value usage.
Right now user can access/interpret all metadata fields set by PMD RX routines
(vlan, rss hash, ol_flags, ptype, etc.) without needing to access the device data or
call device functions.
With that change it wouldn't be the case anymore.
That's the same for some other functions. If in my application I want
to call eth_rx_queue_count(m->port), I will have the same problem.
Yes, but here you are trying to get extra information about device/queue based
on port value stored inside mbuf.
I am talking about information that already stored inside particular mbuf itself.
About m->port itself - as I said before my preference would be to remove it at all
(partly because of that implication - we can't guarantee that m->port information
would be valid through the whole mbuf lifetime).
But that's probably subject of another discussion.
Post by Olivier Matz
rte_eth_rx_burst(portid, 0, &m, 1);
...
parse_ptp_frames(portid, m);
...
ptp_data.portid = portid;
...
rte_eth_timesync_read_tx_timestamp(ptp_data->portid, ...)
So, really, I think it's an application issue: when the app deletes
a port, it should ask itself if there are remaining references to
it (m->port).
Hmm, and where in the example below do you see the reference to the m->port?
As far as I can see, what the code above does:
- it deduces portid value from global variable - not from m->port
- saves portid info (not from m->port) inside global variable ptp_data.portid
- later inside same function it used that value to call rte_ethdev functions
(via parse_fup or parse_drsp).

So I am not sure how it relates to the topic we are discussing.

Anyway, to summarize how the proposal looks right now:

1. m->timestamp value after rx_burst() could be either in raw or normalized format.
2. validity of m->timestamp and its format should be determined by 2 ol_flags
(something like: RX_TIMESTAMP, RX_TIMESTAMP_NORM).
3. PMD is free to choose what timestamp value to return (raw/normalized)
4. PMD can provide an optional routine inside devops:
uint64_t dev_ops->timestamp_normalise(uint64_t timestamps);
5. If the user wants to use that function it would be his responsibility to map mbuf
to the port it was received from.

Is that correct?

Thanks
Konstantin
Post by Olivier Matz
Post by Ananyev, Konstantin
Post by Olivier Matz
So, I think it is already the responsibility of the application to do
the sync (flush retrieved packets before detaching a port).
The packets are not in RX or TX queue of detaching device any more.
I received a packet, after that I expect to have all its data and metadata inside mbuf.
So I can store mbufs somewhere and process them much later.
Or might be I would like to pass it to the secondary process for logging/analyzing, etc.
Yes, but that's still an app problem for me.
Post by Ananyev, Konstantin
Post by Olivier Matz
Post by Ananyev, Konstantin
Post by Olivier Matz
In the first solution, the normalization is
partial: unit is nanosecond, but the time reference is different.
Not sure I get you here...
In the first solution I described, each PMD had to convert its unit
into nanosecond. This is easy because we assume the PMD knows the
value of its clock. But to get a fully normalized value, it also has to
use the same time reference, so we would also need to manage an offset
(we need a new API to give this value to the PMD).
Yes, I suppose we do need an start timestamp and sort of factor() to convert
mbuf->timestamp = rxq->start_timestamp + factor(hw_timestamp);
Right?
Why passing start_timestamp at the configure() phase will be a problem?
Post by Olivier Matz
I have another fear related to hardware clocks: if clocks are not
synchronized between PMDs, the simple operation "t * ratio - offset"
won't work. That's why I think we could delegate this job in a specific
library that would manage this.
But then that library would need to account for all PMDs inside the system,
and be aware of each HW clock skew, etc.
Again, that doesn't sound like a simple task to me.
Exactly, that's also why I want to let the specialists take care of
it. Having non-normalized timestamps now allow to do the job later
when required, while allowing basic usages as required by metrics
libraries and mlx pmd.
Olivier
Olivier Matz
2017-03-20 09:00:36 UTC
Permalink
Hi Konstantin,
Post by Bruce Richardson
Hi Olivier,
Post by Olivier Matz
Hi Konstantin,
Post by Ananyev, Konstantin
Post by Olivier Matz
Post by Ananyev, Konstantin
Another thing that doesn't look very convenient to me here -
We can have 2 different values of timestamp (both normalized and not)
and there is no clear way for the application to know which one is in
use right now. So each app writer would have to come-up with his own
solution.
- the solution you describe is to have the application storing the
normalized value in its private metadata.
- another solution would be to store the normalized value in
m->timestamp. In this case, we would need a flag to tell if the
timestamp value is normalized.
My first thought also was about second flag to specify was timestamp
already normalized or not.
Though I still in doubt - is it all really worth it: extra ol_flag, new function in eth_dev API.
My feeling that we trying to overcomplicate things.
I don't see what is so complicated. The idea is just to let the
application do the normalization if it is required.
I meant 2 ol_flags and special function just to treat properly one of the mbuf field
seems too much.
Though after second thought might be 2 ol_flags is not a bad idea -
it gives PMD writer a freedom to choose provide a normalized or raw value
on return from rx_burst().
I don't see a real advantage now, but I think this is something that
could be added once we have the normalization code.
Post by Bruce Richardson
Post by Olivier Matz
If the time is normalized in nanosecond in the PMD, we would still
need to normalized the time reference (the 0). And for that we'd need
a call to a synchronization code as well.
Post by Ananyev, Konstantin
Post by Olivier Matz
The problem pointed out by Jan is that doing the timestamp
normalization may take some CPU cycles, even if a small part of packets
requires it.
http://dpdk.org/ml/archives/dev/2016-October/048810.html
the amount of calculations at RX is pretty small.
I don't think it would affect performance in a noticeable way
(though I don't have any numbers here to prove it).
I think we can consider by default that adding code in the data path
impacts performance.
Post by Ananyev, Konstantin
From other side, if user doesn't want a timestamp he can always disable
that feature and save cycles, right?
BTW, you and Jan both mention that not every packet would need a timestamp.
Instead we need sort of a timestamp for the group of packets?
I think that for many applications the timestamp should be as precise
as possible for each packet.
Post by Ananyev, Konstantin
Is that really the only foreseen usage model?
No, but it could be one.
Post by Ananyev, Konstantin
If so, then why not to have a special function that would extract 'latest' timestamp
from the dev?
Or even have tx_burst_extra() that would return a latest timestamp (extra parameter or so).
Then there is no need to put timestamp into mbuf at all.
Doing that will give a poor precision for the timestamp.
Post by Ananyev, Konstantin
Post by Olivier Matz
Post by Ananyev, Konstantin
Post by Olivier Matz
Applications that
are doing this are responsible of what they change.
Post by Ananyev, Konstantin
3. In theory with eth_dev_detach() - mbuf->port value might be
not valid at the point when application would decide to do
normalization.
So to me all that approach with delayed normalization seems
unnecessary overcomplicated. Original one suggested by Olivier,
when normalization is done in PMD at RX look much cleaner and
more manageable.
Detaching a device requires a synchronization between control and
data plane, and not only for this use case.
Of course it does.
eth_rx_burst(port=0, ..., &mbuf, 1);
eth_dev_detach(port=0, ...);
...
/*process previously received mbuf */
With what you are proposing it would be not always possible any more.
With your example, it does not work even without the timestamp feature,
since the mbuf input port would reference an invalid port.
This port is usually used in the application to do a lookup for a port structure,
so it is expected that the entry is valid. It would be even worse if you
do a detach + attach.
I am not talking about the mbuf->port value usage.
Right now user can access/interpret all metadata fields set by PMD RX routines
(vlan, rss hash, ol_flags, ptype, etc.) without needing to access the device data or
call device functions.
With that change it wouldn't be the case anymore.
That's the same for some other functions. If in my application I want
to call eth_rx_queue_count(m->port), I will have the same problem.
Yes, but here you are trying to get extra information about device/queue based
on port value stored inside mbuf.
I am talking about information that already stored inside particular mbuf itself.
About m->port itself - as I said before my preference would be to remove it at all
(partly because of that implication - we can't guarantee that m->port information
would be valid through the whole mbuf lifetime).
But that's probably subject of another discussion.
Post by Olivier Matz
rte_eth_rx_burst(portid, 0, &m, 1);
...
parse_ptp_frames(portid, m);
...
ptp_data.portid = portid;
...
rte_eth_timesync_read_tx_timestamp(ptp_data->portid, ...)
So, really, I think it's an application issue: when the app deletes
a port, it should ask itself if there are remaining references to
it (m->port).
Hmm, and where in the example below do you see the reference to the m->port?
- it deduces portid value from global variable - not from m->port
- saves portid info (not from m->port) inside global variable ptp_data.portid
- later inside same function it used that value to call rte_ethdev functions
(via parse_fup or parse_drsp).
So I am not sure how it relates to the topic we are discussing.
It's similar to what I proposed for the timestamp normalization: for both
functions, you need to call an ethdev function with a port_id as a parameter.
Either you get the port from the mbuf (this is my initial suggestion that you
don't like), either you know it because you retrieved your mbuf with
rte_eth_rx_burst(port_id, ...) (this is what is done in examples/ptpclient).

So, do you still see an issue with having a function to normalize/synchronize
the timestamp that takes a port id as a parameter?
Post by Bruce Richardson
1. m->timestamp value after rx_burst() could be either in raw or normalized format.
2. validity of m->timestamp and its format should be determined by 2 ol_flags
(something like: RX_TIMESTAMP, RX_TIMESTAMP_NORM).
3. PMD is free to choose what timestamp value to return (raw/normalized)
I think it needs to be raw now, because we don't have any normalization code
at the moment. Maybe we could add a "normalized" flag if it makes sense in
the future, once we have decided what normalized means, in a context where several
PMDs/libs can have their own timestamp.

But once we have a clear definition of what normalized means + an example of
normalization code, we may have this NORM flag.
Post by Bruce Richardson
uint64_t dev_ops->timestamp_normalise(uint64_t timestamps);
I think (but I'm not sure, it's really out of scope of this patchset),
that the timestamp synchronization API will be more complex than that.

My current idea:

- a rte_timestamp library holds the normalization code
- we decide, for instance, that "normalized" means:
- unit: nanosecond
- based on system clock
- reference: 0 = time when rte_timestamp_init() was called
- the PMD provides an API to get its clock
- the lib provides something like:
uint64_t rte_timestamp_normalize(unsigned int port_id, uint64_t timestamp)
Post by Bruce Richardson
5. If the user wants to use that function it would be his responsibility to map mbuf
to the port it was received from.
Yes, if the application uses a port_id, it's its responsibility to ensure
that this port exists.



Regards,
Olivier
Ananyev, Konstantin
2017-03-22 17:42:12 UTC
Permalink
Hi Olivier,
Post by Olivier Matz
Post by Ananyev, Konstantin
Post by Olivier Matz
Post by Ananyev, Konstantin
Post by Olivier Matz
Post by Ananyev, Konstantin
Another thing that doesn't look very convenient to me here -
We can have 2 different values of timestamp (both normalized and not)
and there is no clear way for the application to know which one is in
use right now. So each app writer would have to come-up with his own
solution.
- the solution you describe is to have the application storing the
normalized value in its private metadata.
- another solution would be to store the normalized value in
m->timestamp. In this case, we would need a flag to tell if the
timestamp value is normalized.
My first thought also was about second flag to specify was timestamp
already normalized or not.
Though I still in doubt - is it all really worth it: extra ol_flag, new function in eth_dev API.
My feeling that we trying to overcomplicate things.
I don't see what is so complicated. The idea is just to let the
application do the normalization if it is required.
I meant 2 ol_flags and special function just to treat properly one of the mbuf field
seems too much.
Though after second thought might be 2 ol_flags is not a bad idea -
it gives PMD writer a freedom to choose provide a normalized or raw value
on return from rx_burst().
I don't see a real advantage now, but I think this is something that
could be added once we have the normalization code.
Post by Ananyev, Konstantin
Post by Olivier Matz
If the time is normalized in nanosecond in the PMD, we would still
need to normalized the time reference (the 0). And for that we'd need
a call to a synchronization code as well.
Post by Ananyev, Konstantin
Post by Olivier Matz
The problem pointed out by Jan is that doing the timestamp
normalization may take some CPU cycles, even if a small part of packets
requires it.
http://dpdk.org/ml/archives/dev/2016-October/048810.html
the amount of calculations at RX is pretty small.
I don't think it would affect performance in a noticeable way
(though I don't have any numbers here to prove it).
I think we can consider by default that adding code in the data path
impacts performance.
Post by Ananyev, Konstantin
From other side, if user doesn't want a timestamp he can always disable
that feature and save cycles, right?
BTW, you and Jan both mention that not every packet would need a timestamp.
Instead we need sort of a timestamp for the group of packets?
I think that for many applications the timestamp should be as precise
as possible for each packet.
Post by Ananyev, Konstantin
Is that really the only foreseen usage model?
No, but it could be one.
Post by Ananyev, Konstantin
If so, then why not to have a special function that would extract 'latest' timestamp
from the dev?
Or even have tx_burst_extra() that would return a latest timestamp (extra parameter or so).
Then there is no need to put timestamp into mbuf at all.
Doing that will give a poor precision for the timestamp.
Post by Ananyev, Konstantin
Post by Olivier Matz
Post by Ananyev, Konstantin
Post by Olivier Matz
Applications that
are doing this are responsible of what they change.
Post by Ananyev, Konstantin
3. In theory with eth_dev_detach() - mbuf->port value might be
not valid at the point when application would decide to do
normalization.
So to me all that approach with delayed normalization seems
unnecessary overcomplicated. Original one suggested by Olivier,
when normalization is done in PMD at RX look much cleaner and
more manageable.
Detaching a device requires a synchronization between control and
data plane, and not only for this use case.
Of course it does.
eth_rx_burst(port=0, ..., &mbuf, 1);
eth_dev_detach(port=0, ...);
...
/*process previously received mbuf */
With what you are proposing it would be not always possible any more.
With your example, it does not work even without the timestamp feature,
since the mbuf input port would reference an invalid port.
This port is usually used in the application to do a lookup for a port structure,
so it is expected that the entry is valid. It would be even worse if you
do a detach + attach.
I am not talking about the mbuf->port value usage.
Right now user can access/interpret all metadata fields set by PMD RX routines
(vlan, rss hash, ol_flags, ptype, etc.) without need to accessing the device data or
calling device functions.
With that change it wouldn't be the case anymore.
That's the same for some other functions. If in my application I want
to call eth_rx_queue_count(m->port), I will have the same problem.
Yes, but here you are trying to get extra information about device/queue based
on port value stored inside mbuf.
I am talking about information that already stored inside particular mbuf itself.
About m->port itself - as I said before my preference would be to remove it at all
(partly because of that implication - we can't guarantee that m->port information
would be valid though all mbuf lifetime).
But that's probably subject of another discussion.
Post by Olivier Matz
rte_eth_rx_burst(portid, 0, &m, 1);
...
parse_ptp_frames(portid, m);
...
ptp_data.portid = portid;
...
rte_eth_timesync_read_tx_timestamp(ptp_data->portid, ...)
So, really, I think it's an application issue: when the app deletes
a port, it should ask itself if there are remaining references to
it (m->port).
Hmm, and where in the example below do you see the reference to the m->port?
- it deduces portid value from global variable - not from m->port
- saves portid info (not from m->port) inside global variable ptp_data.portid
- later inside same function it used that value to call rte_ethdev functions
(via parse_fup or parse_drsp).
So I am not sure how it relates to the topic we are discussing.
It's similar to what I proposed for the timestamp normalization: for both
functions, you need to call an ethdev function with a port_id as a parameter.
Either you get the port from the mbuf (this is my initial suggestion that you
don't like), either you know it because you retrieved your mbuf with
rte_eth_rx_burst(port_id, ...) (this is what is done in examples/ptpclient).
So, do you still see an issue with having a function to normalize/synchronize
the timestamp that takes a port id as a parameter?
If it hasn't rely on mbuf->port value, then probably not.
Post by Olivier Matz
Post by Ananyev, Konstantin
1. m->timestamp value after rx_burst() could be either in raw or normalized format.
2. validity of m->timestamp and its format should be determined by 2 ol_flags
(something like: RX_TIMESTAMP, RX_TIMESTAMP_NORM).
3. PMD is free to choose what timestamp value to return (raw/normalized)
I think it needs to be raw now, because we don't have any normalization code
at the moment. Maybe we could add a "normalized" flag if it makes sense in
the future, once we have decided what normalized means, in a context where several
PMDs/libs can have their own timestamp.
But once we have a clear definition of what normalized means + an example of
normalization code, we may have this NORM flag.
Post by Ananyev, Konstantin
uint64_t dev_ops->timestamp_normalise(uint64_t timestamps);
I think (but I'm not sure, it's really out of scope of this patchset),
that the timestamp synchronization API will be more complex than that.
- a rte_timestamp library holds the normalization code
- unit: nanosecond
- based on system clock
- reference: 0 = time when rte_timestamp_init() was called
- the PMD provides an API to get its clock
uint64_t rte_timestamp_normalize(unsigned int port_id, uint64_t timestamp)
Post by Ananyev, Konstantin
5. If the user wants to use that function it would be his responsibility to map mbuf
to the port it was received from.
Yes, if the application uses a port_id, it's its responsibility to ensure
that this port exists.
Ok, so for 17.05 we'll have:
- raw timestamp value inside mbuf
- ol_flag bit to represent whether the mbuf->timestamp value is valid or not.
That's it, correct?

Konstantin
Jerin Jacob
2017-03-24 08:35:04 UTC
Permalink
Post by Bruce Richardson
Hi Olivier,
Post by Olivier Matz
Post by Ananyev, Konstantin
Post by Olivier Matz
Post by Ananyev, Konstantin
Another thing that doesn't look very convenient to me here -
We can have 2 different values of timestamp (both normalized and not)
and there is no clear way for the application to know which one is in
use right now. So each app writer would have to come-up with his own
solution.
- the solution you describe is to have the application storing the
normalized value in its private metadata.
- another solution would be to store the normalized value in
m->timestamp. In this case, we would need a flag to tell if the
uint64_t dev_ops->timestamp_normalise(uint64_t timestamps);
I think (but I'm not sure, it's really out of scope of this patchset),
that the timestamp synchronization API will be more complex than that.
- a rte_timestamp library holds the normalization code
- unit: nanosecond
- based on system clock
- reference: 0 = time when rte_timestamp_init() was called
- the PMD provides an API to get its clock
uint64_t rte_timestamp_normalize(unsigned int port_id, uint64_t timestamp)
Post by Ananyev, Konstantin
5. If the user wants to use that function it would be his responsibility to map mbuf
to the port it was received from.
Yes, if the application uses a port_id, it's its responsibility to ensure
that this port exists.
- raw timestamp value inside mbuf
- ol_flag bit to represent whether the mbuf->timestamp value is valid or not.
That's it, correct?
Hi Olivier,

The ARM alignment fix also will be part of the v17.05. Right?
Post by Bruce Richardson
Konstantin
Olivier Matz
2017-03-24 13:35:15 UTC
Permalink
Hi Jerin,
Post by Bruce Richardson
Post by Bruce Richardson
Hi Olivier,
Post by Olivier Matz
Post by Ananyev, Konstantin
Post by Olivier Matz
Post by Ananyev, Konstantin
Another thing that doesn't look very convenient to me here -
We can have 2 different values of timestamp (both normalized and not)
and there is no clear way for the application to know which one is in
use right now. So each app writer would have to come-up with his own
solution.
- the solution you describe is to have the application storing the
normalized value in its private metadata.
- another solution would be to store the normalized value in
m->timestamp. In this case, we would need a flag to tell if the
uint64_t dev_ops->timestamp_normalise(uint64_t timestamps);
I think (but I'm not sure, it's really out of scope of this patchset),
that the timestamp synchronization API will be more complex than that.
- a rte_timestamp library holds the normalization code
- unit: nanosecond
- based on system clock
- reference: 0 = time when rte_timestamp_init() was called
- the PMD provides an API to get its clock
uint64_t rte_timestamp_normalize(unsigned int port_id, uint64_t timestamp)
Post by Ananyev, Konstantin
5. If the user wants to use that function it would be his responsibility to map mbuf
to the port it was received from.
Yes, if the application uses a port_id, it's its responsibility to ensure
that this port exists.
- raw timestamp value inside mbuf
- ol_flag bit to represent whether the mbuf->timestamp value is valid or not.
That's it, correct?
Hi Olivier,
The ARM alignment fix also will be part of the v17.05. Right?
It's in this patchset, planned for v17.05.
From what I see, there is no strong opposition to the patchset, so
it should go in.
(note, the v1 is here: http://dpdk.org/ml/archives/dev/2017-March/059693.html)

If you (and others) have time to test or review, that may help to
integrate it faster.

Regards,
Olivier
Jan Blunck
2017-02-28 09:25:04 UTC
Permalink
On Tue, Feb 28, 2017 at 10:05 AM, Ananyev, Konstantin
Post by Ananyev, Konstantin
Post by Bruce Richardson
Post by Olivier Matz
In both cases, we would need a conversion code (maybe in a library) if
the application wants to work with timestamps from several sources. The
second solution removes the normalization code in the PMD when not
needed, it is probably better.
I agree.
One question - does that mean that application would need to
keep a track from what PMD each particular packet came to do the normalization?
Yes. You usually do this based on mbuf->port.
Ananyev, Konstantin
2017-02-19 23:45:53 UTC
Permalink
Hi Olivier,
Post by Olivier Matz
Post by Ananyev, Konstantin
- m->next, m->nb_segs, and m->refcnt are always initialized for
mbufs in the pool, avoiding the need of setting m->next (located in
the 2nd cache line) in the Rx path for mono-segment packets.
- change port and nb_segs to 16 bits
Not that I am completely against it,
but changing nb_segs to 16 bits seems like an overkill to me.
I think we can keep and extra 8bits for something more useful in future.
In my case, I use the m->next field to chain more than 256 segments for
L4 socket buffers. It also updates nb_seg that can overflow. It's not
a big issue since at the end, nb_seg is decremented for each segment.
On the other hand, if I enable some sanity checks on mbufs, it
complains because the number of segments is not equal to nb_seg.
I still have a hard time imagining some practical use-case for a packet
that spans over 256 segments.
But might that is just my poor imagination, so ok if you guys believe
there are some real-world use cases - I wouldn't object here.
Konstantin
Morten Brørup
2017-02-21 09:22:57 UTC
Permalink
Hi Olivier,

There has been a lot of debate about the size and location (1st or 2nd cache line) of the mbuf fields.

May I suggest that you add a comment to each field briefly mentioning why it has a specific size (8 or 16 bits) (e.g. by example use case), and optionally also why it deserves to be in the first cache line. (A comment to the mbuf structure can say that fields manipulated by PMDs on ingress generally belong in the 1st cache line.)

It's not necessary for all fields, just the non-obvious ones (the ones discussed here on the mailing list). It will make the information more easily accessible to avoid repeating the same discussions in the future.

E.g. the port field is 16 bits because a hypervisor can serve more than 256 virtual machines. And it is in the 1st cache line because a PMD for a multi-port NIC with a shared ingress queue needs to set it on ingress.

And the refcnt field has the same size as the port field to support L3 multicast and L2 port flooding on all ports.


Med venlig
Olivier MATZ
2017-02-21 09:54:06 UTC
Permalink
Hi Morten,

On Tue, 21 Feb 2017 10:22:57 +0100, Morten Brørup
Post by Bruce Richardson
Hi Olivier,
There has been a lot of debate about the size and location (1st or
2nd cache line) of the mbuf fields.
May I suggest that you add a comment to each field briefly mentioning
why it has a specific size (8 or 16 bits) (e.g. by example use case),
and optionally also why it deserves to be in the first cache line. (A
comment to the mbuf structure can say that fields manipulated by PMDs
on ingress generally belong in the 1st cache line.)
It's not necessary for all fields, just the non-obvious ones (the
ones discussed here on the mailing list). It will make the
information more easily accessible to avoid repeating the same
discussions in the future.
E.g. the port field is 16 bits because a hypervisor can serve more
than 256 virtual machines. And it is in the 1st cache line because a
PMD for a multi-port NIC with a shared ingress queue needs to set it
on ingress.
And the refcnt field has the same size as the port field to support
L3 multicast and L2 port flooding on all ports.
I understand your point about not rediscussing things several times. I
don't think having a comment for fields is really relevant because it
would add many info that are not useful for the user of the structure,
but I think adding something in the API documentation of the rte_mbuf
structure itself makes sense to me.

I'll add something in the next version of the patch.

Thanks,
Olivier
Olivier Matz
2017-03-08 09:41:52 UTC
Permalink
Based on discussions done in [1] and in this thread, this patchset reorganizes
the mbuf.

The main changes are:
- reorder structure to increase vector performance on some non-ia
platforms.
- add a 64bits timestamp field in the 1st cache line. This timestamp
is not normalized, i.e. no unit or time reference is enforced. A
library may be added to do this job in the future.
- m->next, m->nb_segs, and m->refcnt are always initialized for mbufs
in the pool, avoiding the need of setting m->next (located in the
2nd cache line) in the Rx path for mono-segment packets.
- change port and nb_segs to 16 bits
- move seqn in the 2nd cache line

Things discussed but not done in the patchset:
- move refcnt and nb_segs to the 2nd cache line: many drivers sets
them in the Rx path, so it could introduce a performance regression, or
it would require to change all the drivers, which is not an easy task.
- remove the m->port field: too much impact on many examples and libraries,
and some people highlighted they are using it.
- moving m->next in the 1st cache line: there is not enough room, and having
it set to NULL for unused mbuf should remove the need for it.
- merge seqn and timestamp together in a union: we could imagine use cases
were both are activated. There is no flag indicating the presence of seqn,
so it looks preferable to keep them separated for now.

I made some basic performance tests (ixgbe) and see no regression.
Other tests from NIC vendors are welcome.

Once this patchset is pushed, the Rx path of drivers could be optimized a bit,
by removing writes to m->next, m->nb_segs and m->refcnt. The patch 4/8 gives an
idea of what could be done.

[1] http://dpdk.org/ml/archives/dev/2016-October/049338.html

rfc->v1:
- fix reset of mbuf fields in case of indirect mbuf in rte_pktmbuf_prefree_seg()
- do not enforce a unit or time reference for m->timestamp
- reorganize fields to make vlan and outer vlan consecutive
- enhance documentation of m->refcnt and m->port to explain why they are 16bits

Jerin Jacob (1):
mbuf: make rearm data address naturally aligned

Olivier Matz (8):
mbuf: make segment prefree function public
mbuf: make raw free function public
mbuf: set mbuf fields while in pool
drivers/net: don't touch mbuf next or nb segs on Rx
mbuf: use 2 bytes for port and nb segments
mbuf: move sequence number in second cache line
mbuf: add a timestamp field
mbuf: reorder VLAN tci and buffer len fields

app/test-pmd/csumonly.c | 4 +-
drivers/net/ena/ena_ethdev.c | 2 +-
drivers/net/enic/enic_rxtx.c | 2 +-
drivers/net/fm10k/fm10k_rxtx.c | 6 +-
drivers/net/fm10k/fm10k_rxtx_vec.c | 9 +-
drivers/net/i40e/i40e_rxtx_vec_common.h | 6 +-
drivers/net/i40e/i40e_rxtx_vec_sse.c | 11 +-
drivers/net/ixgbe/ixgbe_rxtx.c | 10 +-
drivers/net/ixgbe/ixgbe_rxtx_vec_common.h | 6 +-
drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 9 --
drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c | 9 --
drivers/net/mlx5/mlx5_rxtx.c | 11 +-
drivers/net/mpipe/mpipe_tilegx.c | 3 +-
drivers/net/null/rte_eth_null.c | 2 -
drivers/net/virtio/virtio_rxtx.c | 4 -
drivers/net/virtio/virtio_rxtx_simple.h | 6 +-
.../linuxapp/eal/include/exec-env/rte_kni_common.h | 5 +-
lib/librte_mbuf/rte_mbuf.c | 4 +
lib/librte_mbuf/rte_mbuf.h | 123 ++++++++++++++++-----
19 files changed, 130 insertions(+), 102 deletions(-)
--
2.8.1
Olivier Matz
2017-03-08 09:41:54 UTC
Permalink
Rename __rte_mbuf_raw_free() as rte_mbuf_raw_free() and make
it public. The old function is kept for compat but is marked as
deprecated.

The next commit changes the behavior of rte_mbuf_raw_free() to
make it more consistent with rte_mbuf_raw_alloc().

Signed-off-by: Olivier Matz <***@6wind.com>
---
drivers/net/ena/ena_ethdev.c | 2 +-
drivers/net/mlx5/mlx5_rxtx.c | 6 +++---
drivers/net/mpipe/mpipe_tilegx.c | 2 +-
lib/librte_mbuf/rte_mbuf.h | 22 ++++++++++++++++------
4 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ena/ena_ethdev.c b/drivers/net/ena/ena_ethdev.c
index b5e6db6..5dd44d7 100644
--- a/drivers/net/ena/ena_ethdev.c
+++ b/drivers/net/ena/ena_ethdev.c
@@ -680,7 +680,7 @@ static void ena_rx_queue_release_bufs(struct ena_ring *ring)
ring->rx_buffer_info[ring->next_to_clean & ring_mask];

if (m)
- __rte_mbuf_raw_free(m);
+ rte_mbuf_raw_free(m);

ring->next_to_clean++;
}
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 88b0354..41a5bb2 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1399,7 +1399,7 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
assert(pkt != (*rxq->elts)[idx]);
rep = NEXT(pkt);
rte_mbuf_refcnt_set(pkt, 0);
- __rte_mbuf_raw_free(pkt);
+ rte_mbuf_raw_free(pkt);
pkt = rep;
}
break;
@@ -1410,13 +1410,13 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
&rss_hash_res);
if (!len) {
rte_mbuf_refcnt_set(rep, 0);
- __rte_mbuf_raw_free(rep);
+ rte_mbuf_raw_free(rep);
break;
}
if (unlikely(len == -1)) {
/* RX error, packet is likely too large. */
rte_mbuf_refcnt_set(rep, 0);
- __rte_mbuf_raw_free(rep);
+ rte_mbuf_raw_free(rep);
++rxq->stats.idropped;
goto skip;
}
diff --git a/drivers/net/mpipe/mpipe_tilegx.c b/drivers/net/mpipe/mpipe_tilegx.c
index 60d5f81..536b8ea 100644
--- a/drivers/net/mpipe/mpipe_tilegx.c
+++ b/drivers/net/mpipe/mpipe_tilegx.c
@@ -558,7 +558,7 @@ mpipe_recv_flush_stack(struct mpipe_dev_priv *priv)
mbuf->data_len = 0;
mbuf->pkt_len = 0;

- __rte_mbuf_raw_free(mbuf);
+ rte_mbuf_raw_free(mbuf);
}
}

diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index b61c430..575dc9d 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -790,20 +790,30 @@ static inline struct rte_mbuf *rte_mbuf_raw_alloc(struct rte_mempool *mp)
}

/**
- * @internal Put mbuf back into its original mempool.
- * The use of that function is reserved for RTE internal needs.
- * Please use rte_pktmbuf_free().
+ * Put mbuf back into its original mempool.
+ *
+ * The caller must ensure that the mbuf is direct and that the
+ * reference counter is 0.
*
* @param m
* The mbuf to be freed.
*/
static inline void __attribute__((always_inline))
-__rte_mbuf_raw_free(struct rte_mbuf *m)
+rte_mbuf_raw_free(struct rte_mbuf *m)
{
+ RTE_ASSERT(RTE_MBUF_DIRECT(m));
RTE_ASSERT(rte_mbuf_refcnt_read(m) == 0);
rte_mempool_put(m->pool, m);
}

+/* compat with older versions */
+__rte_deprecated
+static inline void __attribute__((always_inline))
+__rte_mbuf_raw_free(struct rte_mbuf *m)
+{
+ rte_mbuf_raw_free(m);
+}
+
/* Operations on ctrl mbuf */

/**
@@ -1210,7 +1220,7 @@ static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
m->ol_flags = 0;

if (rte_mbuf_refcnt_update(md, -1) == 0)
- __rte_mbuf_raw_free(md);
+ rte_mbuf_raw_free(md);
}

/**
@@ -1265,7 +1275,7 @@ rte_pktmbuf_free_seg(struct rte_mbuf *m)
m = rte_pktmbuf_prefree_seg(m);
if (likely(m != NULL)) {
m->next = NULL;
- __rte_mbuf_raw_free(m);
+ rte_mbuf_raw_free(m);
}
}
--
2.8.1
Olivier Matz
2017-03-08 09:41:53 UTC
Permalink
Document the function and make it public, since it is used at several
places in the drivers. The old one is marked as deprecated.

Signed-off-by: Olivier Matz <***@6wind.com>
---
drivers/net/enic/enic_rxtx.c | 2 +-
drivers/net/fm10k/fm10k_rxtx.c | 6 +++---
drivers/net/fm10k/fm10k_rxtx_vec.c | 6 +++---
drivers/net/i40e/i40e_rxtx_vec_common.h | 6 +++---
drivers/net/ixgbe/ixgbe_rxtx.c | 2 +-
drivers/net/ixgbe/ixgbe_rxtx_vec_common.h | 6 +++---
drivers/net/virtio/virtio_rxtx_simple.h | 6 +++---
lib/librte_mbuf/rte_mbuf.h | 30 +++++++++++++++++++++++++++---
8 files changed, 44 insertions(+), 20 deletions(-)

diff --git a/drivers/net/enic/enic_rxtx.c b/drivers/net/enic/enic_rxtx.c
index 343dabc..1ee5cbb 100644
--- a/drivers/net/enic/enic_rxtx.c
+++ b/drivers/net/enic/enic_rxtx.c
@@ -473,7 +473,7 @@ static inline void enic_free_wq_bufs(struct vnic_wq *wq, u16 completed_index)
pool = ((struct rte_mbuf *)buf->mb)->pool;
for (i = 0; i < nb_to_free; i++) {
buf = &wq->bufs[tail_idx];
- m = __rte_pktmbuf_prefree_seg((struct rte_mbuf *)(buf->mb));
+ m = rte_pktmbuf_prefree_seg((struct rte_mbuf *)(buf->mb));
buf->mb = NULL;

if (unlikely(m == NULL)) {
diff --git a/drivers/net/fm10k/fm10k_rxtx.c b/drivers/net/fm10k/fm10k_rxtx.c
index 144e5e6..c9bb04a 100644
--- a/drivers/net/fm10k/fm10k_rxtx.c
+++ b/drivers/net/fm10k/fm10k_rxtx.c
@@ -434,12 +434,12 @@ static inline void tx_free_bulk_mbuf(struct rte_mbuf **txep, int num)
if (unlikely(num == 0))
return;

- m = __rte_pktmbuf_prefree_seg(txep[0]);
+ m = rte_pktmbuf_prefree_seg(txep[0]);
if (likely(m != NULL)) {
free[0] = m;
nb_free = 1;
for (i = 1; i < num; i++) {
- m = __rte_pktmbuf_prefree_seg(txep[i]);
+ m = rte_pktmbuf_prefree_seg(txep[i]);
if (likely(m != NULL)) {
if (likely(m->pool == free[0]->pool))
free[nb_free++] = m;
@@ -455,7 +455,7 @@ static inline void tx_free_bulk_mbuf(struct rte_mbuf **txep, int num)
rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
} else {
for (i = 1; i < num; i++) {
- m = __rte_pktmbuf_prefree_seg(txep[i]);
+ m = rte_pktmbuf_prefree_seg(txep[i]);
if (m != NULL)
rte_mempool_put(m->pool, m);
txep[i] = NULL;
diff --git a/drivers/net/fm10k/fm10k_rxtx_vec.c b/drivers/net/fm10k/fm10k_rxtx_vec.c
index 27f3e43..825e3c1 100644
--- a/drivers/net/fm10k/fm10k_rxtx_vec.c
+++ b/drivers/net/fm10k/fm10k_rxtx_vec.c
@@ -754,12 +754,12 @@ fm10k_tx_free_bufs(struct fm10k_tx_queue *txq)
* next_dd - (rs_thresh-1)
*/
txep = &txq->sw_ring[txq->next_dd - (n - 1)];
- m = __rte_pktmbuf_prefree_seg(txep[0]);
+ m = rte_pktmbuf_prefree_seg(txep[0]);
if (likely(m != NULL)) {
free[0] = m;
nb_free = 1;
for (i = 1; i < n; i++) {
- m = __rte_pktmbuf_prefree_seg(txep[i]);
+ m = rte_pktmbuf_prefree_seg(txep[i]);
if (likely(m != NULL)) {
if (likely(m->pool == free[0]->pool))
free[nb_free++] = m;
@@ -774,7 +774,7 @@ fm10k_tx_free_bufs(struct fm10k_tx_queue *txq)
rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
} else {
for (i = 1; i < n; i++) {
- m = __rte_pktmbuf_prefree_seg(txep[i]);
+ m = rte_pktmbuf_prefree_seg(txep[i]);
if (m != NULL)
rte_mempool_put(m->pool, m);
}
diff --git a/drivers/net/i40e/i40e_rxtx_vec_common.h b/drivers/net/i40e/i40e_rxtx_vec_common.h
index 3745558..76031fe 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_common.h
+++ b/drivers/net/i40e/i40e_rxtx_vec_common.h
@@ -123,12 +123,12 @@ i40e_tx_free_bufs(struct i40e_tx_queue *txq)
* tx_next_dd - (tx_rs_thresh-1)
*/
txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];
- m = __rte_pktmbuf_prefree_seg(txep[0].mbuf);
+ m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
if (likely(m != NULL)) {
free[0] = m;
nb_free = 1;
for (i = 1; i < n; i++) {
- m = __rte_pktmbuf_prefree_seg(txep[i].mbuf);
+ m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
if (likely(m != NULL)) {
if (likely(m->pool == free[0]->pool)) {
free[nb_free++] = m;
@@ -144,7 +144,7 @@ i40e_tx_free_bufs(struct i40e_tx_queue *txq)
rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
} else {
for (i = 1; i < n; i++) {
- m = __rte_pktmbuf_prefree_seg(txep[i].mbuf);
+ m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
if (m != NULL)
rte_mempool_put(m->pool, m);
}
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 9502432..b056107 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -142,7 +142,7 @@ ixgbe_tx_free_bufs(struct ixgbe_tx_queue *txq)

for (i = 0; i < txq->tx_rs_thresh; ++i, ++txep) {
/* free buffers one at a time */
- m = __rte_pktmbuf_prefree_seg(txep->mbuf);
+ m = rte_pktmbuf_prefree_seg(txep->mbuf);
txep->mbuf = NULL;

if (unlikely(m == NULL))
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_common.h b/drivers/net/ixgbe/ixgbe_rxtx_vec_common.h
index a3473b9..a83afe5 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_common.h
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_common.h
@@ -123,12 +123,12 @@ ixgbe_tx_free_bufs(struct ixgbe_tx_queue *txq)
* tx_next_dd - (tx_rs_thresh-1)
*/
txep = &txq->sw_ring_v[txq->tx_next_dd - (n - 1)];
- m = __rte_pktmbuf_prefree_seg(txep[0].mbuf);
+ m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
if (likely(m != NULL)) {
free[0] = m;
nb_free = 1;
for (i = 1; i < n; i++) {
- m = __rte_pktmbuf_prefree_seg(txep[i].mbuf);
+ m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
if (likely(m != NULL)) {
if (likely(m->pool == free[0]->pool))
free[nb_free++] = m;
@@ -143,7 +143,7 @@ ixgbe_tx_free_bufs(struct ixgbe_tx_queue *txq)
rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
} else {
for (i = 1; i < n; i++) {
- m = __rte_pktmbuf_prefree_seg(txep[i].mbuf);
+ m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
if (m != NULL)
rte_mempool_put(m->pool, m);
}
diff --git a/drivers/net/virtio/virtio_rxtx_simple.h b/drivers/net/virtio/virtio_rxtx_simple.h
index b08f859..f531c54 100644
--- a/drivers/net/virtio/virtio_rxtx_simple.h
+++ b/drivers/net/virtio/virtio_rxtx_simple.h
@@ -98,13 +98,13 @@ virtio_xmit_cleanup(struct virtqueue *vq)
desc_idx = (uint16_t)(vq->vq_used_cons_idx &
((vq->vq_nentries >> 1) - 1));
m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
- m = __rte_pktmbuf_prefree_seg(m);
+ m = rte_pktmbuf_prefree_seg(m);
if (likely(m != NULL)) {
free[0] = m;
nb_free = 1;
for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
- m = __rte_pktmbuf_prefree_seg(m);
+ m = rte_pktmbuf_prefree_seg(m);
if (likely(m != NULL)) {
if (likely(m->pool == free[0]->pool))
free[nb_free++] = m;
@@ -123,7 +123,7 @@ virtio_xmit_cleanup(struct virtqueue *vq)
} else {
for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
- m = __rte_pktmbuf_prefree_seg(m);
+ m = rte_pktmbuf_prefree_seg(m);
if (m != NULL)
rte_mempool_put(m->pool, m);
}
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index ce57d47..b61c430 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -1213,8 +1213,23 @@ static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
__rte_mbuf_raw_free(md);
}

-static inline struct rte_mbuf* __attribute__((always_inline))
-__rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
+/**
+ * Decrease reference counter and unlink a mbuf segment
+ *
+ * This function does the same than a free, except that it does not
+ * return the segment to its pool.
+ * It decreases the reference counter, and if it reaches 0, it is
+ * detached from its parent for an indirect mbuf.
+ *
+ * @param m
+ * The mbuf to be unlinked
+ * @return
+ * - (m) if it is the last reference. It can be recycled or freed.
+ * - (NULL) if the mbuf still has remaining references on it.
+ */
+__attribute__((always_inline))
+static inline struct rte_mbuf *
+rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
{
__rte_mbuf_sanity_check(m, 0);

@@ -1227,6 +1242,14 @@ __rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
return NULL;
}

+/* deprecated, replaced by rte_pktmbuf_prefree_seg() */
+__rte_deprecated
+static inline struct rte_mbuf *
+__rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
+{
+ return rte_pktmbuf_prefree_seg(m);
+}
+
/**
* Free a segment of a packet mbuf into its original mempool.
*
@@ -1239,7 +1262,8 @@ __rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
static inline void __attribute__((always_inline))
rte_pktmbuf_free_seg(struct rte_mbuf *m)
{
- if (likely(NULL != (m = __rte_pktmbuf_prefree_seg(m)))) {
+ m = rte_pktmbuf_prefree_seg(m);
+ if (likely(m != NULL)) {
m->next = NULL;
__rte_mbuf_raw_free(m);
}
--
2.8.1
Olivier Matz
2017-03-08 09:41:55 UTC
Permalink
Set the value of m->refcnt to 1, m->nb_segs to 1 and m->next
to NULL when the mbuf is stored inside the mempool (unused).
This is done in rte_pktmbuf_prefree_seg(), before freeing or
recycling a mbuf.

Before this patch, the value of m->refcnt was expected to be 0
while in pool.

The objectives are:

- to avoid drivers to set m->next to NULL in the early Rx path, since
this field is in the second 64B of the mbuf and its access could
trigger a cache miss

- rationalize the behavior of raw_alloc/raw_free: one is now the
symmetric of the other, and refcnt is never changed in these functions.

Signed-off-by: Olivier Matz <***@6wind.com>
---
drivers/net/mlx5/mlx5_rxtx.c | 5 ++---
drivers/net/mpipe/mpipe_tilegx.c | 1 +
lib/librte_mbuf/rte_mbuf.c | 2 ++
lib/librte_mbuf/rte_mbuf.h | 42 +++++++++++++++++++++++++++++-----------
4 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 41a5bb2..fc59544 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1398,7 +1398,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
while (pkt != seg) {
assert(pkt != (*rxq->elts)[idx]);
rep = NEXT(pkt);
- rte_mbuf_refcnt_set(pkt, 0);
+ NEXT(pkt) = NULL;
+ NB_SEGS(pkt) = 1;
rte_mbuf_raw_free(pkt);
pkt = rep;
}
@@ -1409,13 +1410,11 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt,
&rss_hash_res);
if (!len) {
- rte_mbuf_refcnt_set(rep, 0);
rte_mbuf_raw_free(rep);
break;
}
if (unlikely(len == -1)) {
/* RX error, packet is likely too large. */
- rte_mbuf_refcnt_set(rep, 0);
rte_mbuf_raw_free(rep);
++rxq->stats.idropped;
goto skip;
diff --git a/drivers/net/mpipe/mpipe_tilegx.c b/drivers/net/mpipe/mpipe_tilegx.c
index 536b8ea..0135e2f 100644
--- a/drivers/net/mpipe/mpipe_tilegx.c
+++ b/drivers/net/mpipe/mpipe_tilegx.c
@@ -557,6 +557,7 @@ mpipe_recv_flush_stack(struct mpipe_dev_priv *priv)
mbuf->packet_type = 0;
mbuf->data_len = 0;
mbuf->pkt_len = 0;
+ mbuf->next = NULL;

rte_mbuf_raw_free(mbuf);
}
diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
index 72ad91e..0acc810 100644
--- a/lib/librte_mbuf/rte_mbuf.c
+++ b/lib/librte_mbuf/rte_mbuf.c
@@ -145,6 +145,8 @@ rte_pktmbuf_init(struct rte_mempool *mp,
m->pool = mp;
m->nb_segs = 1;
m->port = 0xff;
+ rte_mbuf_refcnt_set(m, 1);
+ m->next = NULL;
}

/* helper to create a mbuf pool */
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 575dc9d..b4fe786 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -768,6 +768,11 @@ rte_mbuf_sanity_check(const struct rte_mbuf *m, int is_header);
* initializing all the required fields. See rte_pktmbuf_reset().
* For standard needs, prefer rte_pktmbuf_alloc().
*
+ * The caller can expect that the following fields of the mbuf structure
+ * are initialized: buf_addr, buf_physaddr, buf_len, refcnt=1, nb_segs=1,
+ * next=NULL, pool, priv_size. The other fields must be initialized
+ * by the caller.
+ *
* @param mp
* The mempool from which mbuf is allocated.
* @return
@@ -782,8 +787,9 @@ static inline struct rte_mbuf *rte_mbuf_raw_alloc(struct rte_mempool *mp)
if (rte_mempool_get(mp, &mb) < 0)
return NULL;
m = (struct rte_mbuf *)mb;
- RTE_ASSERT(rte_mbuf_refcnt_read(m) == 0);
- rte_mbuf_refcnt_set(m, 1);
+ RTE_ASSERT(rte_mbuf_refcnt_read(m) == 1);
+ RTE_ASSERT(m->next == NULL);
+ RTE_ASSERT(m->nb_segs == 1);
__rte_mbuf_sanity_check(m, 0);

return m;
@@ -792,8 +798,13 @@ static inline struct rte_mbuf *rte_mbuf_raw_alloc(struct rte_mempool *mp)
/**
* Put mbuf back into its original mempool.
*
- * The caller must ensure that the mbuf is direct and that the
- * reference counter is 0.
+ * The caller must ensure that the mbuf is direct and properly
+ * reinitialized (refcnt=1, next=NULL, nb_segs=1), as done by
+ * rte_pktmbuf_prefree_seg().
+ *
+ * This function should be used with care, when optimization is
+ * required. For standard needs, prefer rte_pktmbuf_free() or
+ * rte_pktmbuf_free_seg().
*
* @param m
* The mbuf to be freed.
@@ -802,13 +813,16 @@ static inline void __attribute__((always_inline))
rte_mbuf_raw_free(struct rte_mbuf *m)
{
RTE_ASSERT(RTE_MBUF_DIRECT(m));
- RTE_ASSERT(rte_mbuf_refcnt_read(m) == 0);
+ RTE_ASSERT(rte_mbuf_refcnt_read(m) == 1);
+ RTE_ASSERT(m->next == NULL);
+ RTE_ASSERT(m->nb_segs == 1);
+ __rte_mbuf_sanity_check(m, 0);
rte_mempool_put(m->pool, m);
}

/* compat with older versions */
__rte_deprecated
-static inline void __attribute__((always_inline))
+static inline void
__rte_mbuf_raw_free(struct rte_mbuf *m)
{
rte_mbuf_raw_free(m);
@@ -1219,8 +1233,12 @@ static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
m->data_len = 0;
m->ol_flags = 0;

- if (rte_mbuf_refcnt_update(md, -1) == 0)
+ if (rte_mbuf_refcnt_update(md, -1) == 0) {
+ md->next = NULL;
+ md->nb_segs = 1;
+ rte_mbuf_refcnt_set(md, 1);
rte_mbuf_raw_free(md);
+ }
}

/**
@@ -1244,9 +1262,13 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
__rte_mbuf_sanity_check(m, 0);

if (likely(rte_mbuf_refcnt_update(m, -1) == 0)) {
- /* if this is an indirect mbuf, it is detached. */
if (RTE_MBUF_INDIRECT(m))
rte_pktmbuf_detach(m);
+
+ m->next = NULL;
+ m->nb_segs = 1;
+ rte_mbuf_refcnt_set(m, 1);
+
return m;
}
return NULL;
@@ -1273,10 +1295,8 @@ static inline void __attribute__((always_inline))
rte_pktmbuf_free_seg(struct rte_mbuf *m)
{
m = rte_pktmbuf_prefree_seg(m);
- if (likely(m != NULL)) {
- m->next = NULL;
+ if (likely(m != NULL))
rte_mbuf_raw_free(m);
- }
}

/**
--
2.8.1
Bruce Richardson
2017-03-31 11:21:38 UTC
Permalink
Post by Olivier Matz
Set the value of m->refcnt to 1, m->nb_segs to 1 and m->next
to NULL when the mbuf is stored inside the mempool (unused).
This is done in rte_pktmbuf_prefree_seg(), before freeing or
recycling a mbuf.
Before this patch, the value of m->refcnt was expected to be 0
while in pool.
- to avoid having drivers set m->next to NULL in the early Rx path, since
this field is in the second 64B of the mbuf and its access could
trigger a cache miss
- rationalize the behavior of raw_alloc/raw_free: one is now the
symmetric of the other, and refcnt is never changed in these functions.
---
drivers/net/mlx5/mlx5_rxtx.c | 5 ++---
drivers/net/mpipe/mpipe_tilegx.c | 1 +
lib/librte_mbuf/rte_mbuf.c | 2 ++
lib/librte_mbuf/rte_mbuf.h | 42 +++++++++++++++++++++++++++++-----------
4 files changed, 36 insertions(+), 14 deletions(-)
<snip>
Post by Olivier Matz
/**
@@ -1244,9 +1262,13 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
__rte_mbuf_sanity_check(m, 0);
if (likely(rte_mbuf_refcnt_update(m, -1) == 0)) {
- /* if this is an indirect mbuf, it is detached. */
if (RTE_MBUF_INDIRECT(m))
rte_pktmbuf_detach(m);
+
+ m->next = NULL;
+ m->nb_segs = 1;
+ rte_mbuf_refcnt_set(m, 1);
+
return m;
}
return NULL;
Do we need to make this change to prefree_seg? If we update the detach
function to set the next point to null on detaching a segment, and if we
change the "free" function which frees a whole chain of mbufs, we should
be covered, should we not? If we are freeing a standalone segment, that
segment should already have its nb_segs and next pointers correct.

/Bruce
Ananyev, Konstantin
2017-03-31 11:51:13 UTC
Permalink
-----Original Message-----
From: Richardson, Bruce
Sent: Friday, March 31, 2017 12:22 PM
Subject: Re: [PATCH 3/9] mbuf: set mbuf fields while in pool
Post by Olivier Matz
Set the value of m->refcnt to 1, m->nb_segs to 1 and m->next
to NULL when the mbuf is stored inside the mempool (unused).
This is done in rte_pktmbuf_prefree_seg(), before freeing or
recycling a mbuf.
Before this patch, the value of m->refcnt was expected to be 0
while in pool.
- to avoid having drivers set m->next to NULL in the early Rx path, since
this field is in the second 64B of the mbuf and its access could
trigger a cache miss
- rationalize the behavior of raw_alloc/raw_free: one is now the
symmetric of the other, and refcnt is never changed in these functions.
---
drivers/net/mlx5/mlx5_rxtx.c | 5 ++---
drivers/net/mpipe/mpipe_tilegx.c | 1 +
lib/librte_mbuf/rte_mbuf.c | 2 ++
lib/librte_mbuf/rte_mbuf.h | 42 +++++++++++++++++++++++++++++-----------
4 files changed, 36 insertions(+), 14 deletions(-)
<snip>
Post by Olivier Matz
/**
@@ -1244,9 +1262,13 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
__rte_mbuf_sanity_check(m, 0);
if (likely(rte_mbuf_refcnt_update(m, -1) == 0)) {
- /* if this is an indirect mbuf, it is detached. */
if (RTE_MBUF_INDIRECT(m))
rte_pktmbuf_detach(m);
+
+ m->next = NULL;
+ m->nb_segs = 1;
+ rte_mbuf_refcnt_set(m, 1);
+
return m;
}
return NULL;
Do we need to make this change to prefree_seg? If we update the detach
function to set the next point to null on detaching a segment, and if we
change the "free" function which frees a whole chain of mbufs, we should
be covered, should we not? If we are freeing a standalone segment, that
segment should already have its nb_segs and next pointers correct.
detach() is invoked only for indirect mbufs.
We can have a chain of direct mbufs too.
About free() - most PMD use either rte_pktmbuf_free_seg()
or rte_pktmbuf_prefree_seg();rte_mempool_put_bulk(); directly.
Konstantin

Olivier Matz
2017-03-08 09:41:56 UTC
Permalink
Now that the m->next pointer and m->nb_segs are expected to be set (to
NULL and 1 respectively) after a mempool_get(), we can avoid writing them
in the Rx functions of drivers.

Only some drivers are patched, it's not an exhaustive patch. It gives
the idea to do the same in other drivers.

Signed-off-by: Olivier Matz <***@6wind.com>
---
drivers/net/i40e/i40e_rxtx_vec_sse.c | 6 ------
drivers/net/ixgbe/ixgbe_rxtx.c | 8 --------
drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 6 ------
drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c | 6 ------
drivers/net/null/rte_eth_null.c | 2 --
drivers/net/virtio/virtio_rxtx.c | 4 ----
6 files changed, 32 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index b95cc8e..2f861fd 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -424,12 +424,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
/* store the resulting 32-bit value */
*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
split_packet += RTE_I40E_DESCS_PER_LOOP;
-
- /* zero-out next pointers */
- rx_pkts[pos]->next = NULL;
- rx_pkts[pos + 1]->next = NULL;
- rx_pkts[pos + 2]->next = NULL;
- rx_pkts[pos + 3]->next = NULL;
}

/* C.3 calc available number of desc */
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index b056107..813c494 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -1556,8 +1556,6 @@ ixgbe_rx_alloc_bufs(struct ixgbe_rx_queue *rxq, bool reset_mbuf)
/* populate the static rte mbuf fields */
mb = rxep[i].mbuf;
if (reset_mbuf) {
- mb->next = NULL;
- mb->nb_segs = 1;
mb->port = rxq->port_id;
}

@@ -2165,12 +2163,6 @@ ixgbe_recv_pkts_lro(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts,
goto next_desc;
}

- /*
- * This is the last buffer of the received packet - return
- * the current cluster to the user.
- */
- rxm->next = NULL;
-
/* Initialize the first mbuf of the returned packet */
ixgbe_fill_cluster_head_buf(first_seg, &rxd, rxq, staterr);

diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
index e2715cb..2c04161 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
@@ -330,12 +330,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
*(int *)split_packet = ~stat & IXGBE_VPMD_DESC_EOP_MASK;

split_packet += RTE_IXGBE_DESCS_PER_LOOP;
-
- /* zero-out next pointers */
- rx_pkts[pos]->next = NULL;
- rx_pkts[pos + 1]->next = NULL;
- rx_pkts[pos + 2]->next = NULL;
- rx_pkts[pos + 3]->next = NULL;
}

rte_prefetch_non_temporal(rxdp + RTE_IXGBE_DESCS_PER_LOOP);
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
index abbf284..65c5da3 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
@@ -425,12 +425,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
/* store the resulting 32-bit value */
*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
split_packet += RTE_IXGBE_DESCS_PER_LOOP;
-
- /* zero-out next pointers */
- rx_pkts[pos]->next = NULL;
- rx_pkts[pos + 1]->next = NULL;
- rx_pkts[pos + 2]->next = NULL;
- rx_pkts[pos + 3]->next = NULL;
}

/* C.3 calc available number of desc */
diff --git a/drivers/net/null/rte_eth_null.c b/drivers/net/null/rte_eth_null.c
index 57203e2..7e14da0 100644
--- a/drivers/net/null/rte_eth_null.c
+++ b/drivers/net/null/rte_eth_null.c
@@ -112,8 +112,6 @@ eth_null_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
break;
bufs[i]->data_len = (uint16_t)packet_size;
bufs[i]->pkt_len = packet_size;
- bufs[i]->nb_segs = 1;
- bufs[i]->next = NULL;
bufs[i]->port = h->internals->port_id;
}

diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index cab6e8f..b3e6d80 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -772,8 +772,6 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
rxm->ol_flags = 0;
rxm->vlan_tci = 0;

- rxm->nb_segs = 1;
- rxm->next = NULL;
rxm->pkt_len = (uint32_t)(len[i] - hdr_size);
rxm->data_len = (uint16_t)(len[i] - hdr_size);

@@ -900,7 +898,6 @@ virtio_recv_mergeable_pkts(void *rx_queue,

rxm->data_off = RTE_PKTMBUF_HEADROOM;
rxm->nb_segs = seg_num;
- rxm->next = NULL;
rxm->ol_flags = 0;
rxm->vlan_tci = 0;
rxm->pkt_len = (uint32_t)(len[0] - hdr_size);
@@ -945,7 +942,6 @@ virtio_recv_mergeable_pkts(void *rx_queue,
rxm = rcv_pkts[extra_idx];

rxm->data_off = RTE_PKTMBUF_HEADROOM - hdr_size;
- rxm->next = NULL;
rxm->pkt_len = (uint32_t)(len[extra_idx]);
rxm->data_len = (uint16_t)(len[extra_idx]);
--
2.8.1
Olivier Matz
2017-03-08 09:41:57 UTC
Permalink
From: Jerin Jacob <***@caviumnetworks.com>

To avoid multiple stores on fast path, Ethernet drivers
aggregate the writes to data_off, refcnt, nb_segs and port
to an uint64_t data and write the data in one shot
with uint64_t* at &mbuf->rearm_data address.

Some of the non-IA platforms have store operation overhead
if the store address is not naturally aligned. This patch
fixes the performance issue on those targets.

Signed-off-by: Jerin Jacob <***@caviumnetworks.com>
Signed-off-by: Olivier Matz <***@6wind.com>
---
drivers/net/fm10k/fm10k_rxtx_vec.c | 3 ---
drivers/net/i40e/i40e_rxtx_vec_sse.c | 5 +----
drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 3 ---
drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c | 3 ---
lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h | 3 +--
lib/librte_mbuf/rte_mbuf.h | 6 +++---
6 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/drivers/net/fm10k/fm10k_rxtx_vec.c b/drivers/net/fm10k/fm10k_rxtx_vec.c
index 825e3c1..61a65e9 100644
--- a/drivers/net/fm10k/fm10k_rxtx_vec.c
+++ b/drivers/net/fm10k/fm10k_rxtx_vec.c
@@ -324,9 +324,6 @@ fm10k_rxq_rearm(struct fm10k_rx_queue *rxq)

/* Flush mbuf with pkt template.
* Data to be rearmed is 6 bytes long.
- * Though, RX will overwrite ol_flags that are coming next
- * anyway. So overwrite whole 8 bytes with one load:
- * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
*/
p0 = (uintptr_t)&mb0->rearm_data;
*(uint64_t *)p0 = rxq->mbuf_initializer;
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 2f861fd..e17235a 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -87,11 +87,8 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
mb0 = rxep[0].mbuf;
mb1 = rxep[1].mbuf;

- /* Flush mbuf with pkt template.
+ /* Flush mbuf with pkt template.
* Data to be rearmed is 6 bytes long.
- * Though, RX will overwrite ol_flags that are coming next
- * anyway. So overwrite whole 8 bytes with one load:
- * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
*/
p0 = (uintptr_t)&mb0->rearm_data;
*(uint64_t *)p0 = rxq->mbuf_initializer;
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
index 2c04161..bc8924f 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
@@ -85,9 +85,6 @@ ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
/*
* Flush mbuf with pkt template.
* Data to be rearmed is 6 bytes long.
- * Though, RX will overwrite ol_flags that are coming next
- * anyway. So overwrite whole 8 bytes with one load:
- * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
*/
vst1_u8((uint8_t *)&mb0->rearm_data, p);
paddr = mb0->buf_physaddr + RTE_PKTMBUF_HEADROOM;
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
index 65c5da3..62afe31 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
@@ -90,9 +90,6 @@ ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
/*
* Flush mbuf with pkt template.
* Data to be rearmed is 6 bytes long.
- * Though, RX will overwrite ol_flags that are coming next
- * anyway. So overwrite whole 8 bytes with one load:
- * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
*/
p0 = (uintptr_t)&mb0->rearm_data;
*(uint64_t *)p0 = rxq->mbuf_initializer;
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
index 09713b0..f24f79f 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
@@ -116,11 +116,10 @@ struct rte_kni_fifo {
struct rte_kni_mbuf {
void *buf_addr __attribute__((__aligned__(RTE_CACHE_LINE_SIZE)));
uint64_t buf_physaddr;
- char pad0[2];
uint16_t data_off; /**< Start address of data in segment buffer. */
char pad1[2];
uint8_t nb_segs; /**< Number of segments. */
- char pad4[1];
+ char pad4[3];
uint64_t ol_flags; /**< Offload features. */
char pad2[4];
uint32_t pkt_len; /**< Total pkt len: sum of all segment data_len. */
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index b4fe786..4dc9a20 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -393,10 +393,8 @@ struct rte_mbuf {
void *buf_addr; /**< Virtual address of segment buffer. */
phys_addr_t buf_physaddr; /**< Physical address of segment buffer. */

- uint16_t buf_len; /**< Length of segment buffer. */
-
/* next 6 bytes are initialised on RX descriptor rearm */
- MARKER8 rearm_data;
+ MARKER64 rearm_data;
uint16_t data_off;

/**
@@ -414,6 +412,7 @@ struct rte_mbuf {
};
uint8_t nb_segs; /**< Number of segments. */
uint8_t port; /**< Input port. */
+ uint16_t pad; /**< 2B pad for naturally aligned ol_flags */

uint64_t ol_flags; /**< Offload features. */

@@ -474,6 +473,7 @@ struct rte_mbuf {
/** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ_STRIPPED is set. */
uint16_t vlan_tci_outer;

+ uint16_t buf_len; /**< Length of segment buffer. */
/* second cache line - fields only used in slow path or on TX */
MARKER cacheline1 __rte_cache_min_aligned;
--
2.8.1
Olivier Matz
2017-03-08 09:41:58 UTC
Permalink
Change the size of m->port and m->nb_segs to 16 bits. It is now possible
to reference a port identifier larger than 256 and have a mbuf chain
larger than 256 segments.

Signed-off-by: Olivier Matz <***@6wind.com>
---
app/test-pmd/csumonly.c | 4 ++--
.../linuxapp/eal/include/exec-env/rte_kni_common.h | 4 ++--
lib/librte_mbuf/rte_mbuf.h | 12 +++++++-----
3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
index 88cc842..5eaff9b 100644
--- a/app/test-pmd/csumonly.c
+++ b/app/test-pmd/csumonly.c
@@ -583,7 +583,7 @@ pkt_copy_split(const struct rte_mbuf *pkt)
rc = mbuf_copy_split(pkt, md, seglen, nb_seg);
if (rc < 0)
RTE_LOG(ERR, USER1,
- "mbuf_copy_split for %p(len=%u, nb_seg=%hhu) "
+ "mbuf_copy_split for %p(len=%u, nb_seg=%u) "
"into %u segments failed with error code: %d\n",
pkt, pkt->pkt_len, pkt->nb_segs, nb_seg, rc);

@@ -801,7 +801,7 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
char buf[256];

printf("-----------------\n");
- printf("port=%u, mbuf=%p, pkt_len=%u, nb_segs=%hhu:\n",
+ printf("port=%u, mbuf=%p, pkt_len=%u, nb_segs=%u:\n",
fs->rx_port, m, m->pkt_len, m->nb_segs);
/* dump rx parsed packet info */
rte_get_rx_ol_flag_list(rx_ol_flags, buf, sizeof(buf));
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
index f24f79f..2ac879f 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
@@ -118,8 +118,8 @@ struct rte_kni_mbuf {
uint64_t buf_physaddr;
uint16_t data_off; /**< Start address of data in segment buffer. */
char pad1[2];
- uint8_t nb_segs; /**< Number of segments. */
- char pad4[3];
+ uint16_t nb_segs; /**< Number of segments. */
+ char pad4[2];
uint64_t ol_flags; /**< Offload features. */
char pad2[4];
uint32_t pkt_len; /**< Total pkt len: sum of all segment data_len. */
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 4dc9a20..45cd6b9 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -393,12 +393,13 @@ struct rte_mbuf {
void *buf_addr; /**< Virtual address of segment buffer. */
phys_addr_t buf_physaddr; /**< Physical address of segment buffer. */

- /* next 6 bytes are initialised on RX descriptor rearm */
+ /* next 8 bytes are initialised on RX descriptor rearm */
MARKER64 rearm_data;
uint16_t data_off;

/**
- * 16-bit Reference counter.
+ * Reference counter. Its size should at least equal to the size
+ * of port field (16 bits), to support zero-copy broadcast.
* It should only be accessed using the following functions:
* rte_mbuf_refcnt_update(), rte_mbuf_refcnt_read(), and
* rte_mbuf_refcnt_set(). The functionality of these functions (atomic,
@@ -410,9 +411,10 @@ struct rte_mbuf {
rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */
uint16_t refcnt; /**< Non-atomically accessed refcnt */
};
- uint8_t nb_segs; /**< Number of segments. */
- uint8_t port; /**< Input port. */
- uint16_t pad; /**< 2B pad for naturally aligned ol_flags */
+ uint16_t nb_segs; /**< Number of segments. */
+
+ /** Input port (16 bits to support more than 256 virtual ports). */
+ uint16_t port;

uint64_t ol_flags; /**< Offload features. */
--
2.8.1
Olivier Matz
2017-03-08 09:42:01 UTC
Permalink
Move the vlan_tci field near vlan_tci_outer and buf_len near data_len
for more consistency. It opens the door for get/set of the 2 vlan tci at
the same time.

Suggested-by: Andrey Chilikin <***@intel.com>
Signed-off-by: Olivier Matz <***@6wind.com>
---
lib/librte_mbuf/rte_mbuf.h | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index fd97bd3..ada98d5 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -449,8 +449,7 @@ struct rte_mbuf {

uint32_t pkt_len; /**< Total pkt len: sum of all segments. */
uint16_t data_len; /**< Amount of data in segment buffer. */
- /** VLAN TCI (CPU order), valid if PKT_RX_VLAN_STRIPPED is set. */
- uint16_t vlan_tci;
+ uint16_t buf_len; /**< Size of segment buffer. */

union {
uint32_t rss; /**< RSS hash result if RSS enabled */
@@ -475,11 +474,11 @@ struct rte_mbuf {
uint32_t usr; /**< User defined tags. See rte_distributor_process() */
} hash; /**< hash information */

+ /** VLAN TCI (CPU order), valid if PKT_RX_VLAN_STRIPPED is set. */
+ uint16_t vlan_tci;
/** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ_STRIPPED is set. */
uint16_t vlan_tci_outer;

- uint16_t buf_len; /**< Length of segment buffer. */
-
/** Valid if PKT_RX_TIMESTAMP is set. The unit and time reference
* are not normalized but are always the same for a given port.
*/
--
2.8.1
Olivier Matz
2017-03-08 09:42:00 UTC
Permalink
The field itself is not fully described yet, but this commit reserves
the room in the mbuf.

Signed-off-by: Olivier Matz <***@6wind.com>
---
lib/librte_mbuf/rte_mbuf.c | 2 ++
lib/librte_mbuf/rte_mbuf.h | 12 ++++++++++++
2 files changed, 14 insertions(+)

diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
index 0acc810..f679bce 100644
--- a/lib/librte_mbuf/rte_mbuf.c
+++ b/lib/librte_mbuf/rte_mbuf.c
@@ -322,6 +322,7 @@ const char *rte_get_rx_ol_flag_name(uint64_t mask)
case PKT_RX_IEEE1588_TMST: return "PKT_RX_IEEE1588_TMST";
case PKT_RX_QINQ_STRIPPED: return "PKT_RX_QINQ_STRIPPED";
case PKT_RX_LRO: return "PKT_RX_LRO";
+ case PKT_RX_TIMESTAMP: return "PKT_RX_TIMESTAMP";
default: return NULL;
}
}
@@ -356,6 +357,7 @@ rte_get_rx_ol_flag_list(uint64_t mask, char *buf, size_t buflen)
{ PKT_RX_IEEE1588_TMST, PKT_RX_IEEE1588_TMST, NULL },
{ PKT_RX_QINQ_STRIPPED, PKT_RX_QINQ_STRIPPED, NULL },
{ PKT_RX_LRO, PKT_RX_LRO, NULL },
+ { PKT_RX_TIMESTAMP, PKT_RX_TIMESTAMP, NULL },
};
const char *name;
unsigned int i;
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index c75a62a..fd97bd3 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -177,6 +177,11 @@ extern "C" {
*/
#define PKT_RX_LRO (1ULL << 16)

+/**
+ * Indicate that the timestamp field in the mbuf is valid.
+ */
+#define PKT_RX_TIMESTAMP (1ULL << 17)
+
/* add new RX flags here */

/* add new TX flags here */
@@ -474,6 +479,12 @@ struct rte_mbuf {
uint16_t vlan_tci_outer;

uint16_t buf_len; /**< Length of segment buffer. */
+
+ /** Valid if PKT_RX_TIMESTAMP is set. The unit and time reference
+ * are not normalized but are always the same for a given port.
+ */
+ uint64_t timestamp;
+
/* second cache line - fields only used in slow path or on TX */
MARKER cacheline1 __rte_cache_min_aligned;

@@ -1201,6 +1212,7 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
mi->nb_segs = 1;
mi->ol_flags = m->ol_flags | IND_ATTACHED_MBUF;
mi->packet_type = m->packet_type;
+ mi->timestamp = m->timestamp;

__rte_mbuf_sanity_check(mi, 1);
__rte_mbuf_sanity_check(m, 0);
--
2.8.1
Olivier Matz
2017-03-08 09:41:59 UTC
Permalink
Move this field in the second cache line, since no driver use it
in Rx path. The freed space will be used by a timestamp in next
commit.

Signed-off-by: Olivier Matz <***@6wind.com>
---
lib/librte_mbuf/rte_mbuf.h | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 45cd6b9..c75a62a 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -470,8 +470,6 @@ struct rte_mbuf {
uint32_t usr; /**< User defined tags. See rte_distributor_process() */
} hash; /**< hash information */

- uint32_t seqn; /**< Sequence number. See also rte_reorder_insert() */
-
/** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ_STRIPPED is set. */
uint16_t vlan_tci_outer;

@@ -516,6 +514,10 @@ struct rte_mbuf {

/** Timesync flags for use with IEEE1588. */
uint16_t timesync;
+
+ /** Sequence number. See also rte_reorder_insert(). */
+ uint32_t seqn;
+
} __rte_cache_aligned;

/**
--
2.8.1
Olivier Matz
2017-03-29 15:56:29 UTC
Permalink
Hi,

Does anyone have any other comment on this series?
Can it be applied?


Thanks,
Olivier
Post by Olivier Matz
Based on discussions done in [1] and in this thread, this patchset reorganizes
the mbuf.
- reorder structure to increase vector performance on some non-ia
platforms.
- add a 64bits timestamp field in the 1st cache line. This timestamp
is not normalized, i.e. no unit or time reference is enforced. A
library may be added to do this job in the future.
- m->next, m->nb_segs, and m->refcnt are always initialized for mbufs
in the pool, avoiding the need of setting m->next (located in the
2nd cache line) in the Rx path for mono-segment packets.
- change port and nb_segs to 16 bits
- move seqn in the 2nd cache line
- move refcnt and nb_segs to the 2nd cache line: many drivers sets
them in the Rx path, so it could introduce a performance regression, or
it would require to change all the drivers, which is not an easy task.
- remove the m->port field: too much impact on many examples and libraries,
and some people highlighted they are using it.
- moving m->next in the 1st cache line: there is not enough room, and having
it set to NULL for unused mbuf should remove the need for it.
- merge seqn and timestamp together in a union: we could imagine use cases
where both are activated. There is no flag indicating the presence of seqn,
so it looks preferable to keep them separated for now.
I made some basic performance tests (ixgbe) and see no regression.
Other tests from NIC vendors are welcome.
Once this patchset is pushed, the Rx path of drivers could be optimized a bit,
by removing writes to m->next, m->nb_segs and m->refcnt. The patch 4/8 gives an
idea of what could be done.
[1] http://dpdk.org/ml/archives/dev/2016-October/049338.html
- fix reset of mbuf fields in case of indirect mbuf in rte_pktmbuf_prefree_seg()
- do not enforce a unit or time reference for m->timestamp
- reorganize fields to make vlan and outer vlan consecutive
- enhance documentation of m->refcnt and m->port to explain why they are 16bits
mbuf: make rearm data address naturally aligned
mbuf: make segment prefree function public
mbuf: make raw free function public
mbuf: set mbuf fields while in pool
drivers/net: don't touch mbuf next or nb segs on Rx
mbuf: use 2 bytes for port and nb segments
mbuf: move sequence number in second cache line
mbuf: add a timestamp field
mbuf: reorder VLAN tci and buffer len fields
app/test-pmd/csumonly.c | 4 +-
drivers/net/ena/ena_ethdev.c | 2 +-
drivers/net/enic/enic_rxtx.c | 2 +-
drivers/net/fm10k/fm10k_rxtx.c | 6 +-
drivers/net/fm10k/fm10k_rxtx_vec.c | 9 +-
drivers/net/i40e/i40e_rxtx_vec_common.h | 6 +-
drivers/net/i40e/i40e_rxtx_vec_sse.c | 11 +-
drivers/net/ixgbe/ixgbe_rxtx.c | 10 +-
drivers/net/ixgbe/ixgbe_rxtx_vec_common.h | 6 +-
drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 9 --
drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c | 9 --
drivers/net/mlx5/mlx5_rxtx.c | 11 +-
drivers/net/mpipe/mpipe_tilegx.c | 3 +-
drivers/net/null/rte_eth_null.c | 2 -
drivers/net/virtio/virtio_rxtx.c | 4 -
drivers/net/virtio/virtio_rxtx_simple.h | 6 +-
.../linuxapp/eal/include/exec-env/rte_kni_common.h | 5 +-
lib/librte_mbuf/rte_mbuf.c | 4 +
lib/librte_mbuf/rte_mbuf.h | 123 ++++++++++++++++-----
19 files changed, 130 insertions(+), 102 deletions(-)
Morten Brørup
2017-03-29 16:03:43 UTC
Permalink
Post by Olivier Matz
Does anyone have any other comment on this series?
Great work!
Post by Olivier Matz
Can it be applied?
Yes.


Med venli
Bruce Richardson
2017-03-29 20:09:23 UTC
Permalink
Post by Olivier Matz
Hi,
Does anyone have any other comment on this series?
Can it be applied?
Thanks,
Olivier
I assume all driver maintainers have done performance analysis to check
for regressions. Perhaps they can confirm this is the case.

/Bruce
Post by Olivier Matz
Post by Olivier Matz
Based on discussions done in [1] and in this thread, this patchset reorganizes
the mbuf.
- reorder structure to increase vector performance on some non-ia
platforms.
- add a 64bits timestamp field in the 1st cache line. This timestamp
is not normalized, i.e. no unit or time reference is enforced. A
library may be added to do this job in the future.
- m->next, m->nb_segs, and m->refcnt are always initialized for mbufs
in the pool, avoiding the need of setting m->next (located in the
2nd cache line) in the Rx path for mono-segment packets.
- change port and nb_segs to 16 bits
- move seqn in the 2nd cache line
- move refcnt and nb_segs to the 2nd cache line: many drivers sets
them in the Rx path, so it could introduce a performance regression, or
it would require to change all the drivers, which is not an easy task.
- remove the m->port field: too much impact on many examples and libraries,
and some people highlighted they are using it.
- moving m->next in the 1st cache line: there is not enough room, and having
it set to NULL for unused mbuf should remove the need for it.
- merge seqn and timestamp together in a union: we could imagine use cases
where both are activated. There is no flag indicating the presence of seqn,
so it looks preferable to keep them separated for now.
I made some basic performance tests (ixgbe) and see no regression.
Other tests from NIC vendors are welcome.
Once this patchset is pushed, the Rx path of drivers could be optimized a bit,
by removing writes to m->next, m->nb_segs and m->refcnt. The patch 4/8 gives an
idea of what could be done.
[1] http://dpdk.org/ml/archives/dev/2016-October/049338.html
- fix reset of mbuf fields in case of indirect mbuf in rte_pktmbuf_prefree_seg()
- do not enforce a unit or time reference for m->timestamp
- reorganize fields to make vlan and outer vlan consecutive
- enhance documentation of m->refcnt and m->port to explain why they are 16bits
mbuf: make rearm data address naturally aligned
mbuf: make segment prefree function public
mbuf: make raw free function public
mbuf: set mbuf fields while in pool
drivers/net: don't touch mbuf next or nb segs on Rx
mbuf: use 2 bytes for port and nb segments
mbuf: move sequence number in second cache line
mbuf: add a timestamp field
mbuf: reorder VLAN tci and buffer len fields
app/test-pmd/csumonly.c | 4 +-
drivers/net/ena/ena_ethdev.c | 2 +-
drivers/net/enic/enic_rxtx.c | 2 +-
drivers/net/fm10k/fm10k_rxtx.c | 6 +-
drivers/net/fm10k/fm10k_rxtx_vec.c | 9 +-
drivers/net/i40e/i40e_rxtx_vec_common.h | 6 +-
drivers/net/i40e/i40e_rxtx_vec_sse.c | 11 +-
drivers/net/ixgbe/ixgbe_rxtx.c | 10 +-
drivers/net/ixgbe/ixgbe_rxtx_vec_common.h | 6 +-
drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 9 --
drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c | 9 --
drivers/net/mlx5/mlx5_rxtx.c | 11 +-
drivers/net/mpipe/mpipe_tilegx.c | 3 +-
drivers/net/null/rte_eth_null.c | 2 -
drivers/net/virtio/virtio_rxtx.c | 4 -
drivers/net/virtio/virtio_rxtx_simple.h | 6 +-
.../linuxapp/eal/include/exec-env/rte_kni_common.h | 5 +-
lib/librte_mbuf/rte_mbuf.c | 4 +
lib/librte_mbuf/rte_mbuf.h | 123 ++++++++++++++++-----
19 files changed, 130 insertions(+), 102 deletions(-)
Bruce Richardson
2017-03-30 09:31:08 UTC
Permalink
Post by Bruce Richardson
Post by Olivier Matz
Hi,
Does anyone have any other comment on this series?
Can it be applied?
Thanks,
Olivier
I assume all driver maintainers have done performance analysis to check
for regressions. Perhaps they can confirm this is the case.
/Bruce
In the absence, of anyone else reporting performance numbers with this
patchset, I ran a single-thread testpmd test using 2 x 40G ports (i40e)
driver. With RX & TX descriptor ring sizes of 512 or above, I'm seeing a
fairly noticeable performance drop. I still need to dig in more, e.g. do
an RFC2544 zero-loss test, and also bisect the patchset to see what
parts may be causing the problem.

Has anyone else tried any other drivers or systems to see what the perf
impact of this set may be?

/Bruce
Olivier Matz
2017-03-30 12:02:36 UTC
Permalink
Post by Bruce Richardson
Post by Bruce Richardson
Post by Olivier Matz
Hi,
Does anyone have any other comment on this series?
Can it be applied?
Thanks,
Olivier
I assume all driver maintainers have done performance analysis to check
for regressions. Perhaps they can confirm this is the case.
/Bruce
In the absence, of anyone else reporting performance numbers with this
patchset, I ran a single-thread testpmd test using 2 x 40G ports (i40e)
driver. With RX & TX descriptor ring sizes of 512 or above, I'm seeing a
fairly noticable performance drop. I still need to dig in more, e.g. do
an RFC2544 zero-loss test, and also bisect the patchset to see what
parts may be causing the problem.
Has anyone else tried any other drivers or systems to see what the perf
impact of this set may be?
I did, of course. I didn't see any noticeable performance drop on
ixgbe (4 NICs, one port per NIC, 1 core). I can replay the test with
current version.


Olivier
Bruce Richardson
2017-03-30 12:23:06 UTC
Permalink
Post by Olivier Matz
Post by Bruce Richardson
Post by Bruce Richardson
Post by Olivier Matz
Hi,
Does anyone have any other comment on this series?
Can it be applied?
Thanks,
Olivier
I assume all driver maintainers have done performance analysis to check
for regressions. Perhaps they can confirm this is the case.
/Bruce
In the absence, of anyone else reporting performance numbers with this
patchset, I ran a single-thread testpmd test using 2 x 40G ports (i40e)
driver. With RX & TX descriptor ring sizes of 512 or above, I'm seeing a
fairly noticable performance drop. I still need to dig in more, e.g. do
an RFC2544 zero-loss test, and also bisect the patchset to see what
parts may be causing the problem.
Has anyone else tried any other drivers or systems to see what the perf
impact of this set may be?
I did, of course. I didn't see any noticeable performance drop on
ixgbe (4 NICs, one port per NIC, 1 core). I can replay the test with
current version.
I had no doubt you did some perf testing! :-)

Perhaps the regression I see is limited to i40e driver. I've confirmed I
still see it with that driver in zero-loss tests, so next step is to try
and localise what change in the patchset is causing it.

Ideally, though, I think we should see acks or other comments from
driver maintainers at least confirming that they have tested. You cannot
be held responsible for testing every DPDK driver before you submit work
like this.

Regards,
/Bruce
Ananyev, Konstantin
2017-03-30 16:45:18 UTC
Permalink
-----Original Message-----
From: Richardson, Bruce
Sent: Thursday, March 30, 2017 1:23 PM
Subject: Re: [dpdk-dev] [PATCH 0/9] mbuf: structure reorganization
Post by Olivier Matz
Post by Bruce Richardson
Post by Bruce Richardson
Post by Olivier Matz
Hi,
Does anyone have any other comment on this series?
Can it be applied?
Thanks,
Olivier
I assume all driver maintainers have done performance analysis to check
for regressions. Perhaps they can confirm this is the case.
/Bruce
In the absence, of anyone else reporting performance numbers with this
patchset, I ran a single-thread testpmd test using 2 x 40G ports (i40e)
driver. With RX & TX descriptor ring sizes of 512 or above, I'm seeing a
fairly noticable performance drop. I still need to dig in more, e.g. do
an RFC2544 zero-loss test, and also bisect the patchset to see what
parts may be causing the problem.
Has anyone else tried any other drivers or systems to see what the perf
impact of this set may be?
I did, of course. I didn't see any noticeable performance drop on
ixgbe (4 NICs, one port per NIC, 1 core). I can replay the test with
current version.
I had no doubt you did some perf testing! :-)
Perhaps the regression I see is limited to i40e driver. I've confirmed I
still see it with that driver in zero-loss tests, so next step is to try
and localise what change in the patchset is causing it.
Ideally, though, I think we should see acks or other comments from
driver maintainers at least confirming that they have tested. You cannot
be held responsible for testing every DPDK driver before you submit work
like this.
Unfortunately I also see a regression.
Did a quick flood test on 2.8 GHZ IVB with 4x10Gb.
Observed a drop even with default testpmd RXD/TXD numbers (128/512):
from 50.8 Mpps down to 47.8 Mpps.
From what I am seeing the particular patch that causing it:
[dpdk-dev,3/9] mbuf: set mbuf fields while in pool

cc version 5.3.1 20160406 (Red Hat 5.3.1-6) (GCC)
cmdline:
./dpdk.org-1705-mbuf1/x86_64-native-linuxapp-gcc/app/testpmd --lcores='7,8' -n 4 --socket-mem='1024,0' -w 04:00.1 -w 07:00.1 -w 0b:00.1 -w 0e:00.1 -- -i

Konstantin
Ananyev, Konstantin
2017-03-30 16:47:50 UTC
Permalink
-----Original Message-----
Sent: Thursday, March 30, 2017 5:45 PM
Subject: Re: [dpdk-dev] [PATCH 0/9] mbuf: structure reorganization
-----Original Message-----
From: Richardson, Bruce
Sent: Thursday, March 30, 2017 1:23 PM
Subject: Re: [dpdk-dev] [PATCH 0/9] mbuf: structure reorganization
Post by Olivier Matz
Post by Bruce Richardson
Post by Bruce Richardson
Post by Olivier Matz
Hi,
Does anyone have any other comment on this series?
Can it be applied?
Thanks,
Olivier
I assume all driver maintainers have done performance analysis to check
for regressions. Perhaps they can confirm this is the case.
/Bruce
In the absence, of anyone else reporting performance numbers with this
patchset, I ran a single-thread testpmd test using 2 x 40G ports (i40e)
driver. With RX & TX descriptor ring sizes of 512 or above, I'm seeing a
fairly noticable performance drop. I still need to dig in more, e.g. do
an RFC2544 zero-loss test, and also bisect the patchset to see what
parts may be causing the problem.
Has anyone else tried any other drivers or systems to see what the perf
impact of this set may be?
I did, of course. I didn't see any noticeable performance drop on
ixgbe (4 NICs, one port per NIC, 1 core). I can replay the test with
current version.
I had no doubt you did some perf testing! :-)
Perhaps the regression I see is limited to i40e driver. I've confirmed I
still see it with that driver in zero-loss tests, so next step is to try
and localise what change in the patchset is causing it.
Ideally, though, I think we should see acks or other comments from
driver maintainers at least confirming that they have tested. You cannot
be held responsible for testing every DPDK driver before you submit work
like this.
Unfortunately I also see a regression.
Did a quick flood test on 2.8 GHZ IVB with 4x10Gb.
Sorry, forgot to mention - it is on ixgbe.
So it doesn't look like i40e specific.
from 50.8 Mpps down to 47.8 Mpps.
[dpdk-dev,3/9] mbuf: set mbuf fields while in pool
cc version 5.3.1 20160406 (Red Hat 5.3.1-6) (GCC)
./dpdk.org-1705-mbuf1/x86_64-native-linuxapp-gcc/app/testpmd --lcores='7,8' -n 4 --socket-mem='1024,0' -w 04:00.1 -w 07:00.1 -w
0b:00.1 -w 0e:00.1 -- -i
Konstantin
Ananyev, Konstantin
2017-03-30 18:06:35 UTC
Permalink
-----Original Message-----
From: Ananyev, Konstantin
Sent: Thursday, March 30, 2017 5:48 PM
Subject: RE: [dpdk-dev] [PATCH 0/9] mbuf: structure reorganization
-----Original Message-----
Sent: Thursday, March 30, 2017 5:45 PM
Subject: Re: [dpdk-dev] [PATCH 0/9] mbuf: structure reorganization
-----Original Message-----
From: Richardson, Bruce
Sent: Thursday, March 30, 2017 1:23 PM
Subject: Re: [dpdk-dev] [PATCH 0/9] mbuf: structure reorganization
Post by Olivier Matz
Post by Bruce Richardson
Post by Bruce Richardson
Post by Olivier Matz
Hi,
Does anyone have any other comment on this series?
Can it be applied?
Thanks,
Olivier
I assume all driver maintainers have done performance analysis to check
for regressions. Perhaps they can confirm this is the case.
/Bruce
In the absence, of anyone else reporting performance numbers with this
patchset, I ran a single-thread testpmd test using 2 x 40G ports (i40e)
driver. With RX & TX descriptor ring sizes of 512 or above, I'm seeing a
fairly noticable performance drop. I still need to dig in more, e.g. do
an RFC2544 zero-loss test, and also bisect the patchset to see what
parts may be causing the problem.
Has anyone else tried any other drivers or systems to see what the perf
impact of this set may be?
I did, of course. I didn't see any noticeable performance drop on
ixgbe (4 NICs, one port per NIC, 1 core). I can replay the test with
current version.
I had no doubt you did some perf testing! :-)
Perhaps the regression I see is limited to i40e driver. I've confirmed I
still see it with that driver in zero-loss tests, so next step is to try
and localise what change in the patchset is causing it.
Ideally, though, I think we should see acks or other comments from
driver maintainers at least confirming that they have tested. You cannot
be held responsible for testing every DPDK driver before you submit work
like this.
Unfortunately I also see a regression.
Did a quick flood test on 2.8 GHZ IVB with 4x10Gb.
Sorry, forgot to mention - it is on ixgbe.
So it doesn't look like i40e specific.
from 50.8 Mpps down to 47.8 Mpps.
[dpdk-dev,3/9] mbuf: set mbuf fields while in pool
cc version 5.3.1 20160406 (Red Hat 5.3.1-6) (GCC)
./dpdk.org-1705-mbuf1/x86_64-native-linuxapp-gcc/app/testpmd --lcores='7,8' -n 4 --socket-mem='1024,0' -w 04:00.1 -w 07:00.1 -w
0b:00.1 -w 0e:00.1 -- -i
Actually one more question regarding:
[dpdk-dev,9/9] mbuf: reorder VLAN tci and buffer len fields

diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index fd97bd3..ada98d5 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -449,8 +449,7 @@ struct rte_mbuf {

uint32_t pkt_len; /**< Total pkt len: sum of all segments. */
uint16_t data_len; /**< Amount of data in segment buffer. */
- /** VLAN TCI (CPU order), valid if PKT_RX_VLAN_STRIPPED is set. */
- uint16_t vlan_tci;
+ uint16_t buf_len; /**< Size of segment buffer. */

union {
uint32_t rss; /**< RSS hash result if RSS enabled */
@@ -475,11 +474,11 @@ struct rte_mbuf {
uint32_t usr; /**< User defined tags. See rte_distributor_process() */
} hash; /**< hash information */

+ /** VLAN TCI (CPU order), valid if PKT_RX_VLAN_STRIPPED is set. */
+ uint16_t vlan_tci;
/** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ_STRIPPED is set. */
uint16_t vlan_tci_outer;

- uint16_t buf_len; /**< Length of segment buffer. */
-
/** Valid if PKT_RX_TIMESTAMP is set. The unit and time reference
* are not normalized but are always the same for a given port.
*/

How ixgbe and i40e SSE version supposed to work correctly after that change?
As I remember, both of them set vlan_tci as part of a 16B shuffle operation.
Something like that:
pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
...
_mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
pkt_mb4);

But now vlan_tci is swapped with buf_len.
Which means 2 things to me:
It is more than 16B away from rx_descriptor_fields1 and can't be updated in one go anymore,
and instead of vlan_tci we are updating buf_len.
Olivier Matz
2017-03-31 08:41:07 UTC
Permalink
Post by Ananyev, Konstantin
-----Original Message-----
From: Ananyev, Konstantin
Sent: Thursday, March 30, 2017 5:48 PM
Subject: RE: [dpdk-dev] [PATCH 0/9] mbuf: structure reorganization
-----Original Message-----
Sent: Thursday, March 30, 2017 5:45 PM
Subject: Re: [dpdk-dev] [PATCH 0/9] mbuf: structure reorganization
-----Original Message-----
From: Richardson, Bruce
Sent: Thursday, March 30, 2017 1:23 PM
Subject: Re: [dpdk-dev] [PATCH 0/9] mbuf: structure reorganization
Post by Olivier Matz
Post by Bruce Richardson
Post by Bruce Richardson
Post by Olivier Matz
Hi,
Does anyone have any other comment on this series?
Can it be applied?
Thanks,
Olivier
I assume all driver maintainers have done performance analysis to check
for regressions. Perhaps they can confirm this is the case.
/Bruce
In the absence, of anyone else reporting performance numbers with this
patchset, I ran a single-thread testpmd test using 2 x 40G ports (i40e)
driver. With RX & TX descriptor ring sizes of 512 or above, I'm seeing a
fairly noticable performance drop. I still need to dig in more, e.g. do
an RFC2544 zero-loss test, and also bisect the patchset to see what
parts may be causing the problem.
Has anyone else tried any other drivers or systems to see what the perf
impact of this set may be?
I did, of course. I didn't see any noticeable performance drop on
ixgbe (4 NICs, one port per NIC, 1 core). I can replay the test with
current version.
I had no doubt you did some perf testing! :-)
Perhaps the regression I see is limited to i40e driver. I've confirmed I
still see it with that driver in zero-loss tests, so next step is to try
and localise what change in the patchset is causing it.
Ideally, though, I think we should see acks or other comments from
driver maintainers at least confirming that they have tested. You cannot
be held responsible for testing every DPDK driver before you submit work
like this.
Unfortunately I also see a regression.
Did a quick flood test on 2.8 GHZ IVB with 4x10Gb.
Sorry, forgot to mention - it is on ixgbe.
So it doesn't look like i40e specific.
from 50.8 Mpps down to 47.8 Mpps.
[dpdk-dev,3/9] mbuf: set mbuf fields while in pool
cc version 5.3.1 20160406 (Red Hat 5.3.1-6) (GCC)
./dpdk.org-1705-mbuf1/x86_64-native-linuxapp-gcc/app/testpmd --lcores='7,8' -n 4 --socket-mem='1024,0' -w 04:00.1 -w 07:00.1 -w
0b:00.1 -w 0e:00.1 -- -i
[dpdk-dev,9/9] mbuf: reorder VLAN tci and buffer len fields
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index fd97bd3..ada98d5 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -449,8 +449,7 @@ struct rte_mbuf {
uint32_t pkt_len; /**< Total pkt len: sum of all segments. */
uint16_t data_len; /**< Amount of data in segment buffer. */
- /** VLAN TCI (CPU order), valid if PKT_RX_VLAN_STRIPPED is set. */
- uint16_t vlan_tci;
+ uint16_t buf_len; /**< Size of segment buffer. */
union {
uint32_t rss; /**< RSS hash result if RSS enabled */
@@ -475,11 +474,11 @@ struct rte_mbuf {
uint32_t usr; /**< User defined tags. See rte_distributor_process() */
} hash; /**< hash information */
+ /** VLAN TCI (CPU order), valid if PKT_RX_VLAN_STRIPPED is set. */
+ uint16_t vlan_tci;
/** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ_STRIPPED is set. */
uint16_t vlan_tci_outer;
- uint16_t buf_len; /**< Length of segment buffer. */
-
/** Valid if PKT_RX_TIMESTAMP is set. The unit and time reference
* are not normalized but are always the same for a given port.
*/
How ixgbe and i40e SSE version supposed to work correctly after that change?
As I remember both of them sets vlan_tci as part of 16B shuffle operation.
pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
...
mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
pkt_mb4);
But now vlan_tci is swapped with buf_len.
It is more than 16B away from rx_descriptor_fields1 and can't be updated in one go anymore,
and instead of vlan_tci we are updating buf_len.
Sorry, I missed it. But this shows something problematic: changing the
order of fields in a structure breaks code without notification. I think
that drivers expecting a field at a specific position should have some
BUG_ON() to check that the condition is still valid. We can't expect anyone
to know all the constraints of all vectors PMDs in DPDK.

The original idea of this patch was to group vlan_tci and vlan_outer_tci,
which looked to be a good idea at first glance. If it requires to change
all vector code, let's drop it.

Just for the exercise, let's imagine we need that patch. What would be
the procedure to have it integrated? How can we detect there is an issue?
Who would be in charge of modifying all the vector code in PMDs?


Regards,
Olivier
Ananyev, Konstantin
2017-03-31 09:58:08 UTC
Permalink
Hi Olivier,
Post by Olivier Matz
Post by Ananyev, Konstantin
-----Original Message-----
From: Ananyev, Konstantin
Sent: Thursday, March 30, 2017 5:48 PM
Subject: RE: [dpdk-dev] [PATCH 0/9] mbuf: structure reorganization
-----Original Message-----
Sent: Thursday, March 30, 2017 5:45 PM
Subject: Re: [dpdk-dev] [PATCH 0/9] mbuf: structure reorganization
-----Original Message-----
From: Richardson, Bruce
Sent: Thursday, March 30, 2017 1:23 PM
Subject: Re: [dpdk-dev] [PATCH 0/9] mbuf: structure reorganization
Post by Olivier Matz
Post by Bruce Richardson
Post by Bruce Richardson
Post by Olivier Matz
Hi,
Does anyone have any other comment on this series?
Can it be applied?
Thanks,
Olivier
I assume all driver maintainers have done performance analysis to check
for regressions. Perhaps they can confirm this is the case.
/Bruce
In the absence, of anyone else reporting performance numbers with this
patchset, I ran a single-thread testpmd test using 2 x 40G ports (i40e)
driver. With RX & TX descriptor ring sizes of 512 or above, I'm seeing a
fairly noticable performance drop. I still need to dig in more, e.g. do
an RFC2544 zero-loss test, and also bisect the patchset to see what
parts may be causing the problem.
Has anyone else tried any other drivers or systems to see what the perf
impact of this set may be?
I did, of course. I didn't see any noticeable performance drop on
ixgbe (4 NICs, one port per NIC, 1 core). I can replay the test with
current version.
I had no doubt you did some perf testing! :-)
Perhaps the regression I see is limited to i40e driver. I've confirmed I
still see it with that driver in zero-loss tests, so next step is to try
and localise what change in the patchset is causing it.
Ideally, though, I think we should see acks or other comments from
driver maintainers at least confirming that they have tested. You cannot
be held responsible for testing every DPDK driver before you submit work
like this.
Unfortunately I also see a regression.
Did a quick flood test on 2.8 GHZ IVB with 4x10Gb.
Sorry, forgot to mention - it is on ixgbe.
So it doesn't look like i40e specific.
from 50.8 Mpps down to 47.8 Mpps.
[dpdk-dev,3/9] mbuf: set mbuf fields while in pool
cc version 5.3.1 20160406 (Red Hat 5.3.1-6) (GCC)
./dpdk.org-1705-mbuf1/x86_64-native-linuxapp-gcc/app/testpmd --lcores='7,8' -n 4 --socket-mem='1024,0' -w 04:00.1 -w 07:00.1 -
w
Post by Ananyev, Konstantin
0b:00.1 -w 0e:00.1 -- -i
[dpdk-dev,9/9] mbuf: reorder VLAN tci and buffer len fields
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index fd97bd3..ada98d5 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -449,8 +449,7 @@ struct rte_mbuf {
uint32_t pkt_len; /**< Total pkt len: sum of all segments. */
uint16_t data_len; /**< Amount of data in segment buffer. */
- /** VLAN TCI (CPU order), valid if PKT_RX_VLAN_STRIPPED is set. */
- uint16_t vlan_tci;
+ uint16_t buf_len; /**< Size of segment buffer. */
union {
uint32_t rss; /**< RSS hash result if RSS enabled */
@@ -475,11 +474,11 @@ struct rte_mbuf {
uint32_t usr; /**< User defined tags. See rte_distributor_process() */
} hash; /**< hash information */
+ /** VLAN TCI (CPU order), valid if PKT_RX_VLAN_STRIPPED is set. */
+ uint16_t vlan_tci;
/** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ_STRIPPED is set. */
uint16_t vlan_tci_outer;
- uint16_t buf_len; /**< Length of segment buffer. */
-
/** Valid if PKT_RX_TIMESTAMP is set. The unit and time reference
* are not normalized but are always the same for a given port.
*/
How ixgbe and i40e SSE version supposed to work correctly after that change?
As I remember both of them sets vlan_tci as part of 16B shuffle operation.
pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
...
mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
pkt_mb4);
But now vlan_tci is swapped with buf_len.
It is more than 16B away from rx_descriptor_fields1 and can't be updated in one go anymore,
and instead of vlan_tci we are updating buf_len.
Sorry, I missed it. But this shows something problematic: changing the
order of fields in a structure breaks code without notification. I think
that drivers expecting a field at a specific position should have some
BUG_ON() to check that the condition is still valid. We can't expect anyone
to know all the constraints of all vectors PMDs in DPDK.
The original idea of this patch was to group vlan_tci and vlan_outer_tci,
which looked to be a good idea at first glance. If it requires to change
all vector code, let's drop it.
Just for the exercice, let's imagine we need that patch. What would be
the procedure to have it integrated? How can we detect there is an issue?
Who would be in charge of modifying all the vector code in PMDs?
Indeed, right now there is no way to know what the PMD requirements on mbuf layout are.
Adding BUG_ON() into the particular RX/TX implementations that have such constraints seems
like a very good idea to me.
Apart from that I don't know off-hand how we can make restructuring mbuf less painful.
Konstantin
Ananyev, Konstantin
2017-03-31 01:00:49 UTC
Permalink
Post by Ananyev, Konstantin
Post by Ananyev, Konstantin
-----Original Message-----
From: Richardson, Bruce
Sent: Thursday, March 30, 2017 1:23 PM
Subject: Re: [dpdk-dev] [PATCH 0/9] mbuf: structure reorganization
Post by Olivier Matz
Post by Bruce Richardson
Post by Bruce Richardson
Post by Olivier Matz
Hi,
Does anyone have any other comment on this series?
Can it be applied?
Thanks,
Olivier
I assume all driver maintainers have done performance analysis to check
for regressions. Perhaps they can confirm this is the case.
/Bruce
In the absence, of anyone else reporting performance numbers with this
patchset, I ran a single-thread testpmd test using 2 x 40G ports (i40e)
driver. With RX & TX descriptor ring sizes of 512 or above, I'm seeing a
fairly noticable performance drop. I still need to dig in more, e.g. do
an RFC2544 zero-loss test, and also bisect the patchset to see what
parts may be causing the problem.
Has anyone else tried any other drivers or systems to see what the perf
impact of this set may be?
I did, of course. I didn't see any noticeable performance drop on
ixgbe (4 NICs, one port per NIC, 1 core). I can replay the test with
current version.
I had no doubt you did some perf testing! :-)
Perhaps the regression I see is limited to i40e driver. I've confirmed I
still see it with that driver in zero-loss tests, so next step is to try
and localise what change in the patchset is causing it.
Ideally, though, I think we should see acks or other comments from
driver maintainers at least confirming that they have tested. You cannot
be held responsible for testing every DPDK driver before you submit work
like this.
Unfortunately I also see a regression.
Did a quick flood test on 2.8 GHZ IVB with 4x10Gb.
Sorry, forgot to mention - it is on ixgbe.
So it doesn't look like i40e specific.
Post by Ananyev, Konstantin
from 50.8 Mpps down to 47.8 Mpps.
[dpdk-dev,3/9] mbuf: set mbuf fields while in pool
cc version 5.3.1 20160406 (Red Hat 5.3.1-6) (GCC)
./dpdk.org-1705-mbuf1/x86_64-native-linuxapp-gcc/app/testpmd --lcores='7,8' -n 4 --socket-mem='1024,0' -w 04:00.1 -w 07:00.1 -w
0b:00.1 -w 0e:00.1 -- -i
After applying the patch below got nearly original numbers (though not quite) on my box.
dpdk.org mainline: 50.8
with Olivier patch: 47.8
with patch below: 50.4
What I tried to do in it - avoid unnecessary updates of mbuf inside rte_pktmbuf_prefree_seg().
For one segment per packet it seems to help.
Though so far I didn't try it on i40e and didn't do any testing for multi-seg scenario.
Konstantin

$ cat patch.mod4
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index d7af852..558233f 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -1283,12 +1283,28 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
{
__rte_mbuf_sanity_check(m, 0);

- if (likely(rte_mbuf_refcnt_update(m, -1) == 0)) {
+ if (likely(rte_mbuf_refcnt_read(m) == 1)) {
+
+ if (m->next != NULL) {
+ m->next = NULL;
+ m->nb_segs = 1;
+ }
+
+ if (RTE_MBUF_INDIRECT(m))
+ rte_pktmbuf_detach(m);
+
+ return m;
+
+ } else if (rte_atomic16_add_return(&m->refcnt_atomic, -1) == 0) {
+
if (RTE_MBUF_INDIRECT(m))
rte_pktmbuf_detach(m);

- m->next = NULL;
- m->nb_segs = 1;
+ if (m->next != NULL) {
+ m->next = NULL;
+ m->nb_segs = 1;
+ }
+
rte_mbuf_refcnt_set(m, 1);

return m;
Morten Brørup
2017-03-31 07:21:39 UTC
Permalink
-----Original Message-----
Konstantin
Sent: Friday, March 31, 2017 3:01 AM
After applying the patch below got nearly original numbers (though not quite) on my box.
dpdk.org mainline: 50.8
with Olivier patch: 47.8
with patch below: 50.4
What I tried to do in it - avoid unnecessary updates of mbuf inside
rte_pktmbuf_prefree_seg().
For one segment per packet it seems to help.
Though so far I didn't try it on i40e and didn't do any testing for multi-seg scenario.
Konstantin
$ cat patch.mod4
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index d7af852..558233f 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -1283,12 +1283,28 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m) {
__rte_mbuf_sanity_check(m, 0);
- if (likely(rte_mbuf_refcnt_update(m, -1) == 0)) {
+ if (likely(rte_mbuf_refcnt_read(m) == 1)) {
+
+ if (m->next != NULL) {
+ m->next = NULL;
+ m->nb_segs = 1;
+ }
+
+ if (RTE_MBUF_INDIRECT(m))
+ rte_pktmbuf_detach(m);
+
+ return m;
+
+ } else if (rte_atomic16_add_return(&m->refcnt_atomic, -1) == 0)
+ {
+
if (RTE_MBUF_INDIRECT(m))
rte_pktmbuf_detach(m);
- m->next = NULL;
- m->nb_segs = 1;
+ if (m->next != NULL) {
+ m->next = NULL;
+ m->nb_segs = 1;
+ }
+
rte_mbuf_refcnt_set(m, 1);
return m;
Maybe the access to the second cache line (for single-segment packets) can be avoided altogether in rte_pktmbuf_prefree_seg() by adding a multi-segment indication flag to the first cache line, and using this flag instead of the test for m->next != NULL.

Med venlig hilsen,
Olivier Matz
2017-03-31 08:26:10 UTC
Permalink
Hi,
Post by Ananyev, Konstantin
Post by Ananyev, Konstantin
Post by Ananyev, Konstantin
-----Original Message-----
From: Richardson, Bruce
Sent: Thursday, March 30, 2017 1:23 PM
Subject: Re: [dpdk-dev] [PATCH 0/9] mbuf: structure reorganization
Post by Olivier Matz
Post by Bruce Richardson
Post by Bruce Richardson
Post by Olivier Matz
Hi,
Does anyone have any other comment on this series?
Can it be applied?
Thanks,
Olivier
I assume all driver maintainers have done performance analysis to check
for regressions. Perhaps they can confirm this is the case.
/Bruce
In the absence, of anyone else reporting performance numbers with this
patchset, I ran a single-thread testpmd test using 2 x 40G ports (i40e)
driver. With RX & TX descriptor ring sizes of 512 or above, I'm seeing a
fairly noticable performance drop. I still need to dig in more, e.g. do
an RFC2544 zero-loss test, and also bisect the patchset to see what
parts may be causing the problem.
Has anyone else tried any other drivers or systems to see what the perf
impact of this set may be?
I did, of course. I didn't see any noticeable performance drop on
ixgbe (4 NICs, one port per NIC, 1 core). I can replay the test with
current version.
I had no doubt you did some perf testing! :-)
Perhaps the regression I see is limited to i40e driver. I've confirmed I
still see it with that driver in zero-loss tests, so next step is to try
and localise what change in the patchset is causing it.
Ideally, though, I think we should see acks or other comments from
driver maintainers at least confirming that they have tested. You cannot
be held responsible for testing every DPDK driver before you submit work
like this.
Unfortunately I also see a regression.
Did a quick flood test on 2.8 GHZ IVB with 4x10Gb.
Sorry, forgot to mention - it is on ixgbe.
So it doesn't look like i40e specific.
Post by Ananyev, Konstantin
from 50.8 Mpps down to 47.8 Mpps.
[dpdk-dev,3/9] mbuf: set mbuf fields while in pool
cc version 5.3.1 20160406 (Red Hat 5.3.1-6) (GCC)
./dpdk.org-1705-mbuf1/x86_64-native-linuxapp-gcc/app/testpmd --lcores='7,8' -n 4 --socket-mem='1024,0' -w 04:00.1 -w 07:00.1 -w
0b:00.1 -w 0e:00.1 -- -i
After applying the patch below got nearly original numbers (though not quite) on my box.
dpdk.org mainline: 50.8
with Olivier patch: 47.8
with patch below: 50.4
What I tried to do in it - avoid unnecessary updates of mbuf inside rte_pktmbuf_prefree_seg().
For one segment per packet it seems to help.
Though so far I didn't try it on i40e and didn't do any testing for multi-seg scenario.
Konstantin
I replayed my tests, and I can also see a performance loss with 1c/1t
(ixgbe), not in the same magnitude however. Here is what I have in MPPS:

1c/1t 1c/2t
53.3 58.7 current
52.1 58.8 original patchset
53.3 58.8 removed patches 3 and 9
53.1 58.7 with konstantin's patch

So we have 2 options here:

1/ integrate Konstantin's patch in the patchset (thank you, by the way)
2/ remove patch 3, and keep it for later until we have something that
really has no impact

I'd prefer 1/, knowing that the difference is really small in terms
of cycles per packet.


Regards,
Olivier
Bruce Richardson
2017-03-31 08:41:39 UTC
Permalink
Post by Olivier Matz
Hi,
Post by Ananyev, Konstantin
Post by Ananyev, Konstantin
Post by Ananyev, Konstantin
-----Original Message-----
From: Richardson, Bruce
Sent: Thursday, March 30, 2017 1:23 PM
Subject: Re: [dpdk-dev] [PATCH 0/9] mbuf: structure reorganization
Post by Olivier Matz
Post by Bruce Richardson
Post by Bruce Richardson
Post by Olivier Matz
Hi,
Does anyone have any other comment on this series?
Can it be applied?
Thanks,
Olivier
I assume all driver maintainers have done performance analysis to check
for regressions. Perhaps they can confirm this is the case.
/Bruce
In the absence, of anyone else reporting performance numbers with this
patchset, I ran a single-thread testpmd test using 2 x 40G ports (i40e)
driver. With RX & TX descriptor ring sizes of 512 or above, I'm seeing a
fairly noticable performance drop. I still need to dig in more, e.g. do
an RFC2544 zero-loss test, and also bisect the patchset to see what
parts may be causing the problem.
Has anyone else tried any other drivers or systems to see what the perf
impact of this set may be?
I did, of course. I didn't see any noticeable performance drop on
ixgbe (4 NICs, one port per NIC, 1 core). I can replay the test with
current version.
I had no doubt you did some perf testing! :-)
Perhaps the regression I see is limited to i40e driver. I've confirmed I
still see it with that driver in zero-loss tests, so next step is to try
and localise what change in the patchset is causing it.
Ideally, though, I think we should see acks or other comments from
driver maintainers at least confirming that they have tested. You cannot
be held responsible for testing every DPDK driver before you submit work
like this.
Unfortunately I also see a regression.
Did a quick flood test on 2.8 GHZ IVB with 4x10Gb.
Sorry, forgot to mention - it is on ixgbe.
So it doesn't look like i40e specific.
Post by Ananyev, Konstantin
from 50.8 Mpps down to 47.8 Mpps.
[dpdk-dev,3/9] mbuf: set mbuf fields while in pool
cc version 5.3.1 20160406 (Red Hat 5.3.1-6) (GCC)
./dpdk.org-1705-mbuf1/x86_64-native-linuxapp-gcc/app/testpmd --lcores='7,8' -n 4 --socket-mem='1024,0' -w 04:00.1 -w 07:00.1 -w
0b:00.1 -w 0e:00.1 -- -i
After applying the patch below got nearly original numbers (though not quite) on my box.
dpdk.org mainline: 50.8
with Olivier patch: 47.8
with patch below: 50.4
What I tried to do in it - avoid unnecessary updates of mbuf inside rte_pktmbuf_prefree_seg().
For one segment per packet it seems to help.
Though so far I didn't try it on i40e and didn't do any testing for multi-seg scenario.
Konstantin
I replayed my tests, and I can also see a performance loss with 1c/1t
1c/1t 1c/2t
53.3 58.7 current
52.1 58.8 original patchset
53.3 58.8 removed patches 3 and 9
53.1 58.7 with konstantin's patch
1/ integrate Konstantin's patch in the patchset (thank you, by the way)
2/ remove patch 3, and keep it for later until we have something that
really no impact
I'd prefer 1/, knowing that the difference is really small in terms
of cycles per packet.
1 is certainly the more attractive option. However, I think we can
afford to spend a little more time looking at this before we decide.
I'll try and check out the perf numbers I get with i40e with
Konstantin's patch today. We also need to double check the other
possible issues he reported in his other emails. While I don't want this
patchset held up for a long time, I think an extra 24/48 hours is
probably needed on it.

/Bruce
Olivier Matz
2017-03-31 08:59:25 UTC
Permalink
Post by Bruce Richardson
Post by Olivier Matz
Hi,
Post by Ananyev, Konstantin
Post by Ananyev, Konstantin
Post by Ananyev, Konstantin
-----Original Message-----
From: Richardson, Bruce
Sent: Thursday, March 30, 2017 1:23 PM
Subject: Re: [dpdk-dev] [PATCH 0/9] mbuf: structure reorganization
Post by Olivier Matz
Post by Bruce Richardson
Post by Bruce Richardson
Post by Olivier Matz
Hi,
Does anyone have any other comment on this series?
Can it be applied?
Thanks,
Olivier
I assume all driver maintainers have done performance analysis to check
for regressions. Perhaps they can confirm this is the case.
/Bruce
In the absence, of anyone else reporting performance numbers with this
patchset, I ran a single-thread testpmd test using 2 x 40G ports (i40e)
driver. With RX & TX descriptor ring sizes of 512 or above, I'm seeing a
fairly noticable performance drop. I still need to dig in more, e.g. do
an RFC2544 zero-loss test, and also bisect the patchset to see what
parts may be causing the problem.
Has anyone else tried any other drivers or systems to see what the perf
impact of this set may be?
I did, of course. I didn't see any noticeable performance drop on
ixgbe (4 NICs, one port per NIC, 1 core). I can replay the test with
current version.
I had no doubt you did some perf testing! :-)
Perhaps the regression I see is limited to i40e driver. I've confirmed I
still see it with that driver in zero-loss tests, so next step is to try
and localise what change in the patchset is causing it.
Ideally, though, I think we should see acks or other comments from
driver maintainers at least confirming that they have tested. You cannot
be held responsible for testing every DPDK driver before you submit work
like this.
Unfortunately I also see a regression.
Did a quick flood test on 2.8 GHZ IVB with 4x10Gb.
Sorry, forgot to mention - it is on ixgbe.
So it doesn't look like i40e specific.
Post by Ananyev, Konstantin
from 50.8 Mpps down to 47.8 Mpps.
[dpdk-dev,3/9] mbuf: set mbuf fields while in pool
cc version 5.3.1 20160406 (Red Hat 5.3.1-6) (GCC)
./dpdk.org-1705-mbuf1/x86_64-native-linuxapp-gcc/app/testpmd --lcores='7,8' -n 4 --socket-mem='1024,0' -w 04:00.1 -w 07:00.1 -w
0b:00.1 -w 0e:00.1 -- -i
After applying the patch below got nearly original numbers (though not quite) on my box.
dpdk.org mainline: 50.8
with Olivier patch: 47.8
with patch below: 50.4
What I tried to do in it - avoid unnecessary updates of mbuf inside rte_pktmbuf_prefree_seg().
For one segment per packet it seems to help.
Though so far I didn't try it on i40e and didn't do any testing for multi-seg scenario.
Konstantin
I replayed my tests, and I can also see a performance loss with 1c/1t
1c/1t 1c/2t
53.3 58.7 current
52.1 58.8 original patchset
53.3 58.8 removed patches 3 and 9
53.1 58.7 with konstantin's patch
1/ integrate Konstantin's patch in the patchset (thank you, by the way)
2/ remove patch 3, and keep it for later until we have something that
really no impact
I'd prefer 1/, knowing that the difference is really small in terms
of cycles per packet.
1 is certainly the more attractive option. However, I think we can
afford to spend a little more time looking at this before we decide.
I'll try and check out the perf numbers I get with i40e with
Konstantin's patch today. We also need to double check the other
possible issues he reported in his other emails. While I don't want this
patchset held up for a long time, I think an extra 24/48 hours is
probably needed on it.
Yes, now that we have the "test momentum", try not to lose it ;)

I'm guilty to have missed the performance loss, but honestly,
I'm a bit sad that nobody tried this patchset before (it
is available for more than 2 months), knowing this is probably one of
the most critical part of dpdk. I think we need to be better next
time.

Anyway, thank you for your test and feedback now.

Olivier
Ananyev, Konstantin
2017-03-31 09:18:22 UTC
Permalink
Hi guys,
Post by Ananyev, Konstantin
Post by Bruce Richardson
Post by Olivier Matz
Hi,
Post by Ananyev, Konstantin
Post by Ananyev, Konstantin
Post by Ananyev, Konstantin
-----Original Message-----
From: Richardson, Bruce
Sent: Thursday, March 30, 2017 1:23 PM
Subject: Re: [dpdk-dev] [PATCH 0/9] mbuf: structure reorganization
Post by Olivier Matz
Post by Bruce Richardson
Post by Bruce Richardson
Post by Olivier Matz
Hi,
Does anyone have any other comment on this series?
Can it be applied?
Thanks,
Olivier
I assume all driver maintainers have done performance analysis to check
for regressions. Perhaps they can confirm this is the case.
/Bruce
In the absence, of anyone else reporting performance numbers with this
patchset, I ran a single-thread testpmd test using 2 x 40G ports (i40e)
driver. With RX & TX descriptor ring sizes of 512 or above, I'm seeing a
fairly noticable performance drop. I still need to dig in more, e.g. do
an RFC2544 zero-loss test, and also bisect the patchset to see what
parts may be causing the problem.
Has anyone else tried any other drivers or systems to see what the perf
impact of this set may be?
I did, of course. I didn't see any noticeable performance drop on
ixgbe (4 NICs, one port per NIC, 1 core). I can replay the test with
current version.
I had no doubt you did some perf testing! :-)
Perhaps the regression I see is limited to i40e driver. I've confirmed I
still see it with that driver in zero-loss tests, so next step is to try
and localise what change in the patchset is causing it.
Ideally, though, I think we should see acks or other comments from
driver maintainers at least confirming that they have tested. You cannot
be held responsible for testing every DPDK driver before you submit work
like this.
Unfortunately I also see a regression.
Did a quick flood test on 2.8 GHZ IVB with 4x10Gb.
Sorry, forgot to mention - it is on ixgbe.
So it doesn't look like i40e specific.
Post by Ananyev, Konstantin
from 50.8 Mpps down to 47.8 Mpps.
[dpdk-dev,3/9] mbuf: set mbuf fields while in pool
cc version 5.3.1 20160406 (Red Hat 5.3.1-6) (GCC)
./dpdk.org-1705-mbuf1/x86_64-native-linuxapp-gcc/app/testpmd --lcores='7,8' -n 4 --socket-mem='1024,0' -w 04:00.1 -w
07:00.1 -w
Post by Bruce Richardson
Post by Olivier Matz
Post by Ananyev, Konstantin
Post by Ananyev, Konstantin
Post by Ananyev, Konstantin
0b:00.1 -w 0e:00.1 -- -i
After applying the patch below got nearly original numbers (though not quite) on my box.
dpdk.org mainline: 50.8
with Olivier patch: 47.8
with patch below: 50.4
What I tried to do in it - avoid unnecessary updates of mbuf inside rte_pktmbuf_prefree_seg().
For one segment per packet it seems to help.
Though so far I didn't try it on i40e and didn't do any testing for multi-seg scenario.
Konstantin
I replayed my tests, and I can also see a performance loss with 1c/1t
1c/1t 1c/2t
53.3 58.7 current
52.1 58.8 original patchset
53.3 58.8 removed patches 3 and 9
53.1 58.7 with konstantin's patch
1/ integrate Konstantin's patch in the patchset (thank you, by the way)
2/ remove patch 3, and keep it for later until we have something that
really no impact
I'd prefer 1/, knowing that the difference is really small in terms
of cycles per packet.
1 is certainly the more attractive option. However, I think we can
afford to spend a little more time looking at this before we decide.
I'll try and check out the perf numbers I get with i40e with
Konstantin's patch today. We also need to double check the other
possible issues he reported in his other emails. While I don't want this
patchset held up for a long time, I think an extra 24/48 hours is
probably needed on it.
Yes, now that we have the "test momentum", try not to loose it ;)
I'm guilty to have missed the performance loss, but honnestly,
I'm a bit sad that nobody tried to this patchset before (it
is available for more than 2 months), knowing this is probably one of
the most critical part of dpdk. I think we need to be better next
time.
Anyway, thank you for your test and feedback now.
I am also leaning towards option 1, but agree that some extra testing first
needs to be done before making the final decision.
BTW, patch #9 needs to be removed anyway, even if we go for option #1.
Konstantin
Olivier Matz
2017-03-31 09:36:15 UTC
Permalink
Post by Ananyev, Konstantin
BTW, path #9 need to be removed anyway, even if will go for path #1.
Yes
Bruce Richardson
2017-03-31 09:23:02 UTC
Permalink
Post by Olivier Matz
Post by Bruce Richardson
Post by Olivier Matz
Hi,
Post by Ananyev, Konstantin
Post by Ananyev, Konstantin
Post by Ananyev, Konstantin
-----Original Message-----
From: Richardson, Bruce
Sent: Thursday, March 30, 2017 1:23 PM
Subject: Re: [dpdk-dev] [PATCH 0/9] mbuf: structure reorganization
Post by Olivier Matz
Post by Bruce Richardson
Post by Bruce Richardson
Post by Olivier Matz
Hi,
Does anyone have any other comment on this series?
Can it be applied?
Thanks,
Olivier
I assume all driver maintainers have done performance analysis to check
for regressions. Perhaps they can confirm this is the case.
/Bruce
In the absence, of anyone else reporting performance numbers with this
patchset, I ran a single-thread testpmd test using 2 x 40G ports (i40e)
driver. With RX & TX descriptor ring sizes of 512 or above, I'm seeing a
fairly noticable performance drop. I still need to dig in more, e.g. do
an RFC2544 zero-loss test, and also bisect the patchset to see what
parts may be causing the problem.
Has anyone else tried any other drivers or systems to see what the perf
impact of this set may be?
I did, of course. I didn't see any noticeable performance drop on
ixgbe (4 NICs, one port per NIC, 1 core). I can replay the test with
current version.
I had no doubt you did some perf testing! :-)
Perhaps the regression I see is limited to i40e driver. I've confirmed I
still see it with that driver in zero-loss tests, so next step is to try
and localise what change in the patchset is causing it.
Ideally, though, I think we should see acks or other comments from
driver maintainers at least confirming that they have tested. You cannot
be held responsible for testing every DPDK driver before you submit work
like this.
Unfortunately I also see a regression.
Did a quick flood test on 2.8 GHZ IVB with 4x10Gb.
Sorry, forgot to mention - it is on ixgbe.
So it doesn't look like i40e specific.
Post by Ananyev, Konstantin
from 50.8 Mpps down to 47.8 Mpps.
[dpdk-dev,3/9] mbuf: set mbuf fields while in pool
cc version 5.3.1 20160406 (Red Hat 5.3.1-6) (GCC)
./dpdk.org-1705-mbuf1/x86_64-native-linuxapp-gcc/app/testpmd --lcores='7,8' -n 4 --socket-mem='1024,0' -w 04:00.1 -w 07:00.1 -w
0b:00.1 -w 0e:00.1 -- -i
After applying the patch below got nearly original numbers (though not quite) on my box.
dpdk.org mainline: 50.8
with Olivier patch: 47.8
with patch below: 50.4
What I tried to do in it - avoid unnecessary updates of mbuf inside rte_pktmbuf_prefree_seg().
For one segment per packet it seems to help.
Though so far I didn't try it on i40e and didn't do any testing for multi-seg scenario.
Konstantin
I replayed my tests, and I can also see a performance loss with 1c/1t
1c/1t 1c/2t
53.3 58.7 current
52.1 58.8 original patchset
53.3 58.8 removed patches 3 and 9
53.1 58.7 with konstantin's patch
1/ integrate Konstantin's patch in the patchset (thank you, by the way)
2/ remove patch 3, and keep it for later until we have something that
really no impact
I'd prefer 1/, knowing that the difference is really small in terms
of cycles per packet.
1 is certainly the more attractive option. However, I think we can
afford to spend a little more time looking at this before we decide.
I'll try and check out the perf numbers I get with i40e with
Konstantin's patch today. We also need to double check the other
possible issues he reported in his other emails. While I don't want this
patchset held up for a long time, I think an extra 24/48 hours is
probably needed on it.
Yes, now that we have the "test momentum", try not to loose it ;)
I'm guilty to have missed the performance loss, but honnestly,
I'm a bit sad that nobody tried to this patchset before (it
is available for more than 2 months), knowing this is probably one of
the most critical part of dpdk. I think we need to be better next
time.
No disagreement here.
Nélio Laranjeiro
2017-03-31 11:18:51 UTC
Permalink
Post by Olivier Matz
Hi,
Does anyone have any other comment on this series?
Can it be applied?
Thanks,
Olivier
Post by Olivier Matz
Based on discussions done in [1] and in this thread, this patchset reorganizes
the mbuf.
- reorder structure to increase vector performance on some non-ia
platforms.
- add a 64bits timestamp field in the 1st cache line. This timestamp
is not normalized, i.e. no unit or time reference is enforced. A
library may be added to do this job in the future.
- m->next, m->nb_segs, and m->refcnt are always initialized for mbufs
in the pool, avoiding the need of setting m->next (located in the
2nd cache line) in the Rx path for mono-segment packets.
- change port and nb_segs to 16 bits
- move seqn in the 2nd cache line
- move refcnt and nb_segs to the 2nd cache line: many drivers sets
them in the Rx path, so it could introduce a performance regression, or
it would require to change all the drivers, which is not an easy task.
- remove the m->port field: too much impact on many examples and libraries,
and some people highlighted they are using it.
- moving m->next in the 1st cache line: there is not enough room, and having
it set to NULL for unused mbuf should remove the need for it.
- merge seqn and timestamp together in a union: we could imagine use cases
were both are activated. There is no flag indicating the presence of seqn,
so it looks preferable to keep them separated for now.
I made some basic performance tests (ixgbe) and see no regression.
Other tests from NIC vendors are welcome.
Once this patchset is pushed, the Rx path of drivers could be optimized a bit,
by removing writes to m->next, m->nb_segs and m->refcnt. The patch 4/8 gives an
idea of what could be done.
[1] http://dpdk.org/ml/archives/dev/2016-October/049338.html
- fix reset of mbuf fields in case of indirect mbuf in rte_pktmbuf_prefree_seg()
- do not enforce a unit or time reference for m->timestamp
- reorganize fields to make vlan and outer vlan consecutive
- enhance documentation of m->refcnt and m->port to explain why they are 16bits
mbuf: make rearm data address naturally aligned
mbuf: make segment prefree function public
mbuf: make raw free function public
mbuf: set mbuf fields while in pool
drivers/net: don't touch mbuf next or nb segs on Rx
mbuf: use 2 bytes for port and nb segments
mbuf: move sequence number in second cache line
mbuf: add a timestamp field
mbuf: reorder VLAN tci and buffer len fields
app/test-pmd/csumonly.c | 4 +-
drivers/net/ena/ena_ethdev.c | 2 +-
drivers/net/enic/enic_rxtx.c | 2 +-
drivers/net/fm10k/fm10k_rxtx.c | 6 +-
drivers/net/fm10k/fm10k_rxtx_vec.c | 9 +-
drivers/net/i40e/i40e_rxtx_vec_common.h | 6 +-
drivers/net/i40e/i40e_rxtx_vec_sse.c | 11 +-
drivers/net/ixgbe/ixgbe_rxtx.c | 10 +-
drivers/net/ixgbe/ixgbe_rxtx_vec_common.h | 6 +-
drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 9 --
drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c | 9 --
drivers/net/mlx5/mlx5_rxtx.c | 11 +-
drivers/net/mpipe/mpipe_tilegx.c | 3 +-
drivers/net/null/rte_eth_null.c | 2 -
drivers/net/virtio/virtio_rxtx.c | 4 -
drivers/net/virtio/virtio_rxtx_simple.h | 6 +-
.../linuxapp/eal/include/exec-env/rte_kni_common.h | 5 +-
lib/librte_mbuf/rte_mbuf.c | 4 +
lib/librte_mbuf/rte_mbuf.h | 123 ++++++++++++++++-----
19 files changed, 130 insertions(+), 102 deletions(-)
Tested-by: Nelio Laranjeiro <***@6wind.com>

with mlx5 ConnectX-4 two ports with a single thread IO forwarding.

Olivier patches: increase performance by +0.4Mpps.
Olivier + Konstantin patches: increase performance by +0.8Mpps.

Regards,
--
Nélio Laranjeiro
6WIND
Andrew Rybchenko
2017-03-30 14:54:13 UTC
Permalink
Post by Olivier Matz
Based on discussions done in [1] and in this thread, this patchset reorganizes
the mbuf.
- reorder structure to increase vector performance on some non-ia
platforms.
- add a 64bits timestamp field in the 1st cache line. This timestamp
is not normalized, i.e. no unit or time reference is enforced. A
library may be added to do this job in the future.
- m->next, m->nb_segs, and m->refcnt are always initialized for mbufs
in the pool, avoiding the need of setting m->next (located in the
2nd cache line) in the Rx path for mono-segment packets.
- change port and nb_segs to 16 bits
- move seqn in the 2nd cache line
- move refcnt and nb_segs to the 2nd cache line: many drivers sets
them in the Rx path, so it could introduce a performance regression, or
it would require to change all the drivers, which is not an easy task.
- remove the m->port field: too much impact on many examples and libraries,
and some people highlighted they are using it.
- moving m->next in the 1st cache line: there is not enough room, and having
it set to NULL for unused mbuf should remove the need for it.
- merge seqn and timestamp together in a union: we could imagine use cases
were both are activated. There is no flag indicating the presence of seqn,
so it looks preferable to keep them separated for now.
I made some basic performance tests (ixgbe) and see no regression.
Other tests from NIC vendors are welcome.
Once this patchset is pushed, the Rx path of drivers could be optimized a bit,
by removing writes to m->next, m->nb_segs and m->refcnt. The patch 4/8 gives an
idea of what could be done.
[1] http://dpdk.org/ml/archives/dev/2016-October/049338.html
- fix reset of mbuf fields in case of indirect mbuf in rte_pktmbuf_prefree_seg()
- do not enforce a unit or time reference for m->timestamp
- reorganize fields to make vlan and outer vlan consecutive
- enhance documentation of m->refcnt and m->port to explain why they are 16bits
mbuf: make rearm data address naturally aligned
mbuf: make segment prefree function public
mbuf: make raw free function public
mbuf: set mbuf fields while in pool
drivers/net: don't touch mbuf next or nb segs on Rx
mbuf: use 2 bytes for port and nb segments
mbuf: move sequence number in second cache line
mbuf: add a timestamp field
mbuf: reorder VLAN tci and buffer len fields
app/test-pmd/csumonly.c | 4 +-
drivers/net/ena/ena_ethdev.c | 2 +-
drivers/net/enic/enic_rxtx.c | 2 +-
drivers/net/fm10k/fm10k_rxtx.c | 6 +-
drivers/net/fm10k/fm10k_rxtx_vec.c | 9 +-
drivers/net/i40e/i40e_rxtx_vec_common.h | 6 +-
drivers/net/i40e/i40e_rxtx_vec_sse.c | 11 +-
drivers/net/ixgbe/ixgbe_rxtx.c | 10 +-
drivers/net/ixgbe/ixgbe_rxtx_vec_common.h | 6 +-
drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 9 --
drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c | 9 --
drivers/net/mlx5/mlx5_rxtx.c | 11 +-
drivers/net/mpipe/mpipe_tilegx.c | 3 +-
drivers/net/null/rte_eth_null.c | 2 -
drivers/net/virtio/virtio_rxtx.c | 4 -
drivers/net/virtio/virtio_rxtx_simple.h | 6 +-
.../linuxapp/eal/include/exec-env/rte_kni_common.h | 5 +-
lib/librte_mbuf/rte_mbuf.c | 4 +
lib/librte_mbuf/rte_mbuf.h | 123 ++++++++++++++++-----
19 files changed, 130 insertions(+), 102 deletions(-)
I see better performance with the patch series applied and next=NULL
assignments removed from net/sfc (waiting for the series applied to submit
corresponding patches). So the series:

Acked-by: Andrew Rybchenko <***@solarflare.com>
Jerin Jacob
2017-03-30 15:12:18 UTC
Permalink
Post by Olivier Matz
Based on discussions done in [1] and in this thread, this patchset reorganizes
the mbuf.
- reorder structure to increase vector performance on some non-ia
platforms.
- add a 64bits timestamp field in the 1st cache line. This timestamp
is not normalized, i.e. no unit or time reference is enforced. A
library may be added to do this job in the future.
- m->next, m->nb_segs, and m->refcnt are always initialized for mbufs
in the pool, avoiding the need of setting m->next (located in the
2nd cache line) in the Rx path for mono-segment packets.
- change port and nb_segs to 16 bits
- move seqn in the 2nd cache line
- move refcnt and nb_segs to the 2nd cache line: many drivers sets
them in the Rx path, so it could introduce a performance regression, or
it would require to change all the drivers, which is not an easy task.
- remove the m->port field: too much impact on many examples and libraries,
and some people highlighted they are using it.
- moving m->next in the 1st cache line: there is not enough room, and having
it set to NULL for unused mbuf should remove the need for it.
- merge seqn and timestamp together in a union: we could imagine use cases
were both are activated. There is no flag indicating the presence of seqn,
so it looks preferable to keep them separated for now.
I made some basic performance tests (ixgbe) and see no regression.
Other tests from NIC vendors are welcome.
Once this patchset is pushed, the Rx path of drivers could be optimized a bit,
by removing writes to m->next, m->nb_segs and m->refcnt. The patch 4/8 gives an
idea of what could be done.
[1] http://dpdk.org/ml/archives/dev/2016-October/049338.html
- fix reset of mbuf fields in case of indirect mbuf in rte_pktmbuf_prefree_seg()
- do not enforce a unit or time reference for m->timestamp
- reorganize fields to make vlan and outer vlan consecutive
- enhance documentation of m->refcnt and m->port to explain why they are 16bits
mbuf: make rearm data address naturally aligned
mbuf: make segment prefree function public
mbuf: make raw free function public
mbuf: set mbuf fields while in pool
drivers/net: don't touch mbuf next or nb segs on Rx
mbuf: use 2 bytes for port and nb segments
mbuf: move sequence number in second cache line
mbuf: add a timestamp field
mbuf: reorder VLAN tci and buffer len fields
app/test-pmd/csumonly.c | 4 +-
drivers/net/ena/ena_ethdev.c | 2 +-
drivers/net/enic/enic_rxtx.c | 2 +-
drivers/net/fm10k/fm10k_rxtx.c | 6 +-
drivers/net/fm10k/fm10k_rxtx_vec.c | 9 +-
drivers/net/i40e/i40e_rxtx_vec_common.h | 6 +-
drivers/net/i40e/i40e_rxtx_vec_sse.c | 11 +-
drivers/net/ixgbe/ixgbe_rxtx.c | 10 +-
drivers/net/ixgbe/ixgbe_rxtx_vec_common.h | 6 +-
drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 9 --
drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c | 9 --
drivers/net/mlx5/mlx5_rxtx.c | 11 +-
drivers/net/mpipe/mpipe_tilegx.c | 3 +-
drivers/net/null/rte_eth_null.c | 2 -
drivers/net/virtio/virtio_rxtx.c | 4 -
drivers/net/virtio/virtio_rxtx_simple.h | 6 +-
.../linuxapp/eal/include/exec-env/rte_kni_common.h | 5 +-
lib/librte_mbuf/rte_mbuf.c | 4 +
lib/librte_mbuf/rte_mbuf.h | 123 ++++++++++++++++-----
19 files changed, 130 insertions(+), 102 deletions(-)
No performance regression on this series on arm64 + thunderx PMD combo.
Post by Olivier Matz
--
2.8.1
Loading...