Discussion:
[dpdk-dev] [PATCH 0/4] Virtio NEON support for ARM
(too old to reply)
Jerin Jacob
2016-06-27 11:54:04 UTC
Permalink
This patchset includes,

1) General cleanup on compile time dependency of use_simple_rxtx with RTE_MACHINE_CPUFLAG_SSSE3
2) Added NEON support for optimized Rx handling

This patchset is based on dpdk-next-virtio/master at a1d8bd4911b28e32c35f16ab2ff3e22180d1f1d7

Jerin Jacob (4):
virtio: Fix compile time dependency of use_simple_rxtx usage
virtio: introduce RTE_LIBRTE_VIRTIO_INC_VECTOR
virtio: move SSE based Rx implementation to separate file
virtio: add neon support

MAINTAINERS | 1 +
config/common_base | 1 +
config/defconfig_arm-armv7a-linuxapp-gcc | 1 +
config/defconfig_ppc_64-power8-linuxapp-gcc | 1 +
config/defconfig_tile-tilegx-linuxapp-gcc | 1 +
doc/guides/rel_notes/release_16_07.rst | 2 +
drivers/net/virtio/Makefile | 3 -
drivers/net/virtio/virtio_pci.h | 1 +
drivers/net/virtio/virtio_rxtx.c | 32 ++--
drivers/net/virtio/virtio_rxtx.h | 3 +-
drivers/net/virtio/virtio_rxtx_simple.c | 168 +------------------
drivers/net/virtio/virtio_rxtx_simple_neon.h | 238 +++++++++++++++++++++++++++
drivers/net/virtio/virtio_rxtx_simple_sse.h | 225 +++++++++++++++++++++++++
drivers/net/virtio/virtio_user_ethdev.c | 1 +
14 files changed, 490 insertions(+), 188 deletions(-)
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.h
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.h
--
2.5.5
Jerin Jacob
2016-06-27 11:54:05 UTC
Permalink
Removed unnecessary compile time dependency on "use_simple_rxtx".

Signed-off-by: Jerin Jacob <***@caviumnetworks.com>
---
drivers/net/virtio/Makefile | 3 ---
drivers/net/virtio/virtio_pci.h | 1 +
drivers/net/virtio/virtio_rxtx.c | 28 +++++++++-------------------
drivers/net/virtio/virtio_rxtx.h | 3 +--
drivers/net/virtio/virtio_rxtx_simple.c | 8 ++++++--
drivers/net/virtio/virtio_user_ethdev.c | 1 +
6 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
index 43de46c..114d40e 100644
--- a/drivers/net/virtio/Makefile
+++ b/drivers/net/virtio/Makefile
@@ -50,10 +50,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtqueue.c
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_pci.c
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx.c
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_ethdev.c
-
-ifeq ($(findstring RTE_MACHINE_CPUFLAG_SSSE3,$(CFLAGS)),RTE_MACHINE_CPUFLAG_SSSE3)
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple.c
-endif

ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c
diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h
index dd7693f..b8295a7 100644
--- a/drivers/net/virtio/virtio_pci.h
+++ b/drivers/net/virtio/virtio_pci.h
@@ -253,6 +253,7 @@ struct virtio_hw {
uint8_t use_msix;
uint8_t started;
uint8_t modern;
+ uint8_t use_simple_rxtx;
uint8_t mac_addr[ETHER_ADDR_LEN];
uint32_t notify_off_multiplier;
uint8_t *isr;
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index a27208e..63b53f7 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -67,10 +67,6 @@
#define VIRTIO_SIMPLE_FLAGS ((uint32_t)ETH_TXQ_FLAGS_NOMULTSEGS | \
ETH_TXQ_FLAGS_NOOFFLOADS)

-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-static int use_simple_rxtx;
-#endif
-
static void
vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx)
{
@@ -333,6 +329,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
*/
uint16_t i;
uint16_t desc_idx;
+ struct virtio_hw *hw = dev->data->dev_private;

PMD_INIT_FUNC_TRACE();

@@ -353,8 +350,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
nbufs = 0;
error = ENOSPC;

-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- if (use_simple_rxtx) {
+ if (hw->use_simple_rxtx) {
for (desc_idx = 0; desc_idx < vq->vq_nentries;
desc_idx++) {
vq->vq_ring.avail->ring[desc_idx] = desc_idx;
@@ -362,7 +358,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
VRING_DESC_F_WRITE;
}
}
-#endif
+
memset(&rxvq->fake_mbuf, 0, sizeof(rxvq->fake_mbuf));
for (desc_idx = 0; desc_idx < RTE_PMD_VIRTIO_RX_MAX_BURST;
desc_idx++) {
@@ -378,12 +374,11 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
/******************************************
* Enqueue allocated buffers *
*******************************************/
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- if (use_simple_rxtx)
+ if (hw->use_simple_rxtx)
error = virtqueue_enqueue_recv_refill_simple(vq, m);
else
-#endif
error = virtqueue_enqueue_recv_refill(vq, m);
+
if (error) {
rte_pktmbuf_free(m);
break;
@@ -404,8 +399,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
struct virtqueue *vq = txvq->vq;

virtio_dev_vring_start(vq);
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- if (use_simple_rxtx) {
+ if (hw->use_simple_rxtx) {
uint16_t mid_idx = vq->vq_nentries >> 1;

for (desc_idx = 0; desc_idx < mid_idx; desc_idx++) {
@@ -426,7 +420,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
desc_idx++)
vq->vq_ring.avail->ring[desc_idx] = desc_idx;
}
-#endif
+
VIRTQUEUE_DUMP(vq);
}
}
@@ -456,9 +450,7 @@ virtio_dev_rx_queue_setup(struct rte_eth_dev *dev,

dev->data->rx_queues[queue_idx] = rxvq;

-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
virtio_rxq_vec_setup(rxvq);
-#endif

return 0;
}
@@ -494,9 +486,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
{
uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;

-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- struct virtio_hw *hw = dev->data->dev_private;
-#endif
struct virtnet_tx *txvq;
struct virtqueue *vq;
uint16_t tx_free_thresh;
@@ -511,13 +500,14 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
}

#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+ struct virtio_hw *hw = dev->data->dev_private;
/* Use simple rx/tx func if single segment and no offloads */
if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
!vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
PMD_INIT_LOG(INFO, "Using simple rx/tx path");
dev->tx_pkt_burst = virtio_xmit_pkts_simple;
dev->rx_pkt_burst = virtio_recv_pkts_vec;
- use_simple_rxtx = 1;
+ hw->use_simple_rxtx = 1;
}
#endif

diff --git a/drivers/net/virtio/virtio_rxtx.h b/drivers/net/virtio/virtio_rxtx.h
index 058b56a..28f82d6 100644
--- a/drivers/net/virtio/virtio_rxtx.h
+++ b/drivers/net/virtio/virtio_rxtx.h
@@ -86,10 +86,9 @@ struct virtnet_ctl {
const struct rte_memzone *mz; /**< mem zone to populate RX ring. */
};

-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
int virtio_rxq_vec_setup(struct virtnet_rx *rxvq);

int virtqueue_enqueue_recv_refill_simple(struct virtqueue *vq,
struct rte_mbuf *m);
-#endif
+
#endif /* _VIRTIO_RXTX_H_ */
diff --git a/drivers/net/virtio/virtio_rxtx_simple.c b/drivers/net/virtio/virtio_rxtx_simple.c
index 242ad90..67430da 100644
--- a/drivers/net/virtio/virtio_rxtx_simple.c
+++ b/drivers/net/virtio/virtio_rxtx_simple.c
@@ -37,8 +37,6 @@
#include <string.h>
#include <errno.h>

-#include <tmmintrin.h>
-
#include <rte_cycles.h>
#include <rte_memory.h>
#include <rte_memzone.h>
@@ -131,6 +129,10 @@ virtio_rxq_rearm_vec(struct virtnet_rx *rxvq)
vq_update_avail_idx(vq);
}

+#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+
+#include <tmmintrin.h>
+
/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
*
* This routine is for non-mergeable RX, one desc for each guest buffer.
@@ -293,6 +295,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
return nb_pkts_received;
}

+#endif
+
#define VIRTIO_TX_FREE_THRESH 32
#define VIRTIO_TX_MAX_FREE_BUF_SZ 32
#define VIRTIO_TX_FREE_NR 32
diff --git a/drivers/net/virtio/virtio_user_ethdev.c b/drivers/net/virtio/virtio_user_ethdev.c
index 9216182..ce6ce91 100644
--- a/drivers/net/virtio/virtio_user_ethdev.c
+++ b/drivers/net/virtio/virtio_user_ethdev.c
@@ -303,6 +303,7 @@ virtio_user_eth_dev_alloc(const char *name)
hw->vtpci_ops = &virtio_user_ops;
hw->use_msix = 0;
hw->modern = 0;
+ hw->use_simple_rxtx = 0;
hw->virtio_user_dev = dev;
data->dev_private = hw;
data->numa_node = SOCKET_ID_ANY;
--
2.5.5
Jerin Jacob
2016-06-27 11:54:06 UTC
Permalink
like other PMD drivers, introduce RTE_LIBRTE_VIRTIO_INC_VECTOR
for vector based handler selection in virtio

Enabled by default in common config and disabled for non X86
platforms

Signed-off-by: Jerin Jacob <***@caviumnetworks.com>
---
config/common_base | 1 +
config/defconfig_arm-armv7a-linuxapp-gcc | 1 +
config/defconfig_arm64-armv8a-linuxapp-gcc | 1 +
config/defconfig_ppc_64-power8-linuxapp-gcc | 1 +
config/defconfig_tile-tilegx-linuxapp-gcc | 1 +
drivers/net/virtio/virtio_rxtx.c | 2 ++
6 files changed, 7 insertions(+)

diff --git a/config/common_base b/config/common_base
index 3a04fba..f6ce168 100644
--- a/config/common_base
+++ b/config/common_base
@@ -267,6 +267,7 @@ CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_RX=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_TX=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DRIVER=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=n
+CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=y

#
# Compile burst-oriented VMXNET3 PMD driver
diff --git a/config/defconfig_arm-armv7a-linuxapp-gcc b/config/defconfig_arm-armv7a-linuxapp-gcc
index bde6acd..a249ad5 100644
--- a/config/defconfig_arm-armv7a-linuxapp-gcc
+++ b/config/defconfig_arm-armv7a-linuxapp-gcc
@@ -75,3 +75,4 @@ CONFIG_RTE_LIBRTE_VMXNET3_PMD=n
CONFIG_RTE_LIBRTE_PMD_XENVIRT=n
CONFIG_RTE_LIBRTE_PMD_BNX2X=n
CONFIG_RTE_LIBRTE_QEDE_PMD=n
+CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=n
diff --git a/config/defconfig_arm64-armv8a-linuxapp-gcc b/config/defconfig_arm64-armv8a-linuxapp-gcc
index a786562..95ed30e 100644
--- a/config/defconfig_arm64-armv8a-linuxapp-gcc
+++ b/config/defconfig_arm64-armv8a-linuxapp-gcc
@@ -48,5 +48,6 @@ CONFIG_RTE_IXGBE_INC_VECTOR=n
CONFIG_RTE_LIBRTE_IVSHMEM=n
CONFIG_RTE_LIBRTE_FM10K_PMD=n
CONFIG_RTE_LIBRTE_I40E_PMD=n
+CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=n

CONFIG_RTE_SCHED_VECTOR=n
diff --git a/config/defconfig_ppc_64-power8-linuxapp-gcc b/config/defconfig_ppc_64-power8-linuxapp-gcc
index bef8f49..1eca73a 100644
--- a/config/defconfig_ppc_64-power8-linuxapp-gcc
+++ b/config/defconfig_ppc_64-power8-linuxapp-gcc
@@ -51,6 +51,7 @@ CONFIG_RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT=n
CONFIG_RTE_LIBRTE_IXGBE_PMD=n
CONFIG_RTE_LIBRTE_I40E_PMD=n
CONFIG_RTE_LIBRTE_VIRTIO_PMD=y
+CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=n
CONFIG_RTE_LIBRTE_VMXNET3_PMD=n
CONFIG_RTE_LIBRTE_PMD_BOND=n
CONFIG_RTE_LIBRTE_ENIC_PMD=n
diff --git a/config/defconfig_tile-tilegx-linuxapp-gcc b/config/defconfig_tile-tilegx-linuxapp-gcc
index 5a50793..0d6fe1e 100644
--- a/config/defconfig_tile-tilegx-linuxapp-gcc
+++ b/config/defconfig_tile-tilegx-linuxapp-gcc
@@ -59,6 +59,7 @@ CONFIG_RTE_LIBRTE_IXGBE_PMD=n
CONFIG_RTE_LIBRTE_I40E_PMD=n
CONFIG_RTE_LIBRTE_FM10K_PMD=n
CONFIG_RTE_LIBRTE_VIRTIO_PMD=n
+CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=n
CONFIG_RTE_LIBRTE_VMXNET3_PMD=n
CONFIG_RTE_LIBRTE_ENIC_PMD=n

diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index 63b53f7..e9b42f3 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -499,6 +499,7 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
return -EINVAL;
}

+#ifdef RTE_LIBRTE_VIRTIO_INC_VECTOR
#ifdef RTE_MACHINE_CPUFLAG_SSSE3
struct virtio_hw *hw = dev->data->dev_private;
/* Use simple rx/tx func if single segment and no offloads */
@@ -510,6 +511,7 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
hw->use_simple_rxtx = 1;
}
#endif
+#endif

ret = virtio_dev_queue_setup(dev, VTNET_TQ, queue_idx, vtpci_queue_idx,
nb_desc, socket_id, (void **)&txvq);
--
2.5.5
Thomas Monjalon
2016-06-27 14:19:57 UTC
Permalink
Post by Jerin Jacob
--- a/config/common_base
+++ b/config/common_base
@@ -267,6 +267,7 @@ CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_RX=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_TX=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DRIVER=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=n
+CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=y
I don't remember what means INC_VECTOR?
Why a config option is needed for vector implementations?
Jerin Jacob
2016-06-27 14:48:31 UTC
Permalink
Post by Thomas Monjalon
Post by Jerin Jacob
--- a/config/common_base
+++ b/config/common_base
@@ -267,6 +267,7 @@ CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_RX=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_TX=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DRIVER=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=n
+CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=y
I don't remember what means INC_VECTOR?
Why a config option is needed for vector implementations?
I thought of adding additional configuration option(INC_VECTOR) _apart_ from
cpu flag based scheme in the patch because even though if a given platform
has cpu instruction support, in some platforms scalar version may
perform well wrt vector version(based on instruction latency, emulation required or not
etc). So a top level flag INC_VECTOR, can override the vector selection
for a given platform if required.

Regarding INC_VECTOR(INC in vector configuration name, I have no idea, I
followed the existing flags)
$ grep "INC_VECTOR" config/common_base
CONFIG_RTE_IXGBE_INC_VECTOR=y
CONFIG_RTE_LIBRTE_I40E_INC_VECTOR=y
CONFIG_RTE_LIBRTE_FM10K_INC_VECTOR=y

Jerin
Thomas Monjalon
2016-06-27 14:59:42 UTC
Permalink
Post by Jerin Jacob
Post by Thomas Monjalon
Post by Jerin Jacob
--- a/config/common_base
+++ b/config/common_base
@@ -267,6 +267,7 @@ CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_RX=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_TX=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DRIVER=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=n
+CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=y
I don't remember what means INC_VECTOR?
Why a config option is needed for vector implementations?
I thought of adding additional configuration option(INC_VECTOR) _apart_ from
cpu flag based scheme in the patch because even though if a given platform
has cpu instruction support, in some platforms scalar version may
perform well wrt vector version(based on instruction latency, emulation required or not
etc). So a top level flag INC_VECTOR, can override the vector selection
for a given platform if required.
Isn't it a runtime driver option needed to disable vector virtio?
Post by Jerin Jacob
Regarding INC_VECTOR(INC in vector configuration name, I have no idea, I
followed the existing flags)
$ grep "INC_VECTOR" config/common_base
CONFIG_RTE_IXGBE_INC_VECTOR=y
CONFIG_RTE_LIBRTE_I40E_INC_VECTOR=y
CONFIG_RTE_LIBRTE_FM10K_INC_VECTOR=y
If the flag is really needed I would suggest VIRTIO_VECTOR.
Jerin Jacob
2016-06-29 11:18:49 UTC
Permalink
Post by Thomas Monjalon
Post by Jerin Jacob
Post by Thomas Monjalon
Post by Jerin Jacob
--- a/config/common_base
+++ b/config/common_base
@@ -267,6 +267,7 @@ CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_RX=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_TX=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DRIVER=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=n
+CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=y
I don't remember what means INC_VECTOR?
Why a config option is needed for vector implementations?
I thought of adding additional configuration option(INC_VECTOR) _apart_ from
cpu flag based scheme in the patch because even though if a given platform
has cpu instruction support, in some platforms scalar version may
perform well wrt vector version(based on instruction latency, emulation required or not
etc). So a top level flag INC_VECTOR, can override the vector selection
for a given platform if required.
Isn't it a runtime driver option needed to disable vector virtio?
Post by Jerin Jacob
Regarding INC_VECTOR(INC in vector configuration name, I have no idea, I
followed the existing flags)
$ grep "INC_VECTOR" config/common_base
CONFIG_RTE_IXGBE_INC_VECTOR=y
CONFIG_RTE_LIBRTE_I40E_INC_VECTOR=y
CONFIG_RTE_LIBRTE_FM10K_INC_VECTOR=y
If the flag is really needed I would suggest VIRTIO_VECTOR.
OK I will change to VIRTIO_VECTOR
Thomas Monjalon
2016-06-29 11:25:35 UTC
Permalink
Post by Jerin Jacob
Post by Thomas Monjalon
Post by Jerin Jacob
Post by Thomas Monjalon
Post by Jerin Jacob
--- a/config/common_base
+++ b/config/common_base
@@ -267,6 +267,7 @@ CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_RX=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_TX=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DRIVER=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=n
+CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=y
I don't remember what means INC_VECTOR?
Why a config option is needed for vector implementations?
I thought of adding additional configuration option(INC_VECTOR) _apart_ from
cpu flag based scheme in the patch because even though if a given platform
has cpu instruction support, in some platforms scalar version may
perform well wrt vector version(based on instruction latency, emulation required or not
etc). So a top level flag INC_VECTOR, can override the vector selection
for a given platform if required.
Isn't it a runtime driver option needed to disable vector virtio?
Post by Jerin Jacob
Regarding INC_VECTOR(INC in vector configuration name, I have no idea, I
followed the existing flags)
$ grep "INC_VECTOR" config/common_base
CONFIG_RTE_IXGBE_INC_VECTOR=y
CONFIG_RTE_LIBRTE_I40E_INC_VECTOR=y
CONFIG_RTE_LIBRTE_FM10K_INC_VECTOR=y
If the flag is really needed I would suggest VIRTIO_VECTOR.
OK I will change to VIRTIO_VECTOR
I would prefer a runtime option.
Jerin Jacob
2016-06-29 11:40:31 UTC
Permalink
Post by Thomas Monjalon
Post by Jerin Jacob
Post by Thomas Monjalon
Post by Jerin Jacob
Post by Thomas Monjalon
Post by Jerin Jacob
--- a/config/common_base
+++ b/config/common_base
@@ -267,6 +267,7 @@ CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_RX=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_TX=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DRIVER=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=n
+CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=y
I don't remember what means INC_VECTOR?
Why a config option is needed for vector implementations?
I thought of adding additional configuration option(INC_VECTOR) _apart_ from
cpu flag based scheme in the patch because even though if a given platform
has cpu instruction support, in some platforms scalar version may
perform well wrt vector version(based on instruction latency, emulation required or not
etc). So a top level flag INC_VECTOR, can override the vector selection
for a given platform if required.
Isn't it a runtime driver option needed to disable vector virtio?
Post by Jerin Jacob
Regarding INC_VECTOR(INC in vector configuration name, I have no idea, I
followed the existing flags)
$ grep "INC_VECTOR" config/common_base
CONFIG_RTE_IXGBE_INC_VECTOR=y
CONFIG_RTE_LIBRTE_I40E_INC_VECTOR=y
CONFIG_RTE_LIBRTE_FM10K_INC_VECTOR=y
If the flag is really needed I would suggest VIRTIO_VECTOR.
OK I will change to VIRTIO_VECTOR
I would prefer a runtime option.
OK

The platform I test their was NO need for additional VIRTIO_VECTOR
configuration as NEON versions outperforms than scalar version.

I thought of adding this option to override for any platform if it need
to accommodate such platform differences NEON vs scalar versions.

I will change completely to run-time detection based on cpuflags for IA and ARM.
Any objections?

Jerin
Yuanhan Liu
2016-06-30 05:44:00 UTC
Permalink
Post by Jerin Jacob
Post by Thomas Monjalon
Post by Jerin Jacob
Post by Thomas Monjalon
Post by Jerin Jacob
Post by Thomas Monjalon
Post by Jerin Jacob
--- a/config/common_base
+++ b/config/common_base
@@ -267,6 +267,7 @@ CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_RX=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_TX=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DRIVER=n
CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=n
+CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=y
I don't remember what means INC_VECTOR?
Why a config option is needed for vector implementations?
I thought of adding additional configuration option(INC_VECTOR) _apart_ from
cpu flag based scheme in the patch because even though if a given platform
has cpu instruction support, in some platforms scalar version may
perform well wrt vector version(based on instruction latency, emulation required or not
etc). So a top level flag INC_VECTOR, can override the vector selection
for a given platform if required.
Isn't it a runtime driver option needed to disable vector virtio?
Post by Jerin Jacob
Regarding INC_VECTOR(INC in vector configuration name, I have no idea, I
followed the existing flags)
$ grep "INC_VECTOR" config/common_base
CONFIG_RTE_IXGBE_INC_VECTOR=y
CONFIG_RTE_LIBRTE_I40E_INC_VECTOR=y
CONFIG_RTE_LIBRTE_FM10K_INC_VECTOR=y
If the flag is really needed I would suggest VIRTIO_VECTOR.
OK I will change to VIRTIO_VECTOR
I would prefer a runtime option.
+1
Post by Jerin Jacob
OK
The platform I test their was NO need for additional VIRTIO_VECTOR
configuration as NEON versions outperforms than scalar version.
I thought of adding this option to override for any platform if it need
to accommodate such platform differences NEON vs scalar versions.
I will change completely to run-time detection based on cpuflags for IA and ARM.
Any objections?
Nope from me.

--yliu
Jerin Jacob
2016-06-27 11:54:07 UTC
Permalink
split out SSE instruction based virtio simple rx
implementation to a separate file

Signed-off-by: Jerin Jacob <***@caviumnetworks.com>
---
drivers/net/virtio/virtio_rxtx_simple.c | 166 +-------------------
drivers/net/virtio/virtio_rxtx_simple_sse.h | 225 ++++++++++++++++++++++++++++
2 files changed, 226 insertions(+), 165 deletions(-)
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.h

diff --git a/drivers/net/virtio/virtio_rxtx_simple.c b/drivers/net/virtio/virtio_rxtx_simple.c
index 67430da..ca87605 100644
--- a/drivers/net/virtio/virtio_rxtx_simple.c
+++ b/drivers/net/virtio/virtio_rxtx_simple.c
@@ -130,171 +130,7 @@ virtio_rxq_rearm_vec(struct virtnet_rx *rxvq)
}

#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-
-#include <tmmintrin.h>
-
-/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
- *
- * This routine is for non-mergeable RX, one desc for each guest buffer.
- * This routine is based on the RX ring layout optimization. Each entry in the
- * avail ring points to the desc with the same index in the desc ring and this
- * will never be changed in the driver.
- *
- * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet
- */
-uint16_t
-virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
- uint16_t nb_pkts)
-{
- struct virtnet_rx *rxvq = rx_queue;
- struct virtqueue *vq = rxvq->vq;
- uint16_t nb_used;
- uint16_t desc_idx;
- struct vring_used_elem *rused;
- struct rte_mbuf **sw_ring;
- struct rte_mbuf **sw_ring_end;
- uint16_t nb_pkts_received;
- __m128i shuf_msk1, shuf_msk2, len_adjust;
-
- shuf_msk1 = _mm_set_epi8(
- 0xFF, 0xFF, 0xFF, 0xFF,
- 0xFF, 0xFF, /* vlan tci */
- 5, 4, /* dat len */
- 0xFF, 0xFF, 5, 4, /* pkt len */
- 0xFF, 0xFF, 0xFF, 0xFF /* packet type */
-
- );
-
- shuf_msk2 = _mm_set_epi8(
- 0xFF, 0xFF, 0xFF, 0xFF,
- 0xFF, 0xFF, /* vlan tci */
- 13, 12, /* dat len */
- 0xFF, 0xFF, 13, 12, /* pkt len */
- 0xFF, 0xFF, 0xFF, 0xFF /* packet type */
- );
-
- /* Subtract the header length.
- * In which case do we need the header length in used->len ?
- */
- len_adjust = _mm_set_epi16(
- 0, 0,
- 0,
- (uint16_t)-vq->hw->vtnet_hdr_size,
- 0, (uint16_t)-vq->hw->vtnet_hdr_size,
- 0, 0);
-
- if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
- return 0;
-
- nb_used = VIRTQUEUE_NUSED(vq);
-
- rte_compiler_barrier();
-
- if (unlikely(nb_used == 0))
- return 0;
-
- nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP);
- nb_used = RTE_MIN(nb_used, nb_pkts);
-
- desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
- rused = &vq->vq_ring.used->ring[desc_idx];
- sw_ring = &vq->sw_ring[desc_idx];
- sw_ring_end = &vq->sw_ring[vq->vq_nentries];
-
- _mm_prefetch((const void *)rused, _MM_HINT_T0);
-
- if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
- virtio_rxq_rearm_vec(rxvq);
- if (unlikely(virtqueue_kick_prepare(vq)))
- virtqueue_notify(vq);
- }
-
- for (nb_pkts_received = 0;
- nb_pkts_received < nb_used;) {
- __m128i desc[RTE_VIRTIO_DESC_PER_LOOP / 2];
- __m128i mbp[RTE_VIRTIO_DESC_PER_LOOP / 2];
- __m128i pkt_mb[RTE_VIRTIO_DESC_PER_LOOP];
-
- mbp[0] = _mm_loadu_si128((__m128i *)(sw_ring + 0));
- desc[0] = _mm_loadu_si128((__m128i *)(rused + 0));
- _mm_storeu_si128((__m128i *)&rx_pkts[0], mbp[0]);
-
- mbp[1] = _mm_loadu_si128((__m128i *)(sw_ring + 2));
- desc[1] = _mm_loadu_si128((__m128i *)(rused + 2));
- _mm_storeu_si128((__m128i *)&rx_pkts[2], mbp[1]);
-
- mbp[2] = _mm_loadu_si128((__m128i *)(sw_ring + 4));
- desc[2] = _mm_loadu_si128((__m128i *)(rused + 4));
- _mm_storeu_si128((__m128i *)&rx_pkts[4], mbp[2]);
-
- mbp[3] = _mm_loadu_si128((__m128i *)(sw_ring + 6));
- desc[3] = _mm_loadu_si128((__m128i *)(rused + 6));
- _mm_storeu_si128((__m128i *)&rx_pkts[6], mbp[3]);
-
- pkt_mb[1] = _mm_shuffle_epi8(desc[0], shuf_msk2);
- pkt_mb[0] = _mm_shuffle_epi8(desc[0], shuf_msk1);
- pkt_mb[1] = _mm_add_epi16(pkt_mb[1], len_adjust);
- pkt_mb[0] = _mm_add_epi16(pkt_mb[0], len_adjust);
- _mm_storeu_si128((void *)&rx_pkts[1]->rx_descriptor_fields1,
- pkt_mb[1]);
- _mm_storeu_si128((void *)&rx_pkts[0]->rx_descriptor_fields1,
- pkt_mb[0]);
-
- pkt_mb[3] = _mm_shuffle_epi8(desc[1], shuf_msk2);
- pkt_mb[2] = _mm_shuffle_epi8(desc[1], shuf_msk1);
- pkt_mb[3] = _mm_add_epi16(pkt_mb[3], len_adjust);
- pkt_mb[2] = _mm_add_epi16(pkt_mb[2], len_adjust);
- _mm_storeu_si128((void *)&rx_pkts[3]->rx_descriptor_fields1,
- pkt_mb[3]);
- _mm_storeu_si128((void *)&rx_pkts[2]->rx_descriptor_fields1,
- pkt_mb[2]);
-
- pkt_mb[5] = _mm_shuffle_epi8(desc[2], shuf_msk2);
- pkt_mb[4] = _mm_shuffle_epi8(desc[2], shuf_msk1);
- pkt_mb[5] = _mm_add_epi16(pkt_mb[5], len_adjust);
- pkt_mb[4] = _mm_add_epi16(pkt_mb[4], len_adjust);
- _mm_storeu_si128((void *)&rx_pkts[5]->rx_descriptor_fields1,
- pkt_mb[5]);
- _mm_storeu_si128((void *)&rx_pkts[4]->rx_descriptor_fields1,
- pkt_mb[4]);
-
- pkt_mb[7] = _mm_shuffle_epi8(desc[3], shuf_msk2);
- pkt_mb[6] = _mm_shuffle_epi8(desc[3], shuf_msk1);
- pkt_mb[7] = _mm_add_epi16(pkt_mb[7], len_adjust);
- pkt_mb[6] = _mm_add_epi16(pkt_mb[6], len_adjust);
- _mm_storeu_si128((void *)&rx_pkts[7]->rx_descriptor_fields1,
- pkt_mb[7]);
- _mm_storeu_si128((void *)&rx_pkts[6]->rx_descriptor_fields1,
- pkt_mb[6]);
-
- if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) {
- if (sw_ring + nb_used <= sw_ring_end)
- nb_pkts_received += nb_used;
- else
- nb_pkts_received += sw_ring_end - sw_ring;
- break;
- } else {
- if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >=
- sw_ring_end)) {
- nb_pkts_received += sw_ring_end - sw_ring;
- break;
- } else {
- nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP;
-
- rx_pkts += RTE_VIRTIO_DESC_PER_LOOP;
- sw_ring += RTE_VIRTIO_DESC_PER_LOOP;
- rused += RTE_VIRTIO_DESC_PER_LOOP;
- nb_used -= RTE_VIRTIO_DESC_PER_LOOP;
- }
- }
- }
-
- vq->vq_used_cons_idx += nb_pkts_received;
- vq->vq_free_cnt += nb_pkts_received;
- rxvq->stats.packets += nb_pkts_received;
- return nb_pkts_received;
-}
-
+#include "virtio_rxtx_simple_sse.h"
#endif

#define VIRTIO_TX_FREE_THRESH 32
diff --git a/drivers/net/virtio/virtio_rxtx_simple_sse.h b/drivers/net/virtio/virtio_rxtx_simple_sse.h
new file mode 100644
index 0000000..4e8728f
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_simple_sse.h
@@ -0,0 +1,225 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <tmmintrin.h>
+
+#include <rte_byteorder.h>
+#include <rte_branch_prediction.h>
+#include <rte_cycles.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+#include <rte_prefetch.h>
+#include <rte_string_fns.h>
+
+#include "virtio_logs.h"
+#include "virtio_ethdev.h"
+#include "virtqueue.h"
+#include "virtio_rxtx.h"
+
+#define RTE_VIRTIO_VPMD_RX_BURST 32
+#define RTE_VIRTIO_DESC_PER_LOOP 8
+#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST
+
+/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
+ *
+ * This routine is for non-mergeable RX, one desc for each guest buffer.
+ * This routine is based on the RX ring layout optimization. Each entry in the
+ * avail ring points to the desc with the same index in the desc ring and this
+ * will never be changed in the driver.
+ *
+ * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet
+ */
+uint16_t
+virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ struct virtnet_rx *rxvq = rx_queue;
+ struct virtqueue *vq = rxvq->vq;
+ uint16_t nb_used;
+ uint16_t desc_idx;
+ struct vring_used_elem *rused;
+ struct rte_mbuf **sw_ring;
+ struct rte_mbuf **sw_ring_end;
+ uint16_t nb_pkts_received;
+ __m128i shuf_msk1, shuf_msk2, len_adjust;
+
+ shuf_msk1 = _mm_set_epi8(
+ 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, /* vlan tci */
+ 5, 4, /* dat len */
+ 0xFF, 0xFF, 5, 4, /* pkt len */
+ 0xFF, 0xFF, 0xFF, 0xFF /* packet type */
+
+ );
+
+ shuf_msk2 = _mm_set_epi8(
+ 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, /* vlan tci */
+ 13, 12, /* dat len */
+ 0xFF, 0xFF, 13, 12, /* pkt len */
+ 0xFF, 0xFF, 0xFF, 0xFF /* packet type */
+ );
+
+ /* Subtract the header length.
+ * In which case do we need the header length in used->len ?
+ */
+ len_adjust = _mm_set_epi16(
+ 0, 0,
+ 0,
+ (uint16_t)-vq->hw->vtnet_hdr_size,
+ 0, (uint16_t)-vq->hw->vtnet_hdr_size,
+ 0, 0);
+
+ if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
+ return 0;
+
+ nb_used = VIRTQUEUE_NUSED(vq);
+
+ rte_compiler_barrier();
+
+ if (unlikely(nb_used == 0))
+ return 0;
+
+ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP);
+ nb_used = RTE_MIN(nb_used, nb_pkts);
+
+ desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
+ rused = &vq->vq_ring.used->ring[desc_idx];
+ sw_ring = &vq->sw_ring[desc_idx];
+ sw_ring_end = &vq->sw_ring[vq->vq_nentries];
+
+ _mm_prefetch((const void *)rused, _MM_HINT_T0);
+
+ if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
+ virtio_rxq_rearm_vec(rxvq);
+ if (unlikely(virtqueue_kick_prepare(vq)))
+ virtqueue_notify(vq);
+ }
+
+ for (nb_pkts_received = 0;
+ nb_pkts_received < nb_used;) {
+ __m128i desc[RTE_VIRTIO_DESC_PER_LOOP / 2];
+ __m128i mbp[RTE_VIRTIO_DESC_PER_LOOP / 2];
+ __m128i pkt_mb[RTE_VIRTIO_DESC_PER_LOOP];
+
+ mbp[0] = _mm_loadu_si128((__m128i *)(sw_ring + 0));
+ desc[0] = _mm_loadu_si128((__m128i *)(rused + 0));
+ _mm_storeu_si128((__m128i *)&rx_pkts[0], mbp[0]);
+
+ mbp[1] = _mm_loadu_si128((__m128i *)(sw_ring + 2));
+ desc[1] = _mm_loadu_si128((__m128i *)(rused + 2));
+ _mm_storeu_si128((__m128i *)&rx_pkts[2], mbp[1]);
+
+ mbp[2] = _mm_loadu_si128((__m128i *)(sw_ring + 4));
+ desc[2] = _mm_loadu_si128((__m128i *)(rused + 4));
+ _mm_storeu_si128((__m128i *)&rx_pkts[4], mbp[2]);
+
+ mbp[3] = _mm_loadu_si128((__m128i *)(sw_ring + 6));
+ desc[3] = _mm_loadu_si128((__m128i *)(rused + 6));
+ _mm_storeu_si128((__m128i *)&rx_pkts[6], mbp[3]);
+
+ pkt_mb[1] = _mm_shuffle_epi8(desc[0], shuf_msk2);
+ pkt_mb[0] = _mm_shuffle_epi8(desc[0], shuf_msk1);
+ pkt_mb[1] = _mm_add_epi16(pkt_mb[1], len_adjust);
+ pkt_mb[0] = _mm_add_epi16(pkt_mb[0], len_adjust);
+ _mm_storeu_si128((void *)&rx_pkts[1]->rx_descriptor_fields1,
+ pkt_mb[1]);
+ _mm_storeu_si128((void *)&rx_pkts[0]->rx_descriptor_fields1,
+ pkt_mb[0]);
+
+ pkt_mb[3] = _mm_shuffle_epi8(desc[1], shuf_msk2);
+ pkt_mb[2] = _mm_shuffle_epi8(desc[1], shuf_msk1);
+ pkt_mb[3] = _mm_add_epi16(pkt_mb[3], len_adjust);
+ pkt_mb[2] = _mm_add_epi16(pkt_mb[2], len_adjust);
+ _mm_storeu_si128((void *)&rx_pkts[3]->rx_descriptor_fields1,
+ pkt_mb[3]);
+ _mm_storeu_si128((void *)&rx_pkts[2]->rx_descriptor_fields1,
+ pkt_mb[2]);
+
+ pkt_mb[5] = _mm_shuffle_epi8(desc[2], shuf_msk2);
+ pkt_mb[4] = _mm_shuffle_epi8(desc[2], shuf_msk1);
+ pkt_mb[5] = _mm_add_epi16(pkt_mb[5], len_adjust);
+ pkt_mb[4] = _mm_add_epi16(pkt_mb[4], len_adjust);
+ _mm_storeu_si128((void *)&rx_pkts[5]->rx_descriptor_fields1,
+ pkt_mb[5]);
+ _mm_storeu_si128((void *)&rx_pkts[4]->rx_descriptor_fields1,
+ pkt_mb[4]);
+
+ pkt_mb[7] = _mm_shuffle_epi8(desc[3], shuf_msk2);
+ pkt_mb[6] = _mm_shuffle_epi8(desc[3], shuf_msk1);
+ pkt_mb[7] = _mm_add_epi16(pkt_mb[7], len_adjust);
+ pkt_mb[6] = _mm_add_epi16(pkt_mb[6], len_adjust);
+ _mm_storeu_si128((void *)&rx_pkts[7]->rx_descriptor_fields1,
+ pkt_mb[7]);
+ _mm_storeu_si128((void *)&rx_pkts[6]->rx_descriptor_fields1,
+ pkt_mb[6]);
+
+ if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) {
+ if (sw_ring + nb_used <= sw_ring_end)
+ nb_pkts_received += nb_used;
+ else
+ nb_pkts_received += sw_ring_end - sw_ring;
+ break;
+ } else {
+ if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >=
+ sw_ring_end)) {
+ nb_pkts_received += sw_ring_end - sw_ring;
+ break;
+ } else {
+ nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP;
+
+ rx_pkts += RTE_VIRTIO_DESC_PER_LOOP;
+ sw_ring += RTE_VIRTIO_DESC_PER_LOOP;
+ rused += RTE_VIRTIO_DESC_PER_LOOP;
+ nb_used -= RTE_VIRTIO_DESC_PER_LOOP;
+ }
+ }
+ }
+
+ vq->vq_used_cons_idx += nb_pkts_received;
+ vq->vq_free_cnt += nb_pkts_received;
+ rxvq->stats.packets += nb_pkts_received;
+ return nb_pkts_received;
+}
--
2.5.5
Jianbo Liu
2016-06-28 06:17:41 UTC
Permalink
Post by Jerin Jacob
split out SSE instruction based virtio simple rx
implementation to a separate file
---
drivers/net/virtio/virtio_rxtx_simple.c | 166 +-------------------
drivers/net/virtio/virtio_rxtx_simple_sse.h | 225 ++++++++++++++++++++++++++++
2 files changed, 226 insertions(+), 165 deletions(-)
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.h
I think it's better to move sse implementation to a C file,
as Bruce pointed out at
http://www.dpdk.org/ml/archives/dev/2016-April/037937.html

Jianbo
Jerin Jacob
2016-06-29 11:27:46 UTC
Permalink
Post by Jianbo Liu
Post by Jerin Jacob
split out SSE instruction based virtio simple rx
implementation to a separate file
---
drivers/net/virtio/virtio_rxtx_simple.c | 166 +-------------------
drivers/net/virtio/virtio_rxtx_simple_sse.h | 225 ++++++++++++++++++++++++++++
2 files changed, 226 insertions(+), 165 deletions(-)
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.h
I think it's better to move sse implementation to a C file,
as Bruce pointed out at
http://www.dpdk.org/ml/archives/dev/2016-April/037937.html
I can move to C file, That would call for further restructuring of the code
by Introducing a new file drivers/net/virtio/virtio_rxtx_simple.h and
moving all static inline functions of virtio_rxtx_simple.c so that
virtio_rxtx_simple_sse.c and virtio_rxtx_simple_neon.c can include it.

Huawei,Yuanhan,All,

Are you OK with above restructuring?

Jerin
Post by Jianbo Liu
Jianbo
Yuanhan Liu
2016-06-30 05:43:15 UTC
Permalink
Post by Jerin Jacob
Post by Jianbo Liu
Post by Jerin Jacob
split out SSE instruction based virtio simple rx
implementation to a separate file
---
drivers/net/virtio/virtio_rxtx_simple.c | 166 +-------------------
drivers/net/virtio/virtio_rxtx_simple_sse.h | 225 ++++++++++++++++++++++++++++
2 files changed, 226 insertions(+), 165 deletions(-)
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.h
I think it's better to move sse implementation to a C file,
as Bruce pointed out at
http://www.dpdk.org/ml/archives/dev/2016-April/037937.html
I can move to C file, That would call for further restructuring of the code
by Introducing a new file drivers/net/virtio/virtio_rxtx_simple.h and
moving all static inline functions of virtio_rxtx_simple.c so that
virtio_rxtx_simple_sse.c and virtio_rxtx_simple_neon.c can include it.
Huawei,Yuanhan,All,
Are you OK with above restructuring?
Yes, I think that's better.

--yliu
Jerin Jacob
2016-06-27 11:54:08 UTC
Permalink
Added neon based Rx vector implementation for virtio.
Selected neon based virtio implementation for ARM64 as
default and updated the MAINTAINERS file.

Signed-off-by: Jerin Jacob <***@caviumnetworks.com>
---
MAINTAINERS | 1 +
config/defconfig_arm64-armv8a-linuxapp-gcc | 1 -
doc/guides/rel_notes/release_16_07.rst | 2 +
drivers/net/virtio/virtio_rxtx.c | 2 +-
drivers/net/virtio/virtio_rxtx_simple.c | 2 +
drivers/net/virtio/virtio_rxtx_simple_neon.h | 238 +++++++++++++++++++++++++++
6 files changed, 244 insertions(+), 2 deletions(-)
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.h

diff --git a/MAINTAINERS b/MAINTAINERS
index f6c0d3d..2bb12aa 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -142,6 +142,7 @@ F: lib/librte_eal/common/include/arch/arm/*_64.h
F: lib/librte_acl/acl_run_neon.*
F: lib/librte_lpm/rte_lpm_neon.h
F: lib/librte_hash/rte*_arm64.h
+F: drivers/net/virtio/virtio_rxtx_simple_neon.h

EZchip TILE-Gx
M: Zhigang Lu <***@ezchip.com>
diff --git a/config/defconfig_arm64-armv8a-linuxapp-gcc b/config/defconfig_arm64-armv8a-linuxapp-gcc
index 95ed30e..a786562 100644
--- a/config/defconfig_arm64-armv8a-linuxapp-gcc
+++ b/config/defconfig_arm64-armv8a-linuxapp-gcc
@@ -48,6 +48,5 @@ CONFIG_RTE_IXGBE_INC_VECTOR=n
CONFIG_RTE_LIBRTE_IVSHMEM=n
CONFIG_RTE_LIBRTE_FM10K_PMD=n
CONFIG_RTE_LIBRTE_I40E_PMD=n
-CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=n

CONFIG_RTE_SCHED_VECTOR=n
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index 2694f50..3187a33 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -118,6 +118,8 @@ New Features
* Root privilege is a must for sorting hugepages by physical address.
* Can only be used with vhost user backend.

+* **Virtio NEON support for ARM.**
+
Resolved Issues
---------------

diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index e9b42f3..ca25db3 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -500,7 +500,7 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
}

#ifdef RTE_LIBRTE_VIRTIO_INC_VECTOR
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+#if defined RTE_MACHINE_CPUFLAG_SSSE3 || defined RTE_MACHINE_CPUFLAG_NEON
struct virtio_hw *hw = dev->data->dev_private;
/* Use simple rx/tx func if single segment and no offloads */
if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
diff --git a/drivers/net/virtio/virtio_rxtx_simple.c b/drivers/net/virtio/virtio_rxtx_simple.c
index ca87605..e5dc010 100644
--- a/drivers/net/virtio/virtio_rxtx_simple.c
+++ b/drivers/net/virtio/virtio_rxtx_simple.c
@@ -131,6 +131,8 @@ virtio_rxq_rearm_vec(struct virtnet_rx *rxvq)

#ifdef RTE_MACHINE_CPUFLAG_SSSE3
#include "virtio_rxtx_simple_sse.h"
+#elif RTE_MACHINE_CPUFLAG_NEON
+#include "virtio_rxtx_simple_neon.h"
#endif

#define VIRTIO_TX_FREE_THRESH 32
diff --git a/drivers/net/virtio/virtio_rxtx_simple_neon.h b/drivers/net/virtio/virtio_rxtx_simple_neon.h
new file mode 100644
index 0000000..41f347d
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_simple_neon.h
@@ -0,0 +1,238 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <rte_byteorder.h>
+#include <rte_branch_prediction.h>
+#include <rte_cycles.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+#include <rte_prefetch.h>
+#include <rte_string_fns.h>
+#include <rte_vect.h>
+
+#include "virtio_logs.h"
+#include "virtio_ethdev.h"
+#include "virtqueue.h"
+#include "virtio_rxtx.h"
+
+#define RTE_VIRTIO_VPMD_RX_BURST 32
+#define RTE_VIRTIO_DESC_PER_LOOP 8
+#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST
+
+/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
+ *
+ * This routine is for non-mergeable RX, one desc for each guest buffer.
+ * This routine is based on the RX ring layout optimization. Each entry in the
+ * avail ring points to the desc with the same index in the desc ring and this
+ * will never be changed in the driver.
+ *
+ * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet
+ */
+uint16_t
+virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ struct virtnet_rx *rxvq = rx_queue;
+ struct virtqueue *vq = rxvq->vq;
+ uint16_t nb_used;
+ uint16_t desc_idx;
+ struct vring_used_elem *rused;
+ struct rte_mbuf **sw_ring;
+ struct rte_mbuf **sw_ring_end;
+ uint16_t nb_pkts_received;
+
+ uint8x16_t shuf_msk1 = {
+ 0xFF, 0xFF, 0xFF, 0xFF, /* packet type */
+ 4, 5, 0xFF, 0xFF, /* pkt len */
+ 4, 5, /* dat len */
+ 0xFF, 0xFF, /* vlan tci */
+ 0xFF, 0xFF, 0xFF, 0xFF
+ };
+
+ uint8x16_t shuf_msk2 = {
+ 0xFF, 0xFF, 0xFF, 0xFF, /* packet type */
+ 12, 13, 0xFF, 0xFF, /* pkt len */
+ 12, 13, /* dat len */
+ 0xFF, 0xFF, /* vlan tci */
+ 0xFF, 0xFF, 0xFF, 0xFF
+ };
+
+ /* Subtract the header length.
+ * In which case do we need the header length in used->len ?
+ */
+ uint16x8_t len_adjust = {
+ 0, 0,
+ (uint16_t)vq->hw->vtnet_hdr_size, 0,
+ (uint16_t)vq->hw->vtnet_hdr_size,
+ 0,
+ 0, 0
+ };
+
+ if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
+ return 0;
+
+ nb_used = VIRTQUEUE_NUSED(vq);
+
+ rte_rmb();
+
+ if (unlikely(nb_used == 0))
+ return 0;
+
+ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP);
+ nb_used = RTE_MIN(nb_used, nb_pkts);
+
+ desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
+ rused = &vq->vq_ring.used->ring[desc_idx];
+ sw_ring = &vq->sw_ring[desc_idx];
+ sw_ring_end = &vq->sw_ring[vq->vq_nentries];
+
+ rte_prefetch_non_temporal(rused);
+
+ if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
+ virtio_rxq_rearm_vec(rxvq);
+ if (unlikely(virtqueue_kick_prepare(vq)))
+ virtqueue_notify(vq);
+ }
+
+ for (nb_pkts_received = 0;
+ nb_pkts_received < nb_used;) {
+ uint64x2_t desc[RTE_VIRTIO_DESC_PER_LOOP / 2];
+ uint64x2_t mbp[RTE_VIRTIO_DESC_PER_LOOP / 2];
+ uint64x2_t pkt_mb[RTE_VIRTIO_DESC_PER_LOOP];
+
+ mbp[0] = vld1q_u64((uint64_t *)(sw_ring + 0));
+ desc[0] = vld1q_u64((uint64_t *)(rused + 0));
+ vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0]);
+
+ mbp[1] = vld1q_u64((uint64_t *)(sw_ring + 2));
+ desc[1] = vld1q_u64((uint64_t *)(rused + 2));
+ vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1]);
+
+ mbp[2] = vld1q_u64((uint64_t *)(sw_ring + 4));
+ desc[2] = vld1q_u64((uint64_t *)(rused + 4));
+ vst1q_u64((uint64_t *)&rx_pkts[4], mbp[2]);
+
+ mbp[3] = vld1q_u64((uint64_t *)(sw_ring + 6));
+ desc[3] = vld1q_u64((uint64_t *)(rused + 6));
+ vst1q_u64((uint64_t *)&rx_pkts[6], mbp[3]);
+
+ pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[0]), shuf_msk2));
+ pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[0]), shuf_msk1));
+ pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
+ pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
+ vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1,
+ pkt_mb[1]);
+ vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1,
+ pkt_mb[0]);
+
+ pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[1]), shuf_msk2));
+ pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[1]), shuf_msk1));
+ pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
+ pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
+ vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1,
+ pkt_mb[3]);
+ vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1,
+ pkt_mb[2]);
+
+ pkt_mb[5] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[2]), shuf_msk2));
+ pkt_mb[4] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[2]), shuf_msk1));
+ pkt_mb[5] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[5]), len_adjust));
+ pkt_mb[4] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[4]), len_adjust));
+ vst1q_u64((void *)&rx_pkts[5]->rx_descriptor_fields1,
+ pkt_mb[5]);
+ vst1q_u64((void *)&rx_pkts[4]->rx_descriptor_fields1,
+ pkt_mb[4]);
+
+ pkt_mb[7] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[3]), shuf_msk2));
+ pkt_mb[6] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[3]), shuf_msk1));
+ pkt_mb[7] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[7]), len_adjust));
+ pkt_mb[6] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[6]), len_adjust));
+ vst1q_u64((void *)&rx_pkts[7]->rx_descriptor_fields1,
+ pkt_mb[7]);
+ vst1q_u64((void *)&rx_pkts[6]->rx_descriptor_fields1,
+ pkt_mb[6]);
+
+ if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) {
+ if (sw_ring + nb_used <= sw_ring_end)
+ nb_pkts_received += nb_used;
+ else
+ nb_pkts_received += sw_ring_end - sw_ring;
+ break;
+ } else {
+ if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >=
+ sw_ring_end)) {
+ nb_pkts_received += sw_ring_end - sw_ring;
+ break;
+ } else {
+ nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP;
+
+ rx_pkts += RTE_VIRTIO_DESC_PER_LOOP;
+ sw_ring += RTE_VIRTIO_DESC_PER_LOOP;
+ rused += RTE_VIRTIO_DESC_PER_LOOP;
+ nb_used -= RTE_VIRTIO_DESC_PER_LOOP;
+ }
+ }
+ }
+
+ vq->vq_used_cons_idx += nb_pkts_received;
+ vq->vq_free_cnt += nb_pkts_received;
+ rxvq->stats.packets += nb_pkts_received;
+ return nb_pkts_received;
+}
--
2.5.5
Jerin Jacob
2016-07-01 11:16:35 UTC
Permalink
This patch-set includes,

1) General cleanup of compile time dependency.
2) made vector handler section based on run-time cpuflags
2) Added NEON support for optimized Rx handling

This patch-set is based on dpdk-next-virtio/master at aaaf0c005

v2:
- made vector handler selection based on run-time cpuflags (Suggested by Thomas)
- moved vector implementations to .c file instead of .h file(Suggested by Jianbo)

Jerin Jacob (3):
virtio: conditional compilation cleanup
virtio: move SSE based Rx implementation to separate file
virtio: add neon support

MAINTAINERS | 1 +
doc/guides/rel_notes/release_16_07.rst | 2 +
drivers/net/vision/Wakefully | 7 +-
drivers/net/virtio/virtio_pci.h | 1 +
drivers/net/virtio/virtio_rxtx.c | 62 +++---
drivers/net/virtio/virtio_rxtx.h | 3 +-
drivers/net/virtio/virtio_rxtx_simple.c | 269 ++-------------------------
drivers/net/virtio/virtio_rxtx_simple.h | 133 +++++++++++++
drivers/net/virtio/virtio_rxtx_simple_neon.c | 235 +++++++++++++++++++++++
drivers/net/virtio/virtio_rxtx_simple_sse.c | 222 ++++++++++++++++++++++
drivers/net/virtio/virtio_user_ethdev.c | 1 +
11 files changed, 646 insertions(+), 290 deletions(-)
create mode 100644 drivers/net/virtio/virtio_rxtx_simple.h
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.c
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.c
--
2.5.5
Jerin Jacob
2016-07-01 11:16:36 UTC
Permalink
Removed unnecessary compile time dependency on "use_simple_rxtx".

Signed-off-by: Jerin Jacob <***@caviumnetworks.com>
---
drivers/net/virtio/Makefile | 3 ---
drivers/net/virtio/virtio_pci.h | 1 +
drivers/net/virtio/virtio_rxtx.c | 28 +++++++++-------------------
drivers/net/virtio/virtio_rxtx.h | 3 +--
drivers/net/virtio/virtio_rxtx_simple.c | 8 ++++++--
drivers/net/virtio/virtio_user_ethdev.c | 1 +
6 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
index 3020b68..b9b0d8d 100644
--- a/drivers/net/virtio/Makefile
+++ b/drivers/net/virtio/Makefile
@@ -50,10 +50,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtqueue.c
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_pci.c
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx.c
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_ethdev.c
-
-ifeq ($(findstring RTE_MACHINE_CPUFLAG_SSSE3,$(CFLAGS)),RTE_MACHINE_CPUFLAG_SSSE3)
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple.c
-endif

ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c
diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h
index dd7693f..b8295a7 100644
--- a/drivers/net/virtio/virtio_pci.h
+++ b/drivers/net/virtio/virtio_pci.h
@@ -253,6 +253,7 @@ struct virtio_hw {
uint8_t use_msix;
uint8_t started;
uint8_t modern;
+ uint8_t use_simple_rxtx;
uint8_t mac_addr[ETHER_ADDR_LEN];
uint32_t notify_off_multiplier;
uint8_t *isr;
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index a27208e..63b53f7 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -67,10 +67,6 @@
#define VIRTIO_SIMPLE_FLAGS ((uint32_t)ETH_TXQ_FLAGS_NOMULTSEGS | \
ETH_TXQ_FLAGS_NOOFFLOADS)

-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-static int use_simple_rxtx;
-#endif
-
static void
vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx)
{
@@ -333,6 +329,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
*/
uint16_t i;
uint16_t desc_idx;
+ struct virtio_hw *hw = dev->data->dev_private;

PMD_INIT_FUNC_TRACE();

@@ -353,8 +350,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
nbufs = 0;
error = ENOSPC;

-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- if (use_simple_rxtx) {
+ if (hw->use_simple_rxtx) {
for (desc_idx = 0; desc_idx < vq->vq_nentries;
desc_idx++) {
vq->vq_ring.avail->ring[desc_idx] = desc_idx;
@@ -362,7 +358,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
VRING_DESC_F_WRITE;
}
}
-#endif
+
memset(&rxvq->fake_mbuf, 0, sizeof(rxvq->fake_mbuf));
for (desc_idx = 0; desc_idx < RTE_PMD_VIRTIO_RX_MAX_BURST;
desc_idx++) {
@@ -378,12 +374,11 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
/******************************************
* Enqueue allocated buffers *
*******************************************/
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- if (use_simple_rxtx)
+ if (hw->use_simple_rxtx)
error = virtqueue_enqueue_recv_refill_simple(vq, m);
else
-#endif
error = virtqueue_enqueue_recv_refill(vq, m);
+
if (error) {
rte_pktmbuf_free(m);
break;
@@ -404,8 +399,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
struct virtqueue *vq = txvq->vq;

virtio_dev_vring_start(vq);
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- if (use_simple_rxtx) {
+ if (hw->use_simple_rxtx) {
uint16_t mid_idx = vq->vq_nentries >> 1;

for (desc_idx = 0; desc_idx < mid_idx; desc_idx++) {
@@ -426,7 +420,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
desc_idx++)
vq->vq_ring.avail->ring[desc_idx] = desc_idx;
}
-#endif
+
VIRTQUEUE_DUMP(vq);
}
}
@@ -456,9 +450,7 @@ virtio_dev_rx_queue_setup(struct rte_eth_dev *dev,

dev->data->rx_queues[queue_idx] = rxvq;

-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
virtio_rxq_vec_setup(rxvq);
-#endif

return 0;
}
@@ -494,9 +486,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
{
uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;

-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- struct virtio_hw *hw = dev->data->dev_private;
-#endif
struct virtnet_tx *txvq;
struct virtqueue *vq;
uint16_t tx_free_thresh;
@@ -511,13 +500,14 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
}

#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+ struct virtio_hw *hw = dev->data->dev_private;
/* Use simple rx/tx func if single segment and no offloads */
if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
!vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
PMD_INIT_LOG(INFO, "Using simple rx/tx path");
dev->tx_pkt_burst = virtio_xmit_pkts_simple;
dev->rx_pkt_burst = virtio_recv_pkts_vec;
- use_simple_rxtx = 1;
+ hw->use_simple_rxtx = 1;
}
#endif

diff --git a/drivers/net/virtio/virtio_rxtx.h b/drivers/net/virtio/virtio_rxtx.h
index 058b56a..28f82d6 100644
--- a/drivers/net/virtio/virtio_rxtx.h
+++ b/drivers/net/virtio/virtio_rxtx.h
@@ -86,10 +86,9 @@ struct virtnet_ctl {
const struct rte_memzone *mz; /**< mem zone to populate RX ring. */
};

-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
int virtio_rxq_vec_setup(struct virtnet_rx *rxvq);

int virtqueue_enqueue_recv_refill_simple(struct virtqueue *vq,
struct rte_mbuf *m);
-#endif
+
#endif /* _VIRTIO_RXTX_H_ */
diff --git a/drivers/net/virtio/virtio_rxtx_simple.c b/drivers/net/virtio/virtio_rxtx_simple.c
index 242ad90..67430da 100644
--- a/drivers/net/virtio/virtio_rxtx_simple.c
+++ b/drivers/net/virtio/virtio_rxtx_simple.c
@@ -37,8 +37,6 @@
#include <string.h>
#include <errno.h>

-#include <tmmintrin.h>
-
#include <rte_cycles.h>
#include <rte_memory.h>
#include <rte_memzone.h>
@@ -131,6 +129,10 @@ virtio_rxq_rearm_vec(struct virtnet_rx *rxvq)
vq_update_avail_idx(vq);
}

+#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+
+#include <tmmintrin.h>
+
/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
*
* This routine is for non-mergeable RX, one desc for each guest buffer.
@@ -293,6 +295,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
return nb_pkts_received;
}

+#endif
+
#define VIRTIO_TX_FREE_THRESH 32
#define VIRTIO_TX_MAX_FREE_BUF_SZ 32
#define VIRTIO_TX_FREE_NR 32
diff --git a/drivers/net/virtio/virtio_user_ethdev.c b/drivers/net/virtio/virtio_user_ethdev.c
index 5ab2471..bef8130 100644
--- a/drivers/net/virtio/virtio_user_ethdev.c
+++ b/drivers/net/virtio/virtio_user_ethdev.c
@@ -303,6 +303,7 @@ virtio_user_eth_dev_alloc(const char *name)
hw->vtpci_ops = &virtio_user_ops;
hw->use_msix = 0;
hw->modern = 0;
+ hw->use_simple_rxtx = 0;
hw->virtio_user_dev = dev;
data->dev_private = hw;
data->numa_node = SOCKET_ID_ANY;
--
2.5.5
Yuanhan Liu
2016-07-04 07:36:48 UTC
Permalink
Post by Jerin Jacob
@@ -494,9 +486,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
{
uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- struct virtio_hw *hw = dev->data->dev_private;
-#endif
struct virtnet_tx *txvq;
struct virtqueue *vq;
uint16_t tx_free_thresh;
@@ -511,13 +500,14 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
}
#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+ struct virtio_hw *hw = dev->data->dev_private;
I'd suggest to move above declaration to ...
Post by Jerin Jacob
/* Use simple rx/tx func if single segment and no offloads */
if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
!vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
here: we should try to avoid declaring vars in the middle of a code block.

--yliu
Jerin Jacob
2016-07-04 08:36:27 UTC
Permalink
Post by Yuanhan Liu
Post by Jerin Jacob
@@ -494,9 +486,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
{
uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- struct virtio_hw *hw = dev->data->dev_private;
-#endif
struct virtnet_tx *txvq;
struct virtqueue *vq;
uint16_t tx_free_thresh;
@@ -511,13 +500,14 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
}
#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+ struct virtio_hw *hw = dev->data->dev_private;
I'd suggest to move above declaration to ...
Post by Jerin Jacob
/* Use simple rx/tx func if single segment and no offloads */
if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
!vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
here: we should try to avoid declaring vars in the middle of a code block.
Next patch in this series, moving all rxtx handler selection code to
separate function(virtio_update_rxtx_handler()) where declaration comes
as first line in the function.i.e the comment is taken care of in the
series.
Post by Yuanhan Liu
--yliu
Yuanhan Liu
2016-07-04 08:42:32 UTC
Permalink
Post by Jerin Jacob
Post by Yuanhan Liu
Post by Jerin Jacob
@@ -494,9 +486,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
{
uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- struct virtio_hw *hw = dev->data->dev_private;
-#endif
struct virtnet_tx *txvq;
struct virtqueue *vq;
uint16_t tx_free_thresh;
@@ -511,13 +500,14 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
}
#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+ struct virtio_hw *hw = dev->data->dev_private;
I'd suggest to move above declaration to ...
Post by Jerin Jacob
/* Use simple rx/tx func if single segment and no offloads */
if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
!vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
here: we should try to avoid declaring vars in the middle of a code block.
Next patch in this series, moving all rxtx handler selection code to
separate function(virtio_update_rxtx_handler()) where declaration comes
as first line in the function.i.e the comment is taken care of in the
series.
Yes, I saw that. But in principle, each patch is atomic: it's not a
good idea/practice to introduce issues in path A and then fix it in
path B.

--yliu
Jerin Jacob
2016-07-04 09:07:55 UTC
Permalink
Post by Yuanhan Liu
Post by Jerin Jacob
Post by Yuanhan Liu
Post by Jerin Jacob
@@ -494,9 +486,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
{
uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- struct virtio_hw *hw = dev->data->dev_private;
-#endif
struct virtnet_tx *txvq;
struct virtqueue *vq;
uint16_t tx_free_thresh;
@@ -511,13 +500,14 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
}
#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+ struct virtio_hw *hw = dev->data->dev_private;
I'd suggest to move above declaration to ...
Post by Jerin Jacob
/* Use simple rx/tx func if single segment and no offloads */
if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
!vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
here: we should try to avoid declaring vars in the middle of a code block.
Next patch in this series, moving all rxtx handler selection code to
separate function(virtio_update_rxtx_handler()) where declaration comes
as first line in the function.i.e the comment is taken care of in the
series.
Yes, I saw that. But in principle, each patch is atomic: it's not a
good idea/practice to introduce issues in path A and then fix it in
path B.
In my view it was not an issue as I was removing all possible
conditional compilation flag. If I were to move the declaration to top
then another conditional compilation RTE_MACHINE_CPUFLAG_SSSE3
flag I need to add around declaring the variable.

Hope this justifies the reason. If you are not convinced then let me know,
if will add the change in next revision.

Jerin
Post by Yuanhan Liu
--yliu
Yuanhan Liu
2016-07-04 11:02:25 UTC
Permalink
Post by Jerin Jacob
Post by Yuanhan Liu
Post by Jerin Jacob
Post by Yuanhan Liu
Post by Jerin Jacob
@@ -494,9 +486,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
{
uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- struct virtio_hw *hw = dev->data->dev_private;
-#endif
struct virtnet_tx *txvq;
struct virtqueue *vq;
uint16_t tx_free_thresh;
@@ -511,13 +500,14 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
}
#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+ struct virtio_hw *hw = dev->data->dev_private;
I'd suggest to move above declaration to ...
Post by Jerin Jacob
/* Use simple rx/tx func if single segment and no offloads */
if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
!vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
here: we should try to avoid declaring vars in the middle of a code block.
Next patch in this series, moving all rxtx handler selection code to
separate function(virtio_update_rxtx_handler()) where declaration comes
as first line in the function.i.e the comment is taken care of in the
series.
Yes, I saw that. But in principle, each patch is atomic: it's not a
good idea/practice to introduce issues in path A and then fix it in
path B.
In my view it was not an issue as I was removing all possible
conditional compilation flag. If I were to move the declaration to top
then another conditional compilation RTE_MACHINE_CPUFLAG_SSSE3
flag I need to add around declaring the variable.
Nope, I was suggesting to move it inside the "if" block. So, this
is actually consistent with what you are trying to do. Besides, it
removes an declaration in the middle.

--yliu
Post by Jerin Jacob
Hope this justifies the reason. If you are not convinced then let me know,
if will add the change in next revision.
Jerin
Post by Yuanhan Liu
--yliu
Jerin Jacob
2016-07-04 12:15:57 UTC
Permalink
Post by Yuanhan Liu
Post by Jerin Jacob
Post by Yuanhan Liu
Post by Jerin Jacob
Post by Yuanhan Liu
Post by Jerin Jacob
@@ -494,9 +486,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
{
uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- struct virtio_hw *hw = dev->data->dev_private;
-#endif
struct virtnet_tx *txvq;
struct virtqueue *vq;
uint16_t tx_free_thresh;
@@ -511,13 +500,14 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
}
#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+ struct virtio_hw *hw = dev->data->dev_private;
I'd suggest to move above declaration to ...
Post by Jerin Jacob
/* Use simple rx/tx func if single segment and no offloads */
if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
!vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
here: we should try to avoid declaring vars in the middle of a code block.
Next patch in this series, moving all rxtx handler selection code to
separate function(virtio_update_rxtx_handler()) where declaration comes
as first line in the function.i.e the comment is taken care of in the
series.
Yes, I saw that. But in principle, each patch is atomic: it's not a
good idea/practice to introduce issues in path A and then fix it in
path B.
In my view it was not an issue as I was removing all possible
conditional compilation flag. If I were to move the declaration to top
then another conditional compilation RTE_MACHINE_CPUFLAG_SSSE3
flag I need to add around declaring the variable.
Nope, I was suggesting to move it inside the "if" block. So, this
is actually consistent with what you are trying to do. Besides, it
removes an declaration in the middle.
Just to get the clarity on "moving inside the 'if' block"

Are you suggesting to have like below?

#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+ struct virtio_hw *hw;
/* Use simple rx/tx func if single segment and no offloads */
if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) ==
VIRTIO_SIMPLE_FLAGS &&
!vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
PMD_INIT_LOG(INFO, "Using simple rx/tx path");
dev->tx_pkt_burst = virtio_xmit_pkts_simple;
dev->rx_pkt_burst = virtio_recv_pkts_vec;
- use_simple_rxtx = 1;
+ hw = dev->data->dev_private;
+ hw->use_simple_rxtx = 1;
}
#endif


Instead of following scheme in existing patch,

#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+ struct virtio_hw *hw = dev->data->dev_private;
/* Use simple rx/tx func if single segment and no offloads */
if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) ==
VIRTIO_SIMPLE_FLAGS &&
!vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
PMD_INIT_LOG(INFO, "Using simple rx/tx path");
dev->tx_pkt_burst = virtio_xmit_pkts_simple;
dev->rx_pkt_burst = virtio_recv_pkts_vec;
- use_simple_rxtx = 1;
+ hw->use_simple_rxtx = 1;
}
#endif


The former case will have issue as "hw" been used in "if" with vtpci_with_feature.

OR

if you meant just floating "struct virtio_hw *hw" without RTE_MACHINE_CPUFLAG_SSSE3
then it comes error on non x86 as unused "hw" variable.

If you meant something else then let me know?
Post by Yuanhan Liu
--yliu
Post by Jerin Jacob
Hope this justifies the reason. If you are not convinced then let me know,
if will add the change in next revision.
Jerin
Post by Yuanhan Liu
--yliu
Yuanhan Liu
2016-07-04 12:26:30 UTC
Permalink
Post by Jerin Jacob
Post by Yuanhan Liu
Post by Jerin Jacob
Post by Yuanhan Liu
Post by Jerin Jacob
Post by Yuanhan Liu
Post by Jerin Jacob
@@ -494,9 +486,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
{
uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- struct virtio_hw *hw = dev->data->dev_private;
-#endif
struct virtnet_tx *txvq;
struct virtqueue *vq;
uint16_t tx_free_thresh;
@@ -511,13 +500,14 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
}
#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+ struct virtio_hw *hw = dev->data->dev_private;
I'd suggest to move above declaration to ...
Post by Jerin Jacob
/* Use simple rx/tx func if single segment and no offloads */
if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
!vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
here: we should try to avoid declaring vars in the middle of a code block.
Next patch in this series, moving all rxtx handler selection code to
separate function(virtio_update_rxtx_handler()) where declaration comes
as first line in the function.i.e the comment is taken care of in the
series.
Yes, I saw that. But in principle, each patch is atomic: it's not a
good idea/practice to introduce issues in path A and then fix it in
path B.
In my view it was not an issue as I was removing all possible
conditional compilation flag. If I were to move the declaration to top
then another conditional compilation RTE_MACHINE_CPUFLAG_SSSE3
flag I need to add around declaring the variable.
Nope, I was suggesting to move it inside the "if" block. So, this
is actually consistent with what you are trying to do. Besides, it
removes an declaration in the middle.
Just to get the clarity on "moving inside the 'if' block"
Are you suggesting to have like below?
#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+ struct virtio_hw *hw;
/* Use simple rx/tx func if single segment and no offloads */
if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) ==
VIRTIO_SIMPLE_FLAGS &&
!vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
PMD_INIT_LOG(INFO, "Using simple rx/tx path");
dev->tx_pkt_burst = virtio_xmit_pkts_simple;
dev->rx_pkt_burst = virtio_recv_pkts_vec;
- use_simple_rxtx = 1;
+ hw = dev->data->dev_private;
+ hw->use_simple_rxtx = 1;
}
#endif
Instead of following scheme in existing patch,
#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+ struct virtio_hw *hw = dev->data->dev_private;
/* Use simple rx/tx func if single segment and no offloads */
if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) ==
VIRTIO_SIMPLE_FLAGS &&
!vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
PMD_INIT_LOG(INFO, "Using simple rx/tx path");
dev->tx_pkt_burst = virtio_xmit_pkts_simple;
dev->rx_pkt_burst = virtio_recv_pkts_vec;
- use_simple_rxtx = 1;
+ hw->use_simple_rxtx = 1;
}
#endif
The former case will have issue as "hw" been used in "if" with vtpci_with_feature.
Oh, my bad. I overlooked it. Sorry for that!
Post by Jerin Jacob
OR
if you meant just floating "struct virtio_hw *hw" without RTE_MACHINE_CPUFLAG_SSSE3
then it comes error on non x86 as unused "hw" variable.
If you meant something else then let me know?
I then prefer to keep the "#ifdef .. #endif" on top then. It will stop
us from offending a minor rule, while you can remove the ugly "#ifdef"
block in the next patch.

Works to you?

--yliu
Jerin Jacob
2016-07-04 12:50:42 UTC
Permalink
Post by Yuanhan Liu
Post by Jerin Jacob
Post by Yuanhan Liu
Post by Jerin Jacob
Post by Yuanhan Liu
Post by Jerin Jacob
Post by Yuanhan Liu
Post by Jerin Jacob
@@ -494,9 +486,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
{
uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- struct virtio_hw *hw = dev->data->dev_private;
-#endif
struct virtnet_tx *txvq;
struct virtqueue *vq;
uint16_t tx_free_thresh;
@@ -511,13 +500,14 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
}
#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+ struct virtio_hw *hw = dev->data->dev_private;
I'd suggest to move above declaration to ...
Post by Jerin Jacob
/* Use simple rx/tx func if single segment and no offloads */
if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
!vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
here: we should try to avoid declaring vars in the middle of a code block.
Next patch in this series, moving all rxtx handler selection code to
separate function(virtio_update_rxtx_handler()) where declaration comes
as first line in the function.i.e the comment is taken care of in the
series.
Yes, I saw that. But in principle, each patch is atomic: it's not a
good idea/practice to introduce issues in path A and then fix it in
path B.
In my view it was not an issue as I was removing all possible
conditional compilation flag. If I were to move the declaration to top
then another conditional compilation RTE_MACHINE_CPUFLAG_SSSE3
flag I need to add around declaring the variable.
Nope, I was suggesting to move it inside the "if" block. So, this
is actually consistent with what you are trying to do. Besides, it
removes an declaration in the middle.
Just to get the clarity on "moving inside the 'if' block"
Are you suggesting to have like below?
#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+ struct virtio_hw *hw;
/* Use simple rx/tx func if single segment and no offloads */
if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) ==
VIRTIO_SIMPLE_FLAGS &&
!vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
PMD_INIT_LOG(INFO, "Using simple rx/tx path");
dev->tx_pkt_burst = virtio_xmit_pkts_simple;
dev->rx_pkt_burst = virtio_recv_pkts_vec;
- use_simple_rxtx = 1;
+ hw = dev->data->dev_private;
+ hw->use_simple_rxtx = 1;
}
#endif
Instead of following scheme in existing patch,
#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+ struct virtio_hw *hw = dev->data->dev_private;
/* Use simple rx/tx func if single segment and no offloads */
if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) ==
VIRTIO_SIMPLE_FLAGS &&
!vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
PMD_INIT_LOG(INFO, "Using simple rx/tx path");
dev->tx_pkt_burst = virtio_xmit_pkts_simple;
dev->rx_pkt_burst = virtio_recv_pkts_vec;
- use_simple_rxtx = 1;
+ hw->use_simple_rxtx = 1;
}
#endif
The former case will have issue as "hw" been used in "if" with vtpci_with_feature.
Oh, my bad. I overlooked it. Sorry for that!
Post by Jerin Jacob
OR
if you meant just floating "struct virtio_hw *hw" without RTE_MACHINE_CPUFLAG_SSSE3
then it comes error on non x86 as unused "hw" variable.
If you meant something else then let me know?
I then prefer to keep the "#ifdef .. #endif" on top then. It will stop
us from offending a minor rule, while you can remove the ugly "#ifdef"
block in the next patch.
Works to you?
OK. As you wish :-)
Post by Yuanhan Liu
--yliu
Yuanhan Liu
2016-07-04 12:57:15 UTC
Permalink
Post by Jerin Jacob
Post by Yuanhan Liu
Post by Jerin Jacob
The former case will have issue as "hw" been used in "if" with vtpci_with_feature.
Oh, my bad. I overlooked it. Sorry for that!
Post by Jerin Jacob
OR
if you meant just floating "struct virtio_hw *hw" without RTE_MACHINE_CPUFLAG_SSSE3
then it comes error on non x86 as unused "hw" variable.
If you meant something else then let me know?
I then prefer to keep the "#ifdef .. #endif" on top then. It will stop
us from offending a minor rule, while you can remove the ugly "#ifdef"
block in the next patch.
Works to you?
OK. As you wish :-)
Thank you!

--yliu
Jerin Jacob
2016-07-01 11:16:37 UTC
Permalink
* Introduced cpuflag based run-time detection to
select the SSE based simple Rx handler
* Split out SSE instruction based virtio simple Rx
implementation to a separate file

Signed-off-by: Jerin Jacob <***@caviumnetworks.com>
---
drivers/net/virtio/Makefile | 4 +
drivers/net/virtio/virtio_rxtx.c | 35 ++--
drivers/net/virtio/virtio_rxtx_simple.c | 273 ++--------------------------
drivers/net/virtio/virtio_rxtx_simple.h | 133 ++++++++++++++
drivers/net/virtio/virtio_rxtx_simple_sse.c | 222 ++++++++++++++++++++++
5 files changed, 394 insertions(+), 273 deletions(-)
create mode 100644 drivers/net/virtio/virtio_rxtx_simple.h
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.c

diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
index b9b0d8d..c4103b7 100644
--- a/drivers/net/virtio/Makefile
+++ b/drivers/net/virtio/Makefile
@@ -52,6 +52,10 @@ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx.c
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_ethdev.c
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple.c

+ifeq ($(CONFIG_RTE_ARCH_X86),y)
+SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple_sse.c
+endif
+
ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/virtio_user_dev.c
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index 63b53f7..a4d4a57 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -50,6 +50,7 @@
#include <rte_string_fns.h>
#include <rte_errno.h>
#include <rte_byteorder.h>
+#include <rte_cpuflags.h>

#include "virtio_logs.h"
#include "virtio_ethdev.h"
@@ -470,6 +471,28 @@ virtio_dev_rx_queue_release(void *rxq)
rte_memzone_free(mz);
}

+static void
+virtio_update_rxtx_handler(struct rte_eth_dev *dev,
+ const struct rte_eth_txconf *tx_conf)
+{
+ uint8_t use_simple_rxtx = 0;
+ struct virtio_hw *hw = dev->data->dev_private;
+
+#if defined RTE_ARCH_X86
+ if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE3))
+ use_simple_rxtx = 1;
+#endif
+ /* Use simple rx/tx func if single segment and no offloads */
+ if (use_simple_rxtx &&
+ (tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
+ !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
+ PMD_INIT_LOG(INFO, "Using simple rx/tx path");
+ dev->tx_pkt_burst = virtio_xmit_pkts_simple;
+ dev->rx_pkt_burst = virtio_recv_pkts_vec;
+ hw->use_simple_rxtx = use_simple_rxtx;
+ }
+}
+
/*
* struct rte_eth_dev *dev: Used to update dev
* uint16_t nb_desc: Defaults to values read from config space
@@ -499,17 +522,7 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
return -EINVAL;
}

-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- struct virtio_hw *hw = dev->data->dev_private;
- /* Use simple rx/tx func if single segment and no offloads */
- if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
- !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
- PMD_INIT_LOG(INFO, "Using simple rx/tx path");
- dev->tx_pkt_burst = virtio_xmit_pkts_simple;
- dev->rx_pkt_burst = virtio_recv_pkts_vec;
- hw->use_simple_rxtx = 1;
- }
-#endif
+ virtio_update_rxtx_handler(dev, tx_conf);

ret = virtio_dev_queue_setup(dev, VTNET_TQ, queue_idx, vtpci_queue_idx,
nb_desc, socket_id, (void **)&txvq);
diff --git a/drivers/net/virtio/virtio_rxtx_simple.c b/drivers/net/virtio/virtio_rxtx_simple.c
index 67430da..485ddce 100644
--- a/drivers/net/virtio/virtio_rxtx_simple.c
+++ b/drivers/net/virtio/virtio_rxtx_simple.c
@@ -51,14 +51,7 @@
#include <rte_errno.h>
#include <rte_byteorder.h>

-#include "virtio_logs.h"
-#include "virtio_ethdev.h"
-#include "virtqueue.h"
-#include "virtio_rxtx.h"
-
-#define RTE_VIRTIO_VPMD_RX_BURST 32
-#define RTE_VIRTIO_DESC_PER_LOOP 8
-#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST
+#include "virtio_rxtx_simple.h"

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wcast-qual"
@@ -89,260 +82,6 @@ virtqueue_enqueue_recv_refill_simple(struct virtqueue *vq,
return 0;
}

-static inline void
-virtio_rxq_rearm_vec(struct virtnet_rx *rxvq)
-{
- int i;
- uint16_t desc_idx;
- struct rte_mbuf **sw_ring;
- struct vring_desc *start_dp;
- int ret;
- struct virtqueue *vq = rxvq->vq;
-
- desc_idx = vq->vq_avail_idx & (vq->vq_nentries - 1);
- sw_ring = &vq->sw_ring[desc_idx];
- start_dp = &vq->vq_ring.desc[desc_idx];
-
- ret = rte_mempool_get_bulk(rxvq->mpool, (void **)sw_ring,
- RTE_VIRTIO_VPMD_RX_REARM_THRESH);
- if (unlikely(ret)) {
- rte_eth_devices[rxvq->port_id].data->rx_mbuf_alloc_failed +=
- RTE_VIRTIO_VPMD_RX_REARM_THRESH;
- return;
- }
-
- for (i = 0; i < RTE_VIRTIO_VPMD_RX_REARM_THRESH; i++) {
- uintptr_t p;
-
- p = (uintptr_t)&sw_ring[i]->rearm_data;
- *(uint64_t *)p = rxvq->mbuf_initializer;
-
- start_dp[i].addr =
- MBUF_DATA_DMA_ADDR(sw_ring[i], vq->offset) -
- vq->hw->vtnet_hdr_size;
- start_dp[i].len = sw_ring[i]->buf_len -
- RTE_PKTMBUF_HEADROOM + vq->hw->vtnet_hdr_size;
- }
-
- vq->vq_avail_idx += RTE_VIRTIO_VPMD_RX_REARM_THRESH;
- vq->vq_free_cnt -= RTE_VIRTIO_VPMD_RX_REARM_THRESH;
- vq_update_avail_idx(vq);
-}
-
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-
-#include <tmmintrin.h>
-
-/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
- *
- * This routine is for non-mergeable RX, one desc for each guest buffer.
- * This routine is based on the RX ring layout optimization. Each entry in the
- * avail ring points to the desc with the same index in the desc ring and this
- * will never be changed in the driver.
- *
- * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet
- */
-uint16_t
-virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
- uint16_t nb_pkts)
-{
- struct virtnet_rx *rxvq = rx_queue;
- struct virtqueue *vq = rxvq->vq;
- uint16_t nb_used;
- uint16_t desc_idx;
- struct vring_used_elem *rused;
- struct rte_mbuf **sw_ring;
- struct rte_mbuf **sw_ring_end;
- uint16_t nb_pkts_received;
- __m128i shuf_msk1, shuf_msk2, len_adjust;
-
- shuf_msk1 = _mm_set_epi8(
- 0xFF, 0xFF, 0xFF, 0xFF,
- 0xFF, 0xFF, /* vlan tci */
- 5, 4, /* dat len */
- 0xFF, 0xFF, 5, 4, /* pkt len */
- 0xFF, 0xFF, 0xFF, 0xFF /* packet type */
-
- );
-
- shuf_msk2 = _mm_set_epi8(
- 0xFF, 0xFF, 0xFF, 0xFF,
- 0xFF, 0xFF, /* vlan tci */
- 13, 12, /* dat len */
- 0xFF, 0xFF, 13, 12, /* pkt len */
- 0xFF, 0xFF, 0xFF, 0xFF /* packet type */
- );
-
- /* Subtract the header length.
- * In which case do we need the header length in used->len ?
- */
- len_adjust = _mm_set_epi16(
- 0, 0,
- 0,
- (uint16_t)-vq->hw->vtnet_hdr_size,
- 0, (uint16_t)-vq->hw->vtnet_hdr_size,
- 0, 0);
-
- if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
- return 0;
-
- nb_used = VIRTQUEUE_NUSED(vq);
-
- rte_compiler_barrier();
-
- if (unlikely(nb_used == 0))
- return 0;
-
- nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP);
- nb_used = RTE_MIN(nb_used, nb_pkts);
-
- desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
- rused = &vq->vq_ring.used->ring[desc_idx];
- sw_ring = &vq->sw_ring[desc_idx];
- sw_ring_end = &vq->sw_ring[vq->vq_nentries];
-
- _mm_prefetch((const void *)rused, _MM_HINT_T0);
-
- if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
- virtio_rxq_rearm_vec(rxvq);
- if (unlikely(virtqueue_kick_prepare(vq)))
- virtqueue_notify(vq);
- }
-
- for (nb_pkts_received = 0;
- nb_pkts_received < nb_used;) {
- __m128i desc[RTE_VIRTIO_DESC_PER_LOOP / 2];
- __m128i mbp[RTE_VIRTIO_DESC_PER_LOOP / 2];
- __m128i pkt_mb[RTE_VIRTIO_DESC_PER_LOOP];
-
- mbp[0] = _mm_loadu_si128((__m128i *)(sw_ring + 0));
- desc[0] = _mm_loadu_si128((__m128i *)(rused + 0));
- _mm_storeu_si128((__m128i *)&rx_pkts[0], mbp[0]);
-
- mbp[1] = _mm_loadu_si128((__m128i *)(sw_ring + 2));
- desc[1] = _mm_loadu_si128((__m128i *)(rused + 2));
- _mm_storeu_si128((__m128i *)&rx_pkts[2], mbp[1]);
-
- mbp[2] = _mm_loadu_si128((__m128i *)(sw_ring + 4));
- desc[2] = _mm_loadu_si128((__m128i *)(rused + 4));
- _mm_storeu_si128((__m128i *)&rx_pkts[4], mbp[2]);
-
- mbp[3] = _mm_loadu_si128((__m128i *)(sw_ring + 6));
- desc[3] = _mm_loadu_si128((__m128i *)(rused + 6));
- _mm_storeu_si128((__m128i *)&rx_pkts[6], mbp[3]);
-
- pkt_mb[1] = _mm_shuffle_epi8(desc[0], shuf_msk2);
- pkt_mb[0] = _mm_shuffle_epi8(desc[0], shuf_msk1);
- pkt_mb[1] = _mm_add_epi16(pkt_mb[1], len_adjust);
- pkt_mb[0] = _mm_add_epi16(pkt_mb[0], len_adjust);
- _mm_storeu_si128((void *)&rx_pkts[1]->rx_descriptor_fields1,
- pkt_mb[1]);
- _mm_storeu_si128((void *)&rx_pkts[0]->rx_descriptor_fields1,
- pkt_mb[0]);
-
- pkt_mb[3] = _mm_shuffle_epi8(desc[1], shuf_msk2);
- pkt_mb[2] = _mm_shuffle_epi8(desc[1], shuf_msk1);
- pkt_mb[3] = _mm_add_epi16(pkt_mb[3], len_adjust);
- pkt_mb[2] = _mm_add_epi16(pkt_mb[2], len_adjust);
- _mm_storeu_si128((void *)&rx_pkts[3]->rx_descriptor_fields1,
- pkt_mb[3]);
- _mm_storeu_si128((void *)&rx_pkts[2]->rx_descriptor_fields1,
- pkt_mb[2]);
-
- pkt_mb[5] = _mm_shuffle_epi8(desc[2], shuf_msk2);
- pkt_mb[4] = _mm_shuffle_epi8(desc[2], shuf_msk1);
- pkt_mb[5] = _mm_add_epi16(pkt_mb[5], len_adjust);
- pkt_mb[4] = _mm_add_epi16(pkt_mb[4], len_adjust);
- _mm_storeu_si128((void *)&rx_pkts[5]->rx_descriptor_fields1,
- pkt_mb[5]);
- _mm_storeu_si128((void *)&rx_pkts[4]->rx_descriptor_fields1,
- pkt_mb[4]);
-
- pkt_mb[7] = _mm_shuffle_epi8(desc[3], shuf_msk2);
- pkt_mb[6] = _mm_shuffle_epi8(desc[3], shuf_msk1);
- pkt_mb[7] = _mm_add_epi16(pkt_mb[7], len_adjust);
- pkt_mb[6] = _mm_add_epi16(pkt_mb[6], len_adjust);
- _mm_storeu_si128((void *)&rx_pkts[7]->rx_descriptor_fields1,
- pkt_mb[7]);
- _mm_storeu_si128((void *)&rx_pkts[6]->rx_descriptor_fields1,
- pkt_mb[6]);
-
- if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) {
- if (sw_ring + nb_used <= sw_ring_end)
- nb_pkts_received += nb_used;
- else
- nb_pkts_received += sw_ring_end - sw_ring;
- break;
- } else {
- if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >=
- sw_ring_end)) {
- nb_pkts_received += sw_ring_end - sw_ring;
- break;
- } else {
- nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP;
-
- rx_pkts += RTE_VIRTIO_DESC_PER_LOOP;
- sw_ring += RTE_VIRTIO_DESC_PER_LOOP;
- rused += RTE_VIRTIO_DESC_PER_LOOP;
- nb_used -= RTE_VIRTIO_DESC_PER_LOOP;
- }
- }
- }
-
- vq->vq_used_cons_idx += nb_pkts_received;
- vq->vq_free_cnt += nb_pkts_received;
- rxvq->stats.packets += nb_pkts_received;
- return nb_pkts_received;
-}
-
-#endif
-
-#define VIRTIO_TX_FREE_THRESH 32
-#define VIRTIO_TX_MAX_FREE_BUF_SZ 32
-#define VIRTIO_TX_FREE_NR 32
-/* TODO: vq->tx_free_cnt could mean num of free slots so we could avoid shift */
-static inline void
-virtio_xmit_cleanup(struct virtqueue *vq)
-{
- uint16_t i, desc_idx;
- int nb_free = 0;
- struct rte_mbuf *m, *free[VIRTIO_TX_MAX_FREE_BUF_SZ];
-
- desc_idx = (uint16_t)(vq->vq_used_cons_idx &
- ((vq->vq_nentries >> 1) - 1));
- m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
- m = __rte_pktmbuf_prefree_seg(m);
- if (likely(m != NULL)) {
- free[0] = m;
- nb_free = 1;
- for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
- m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
- m = __rte_pktmbuf_prefree_seg(m);
- if (likely(m != NULL)) {
- if (likely(m->pool == free[0]->pool))
- free[nb_free++] = m;
- else {
- rte_mempool_put_bulk(free[0]->pool,
- (void **)free, nb_free);
- free[0] = m;
- nb_free = 1;
- }
- }
- }
- rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
- } else {
- for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
- m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
- m = __rte_pktmbuf_prefree_seg(m);
- if (m != NULL)
- rte_mempool_put(m->pool, m);
- }
- }
-
- vq->vq_used_cons_idx += VIRTIO_TX_FREE_NR;
- vq->vq_free_cnt += (VIRTIO_TX_FREE_NR << 1);
-}
-
uint16_t
virtio_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts)
@@ -423,3 +162,13 @@ virtio_rxq_vec_setup(struct virtnet_rx *rxq)

return 0;
}
+
+/* Stub for linkage when arch specific implementation is not available */
+uint16_t __attribute__((weak))
+virtio_recv_pkts_vec(void *rx_queue __rte_unused,
+ struct rte_mbuf **rx_pkts __rte_unused,
+ uint16_t nb_pkts __rte_unused)
+{
+ rte_panic("Wrong weak function linked by linker\n");
+ return 0;
+}
diff --git a/drivers/net/virtio/virtio_rxtx_simple.h b/drivers/net/virtio/virtio_rxtx_simple.h
new file mode 100644
index 0000000..8cb43c0
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_simple.h
@@ -0,0 +1,133 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VIRTIO_RXTX_SIMPLE_H_
+#define _VIRTIO_RXTX_SIMPLE_H_
+
+#include <stdint.h>
+
+#include "virtio_logs.h"
+#include "virtio_ethdev.h"
+#include "virtqueue.h"
+#include "virtio_rxtx.h"
+
+#define RTE_VIRTIO_VPMD_RX_BURST 32
+#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST
+
+static inline void
+virtio_rxq_rearm_vec(struct virtnet_rx *rxvq)
+{
+ int i;
+ uint16_t desc_idx;
+ struct rte_mbuf **sw_ring;
+ struct vring_desc *start_dp;
+ int ret;
+ struct virtqueue *vq = rxvq->vq;
+
+ desc_idx = vq->vq_avail_idx & (vq->vq_nentries - 1);
+ sw_ring = &vq->sw_ring[desc_idx];
+ start_dp = &vq->vq_ring.desc[desc_idx];
+
+ ret = rte_mempool_get_bulk(rxvq->mpool, (void **)sw_ring,
+ RTE_VIRTIO_VPMD_RX_REARM_THRESH);
+ if (unlikely(ret)) {
+ rte_eth_devices[rxvq->port_id].data->rx_mbuf_alloc_failed +=
+ RTE_VIRTIO_VPMD_RX_REARM_THRESH;
+ return;
+ }
+
+ for (i = 0; i < RTE_VIRTIO_VPMD_RX_REARM_THRESH; i++) {
+ uintptr_t p;
+
+ p = (uintptr_t)&sw_ring[i]->rearm_data;
+ *(uint64_t *)p = rxvq->mbuf_initializer;
+
+ start_dp[i].addr =
+ MBUF_DATA_DMA_ADDR(sw_ring[i], vq->offset) -
+ vq->hw->vtnet_hdr_size;
+ start_dp[i].len = sw_ring[i]->buf_len -
+ RTE_PKTMBUF_HEADROOM + vq->hw->vtnet_hdr_size;
+ }
+
+ vq->vq_avail_idx += RTE_VIRTIO_VPMD_RX_REARM_THRESH;
+ vq->vq_free_cnt -= RTE_VIRTIO_VPMD_RX_REARM_THRESH;
+ vq_update_avail_idx(vq);
+}
+
+#define VIRTIO_TX_FREE_THRESH 32
+#define VIRTIO_TX_MAX_FREE_BUF_SZ 32
+#define VIRTIO_TX_FREE_NR 32
+/* TODO: vq->tx_free_cnt could mean num of free slots so we could avoid shift */
+static inline void
+virtio_xmit_cleanup(struct virtqueue *vq)
+{
+ uint16_t i, desc_idx;
+ int nb_free = 0;
+ struct rte_mbuf *m, *free[VIRTIO_TX_MAX_FREE_BUF_SZ];
+
+ desc_idx = (uint16_t)(vq->vq_used_cons_idx &
+ ((vq->vq_nentries >> 1) - 1));
+ m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
+ m = __rte_pktmbuf_prefree_seg(m);
+ if (likely(m != NULL)) {
+ free[0] = m;
+ nb_free = 1;
+ for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
+ m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
+ m = __rte_pktmbuf_prefree_seg(m);
+ if (likely(m != NULL)) {
+ if (likely(m->pool == free[0]->pool))
+ free[nb_free++] = m;
+ else {
+ rte_mempool_put_bulk(free[0]->pool,
+ (void **)free, nb_free);
+ free[0] = m;
+ nb_free = 1;
+ }
+ }
+ }
+ rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
+ } else {
+ for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
+ m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
+ m = __rte_pktmbuf_prefree_seg(m);
+ if (m != NULL)
+ rte_mempool_put(m->pool, m);
+ }
+ }
+
+ vq->vq_used_cons_idx += VIRTIO_TX_FREE_NR;
+ vq->vq_free_cnt += (VIRTIO_TX_FREE_NR << 1);
+}
+
+#endif /* _VIRTIO_RXTX_SIMPLE_H_ */
diff --git a/drivers/net/virtio/virtio_rxtx_simple_sse.c b/drivers/net/virtio/virtio_rxtx_simple_sse.c
new file mode 100644
index 0000000..39000e8
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_simple_sse.c
@@ -0,0 +1,222 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <tmmintrin.h>
+
+#include <rte_byteorder.h>
+#include <rte_branch_prediction.h>
+#include <rte_cycles.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+#include <rte_prefetch.h>
+#include <rte_string_fns.h>
+
+#include "virtio_rxtx_simple.h"
+
+#define RTE_VIRTIO_VPMD_RX_BURST 32
+#define RTE_VIRTIO_DESC_PER_LOOP 8
+#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST
+
+/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
+ *
+ * This routine is for non-mergeable RX, one desc for each guest buffer.
+ * This routine is based on the RX ring layout optimization. Each entry in the
+ * avail ring points to the desc with the same index in the desc ring and this
+ * will never be changed in the driver.
+ *
+ * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet
+ */
+uint16_t
+virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ struct virtnet_rx *rxvq = rx_queue;
+ struct virtqueue *vq = rxvq->vq;
+ uint16_t nb_used;
+ uint16_t desc_idx;
+ struct vring_used_elem *rused;
+ struct rte_mbuf **sw_ring;
+ struct rte_mbuf **sw_ring_end;
+ uint16_t nb_pkts_received;
+ __m128i shuf_msk1, shuf_msk2, len_adjust;
+
+ shuf_msk1 = _mm_set_epi8(
+ 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, /* vlan tci */
+ 5, 4, /* dat len */
+ 0xFF, 0xFF, 5, 4, /* pkt len */
+ 0xFF, 0xFF, 0xFF, 0xFF /* packet type */
+
+ );
+
+ shuf_msk2 = _mm_set_epi8(
+ 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, /* vlan tci */
+ 13, 12, /* dat len */
+ 0xFF, 0xFF, 13, 12, /* pkt len */
+ 0xFF, 0xFF, 0xFF, 0xFF /* packet type */
+ );
+
+ /* Subtract the header length.
+ * In which case do we need the header length in used->len ?
+ */
+ len_adjust = _mm_set_epi16(
+ 0, 0,
+ 0,
+ (uint16_t)-vq->hw->vtnet_hdr_size,
+ 0, (uint16_t)-vq->hw->vtnet_hdr_size,
+ 0, 0);
+
+ if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
+ return 0;
+
+ nb_used = VIRTQUEUE_NUSED(vq);
+
+ rte_compiler_barrier();
+
+ if (unlikely(nb_used == 0))
+ return 0;
+
+ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP);
+ nb_used = RTE_MIN(nb_used, nb_pkts);
+
+ desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
+ rused = &vq->vq_ring.used->ring[desc_idx];
+ sw_ring = &vq->sw_ring[desc_idx];
+ sw_ring_end = &vq->sw_ring[vq->vq_nentries];
+
+ _mm_prefetch((const void *)rused, _MM_HINT_T0);
+
+ if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
+ virtio_rxq_rearm_vec(rxvq);
+ if (unlikely(virtqueue_kick_prepare(vq)))
+ virtqueue_notify(vq);
+ }
+
+ for (nb_pkts_received = 0;
+ nb_pkts_received < nb_used;) {
+ __m128i desc[RTE_VIRTIO_DESC_PER_LOOP / 2];
+ __m128i mbp[RTE_VIRTIO_DESC_PER_LOOP / 2];
+ __m128i pkt_mb[RTE_VIRTIO_DESC_PER_LOOP];
+
+ mbp[0] = _mm_loadu_si128((__m128i *)(sw_ring + 0));
+ desc[0] = _mm_loadu_si128((__m128i *)(rused + 0));
+ _mm_storeu_si128((__m128i *)&rx_pkts[0], mbp[0]);
+
+ mbp[1] = _mm_loadu_si128((__m128i *)(sw_ring + 2));
+ desc[1] = _mm_loadu_si128((__m128i *)(rused + 2));
+ _mm_storeu_si128((__m128i *)&rx_pkts[2], mbp[1]);
+
+ mbp[2] = _mm_loadu_si128((__m128i *)(sw_ring + 4));
+ desc[2] = _mm_loadu_si128((__m128i *)(rused + 4));
+ _mm_storeu_si128((__m128i *)&rx_pkts[4], mbp[2]);
+
+ mbp[3] = _mm_loadu_si128((__m128i *)(sw_ring + 6));
+ desc[3] = _mm_loadu_si128((__m128i *)(rused + 6));
+ _mm_storeu_si128((__m128i *)&rx_pkts[6], mbp[3]);
+
+ pkt_mb[1] = _mm_shuffle_epi8(desc[0], shuf_msk2);
+ pkt_mb[0] = _mm_shuffle_epi8(desc[0], shuf_msk1);
+ pkt_mb[1] = _mm_add_epi16(pkt_mb[1], len_adjust);
+ pkt_mb[0] = _mm_add_epi16(pkt_mb[0], len_adjust);
+ _mm_storeu_si128((void *)&rx_pkts[1]->rx_descriptor_fields1,
+ pkt_mb[1]);
+ _mm_storeu_si128((void *)&rx_pkts[0]->rx_descriptor_fields1,
+ pkt_mb[0]);
+
+ pkt_mb[3] = _mm_shuffle_epi8(desc[1], shuf_msk2);
+ pkt_mb[2] = _mm_shuffle_epi8(desc[1], shuf_msk1);
+ pkt_mb[3] = _mm_add_epi16(pkt_mb[3], len_adjust);
+ pkt_mb[2] = _mm_add_epi16(pkt_mb[2], len_adjust);
+ _mm_storeu_si128((void *)&rx_pkts[3]->rx_descriptor_fields1,
+ pkt_mb[3]);
+ _mm_storeu_si128((void *)&rx_pkts[2]->rx_descriptor_fields1,
+ pkt_mb[2]);
+
+ pkt_mb[5] = _mm_shuffle_epi8(desc[2], shuf_msk2);
+ pkt_mb[4] = _mm_shuffle_epi8(desc[2], shuf_msk1);
+ pkt_mb[5] = _mm_add_epi16(pkt_mb[5], len_adjust);
+ pkt_mb[4] = _mm_add_epi16(pkt_mb[4], len_adjust);
+ _mm_storeu_si128((void *)&rx_pkts[5]->rx_descriptor_fields1,
+ pkt_mb[5]);
+ _mm_storeu_si128((void *)&rx_pkts[4]->rx_descriptor_fields1,
+ pkt_mb[4]);
+
+ pkt_mb[7] = _mm_shuffle_epi8(desc[3], shuf_msk2);
+ pkt_mb[6] = _mm_shuffle_epi8(desc[3], shuf_msk1);
+ pkt_mb[7] = _mm_add_epi16(pkt_mb[7], len_adjust);
+ pkt_mb[6] = _mm_add_epi16(pkt_mb[6], len_adjust);
+ _mm_storeu_si128((void *)&rx_pkts[7]->rx_descriptor_fields1,
+ pkt_mb[7]);
+ _mm_storeu_si128((void *)&rx_pkts[6]->rx_descriptor_fields1,
+ pkt_mb[6]);
+
+ if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) {
+ if (sw_ring + nb_used <= sw_ring_end)
+ nb_pkts_received += nb_used;
+ else
+ nb_pkts_received += sw_ring_end - sw_ring;
+ break;
+ } else {
+ if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >=
+ sw_ring_end)) {
+ nb_pkts_received += sw_ring_end - sw_ring;
+ break;
+ } else {
+ nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP;
+
+ rx_pkts += RTE_VIRTIO_DESC_PER_LOOP;
+ sw_ring += RTE_VIRTIO_DESC_PER_LOOP;
+ rused += RTE_VIRTIO_DESC_PER_LOOP;
+ nb_used -= RTE_VIRTIO_DESC_PER_LOOP;
+ }
+ }
+ }
+
+ vq->vq_used_cons_idx += nb_pkts_received;
+ vq->vq_free_cnt += nb_pkts_received;
+ rxvq->stats.packets += nb_pkts_received;
+ return nb_pkts_received;
+}
--
2.5.5
Yuanhan Liu
2016-07-04 07:42:47 UTC
Permalink
Post by Jerin Jacob
* Introduced cpuflag based run-time detection to
select the SSE based simple Rx handler
* Split out SSE instruction based virtio simple Rx
implementation to a separate file
As your commit log says, it does two things, therefore, I'd suggest you
to do it in two patches, with each just does one thing as you mentioned.
Post by Jerin Jacob
+static void
+virtio_update_rxtx_handler(struct rte_eth_dev *dev,
+ const struct rte_eth_txconf *tx_conf)
+{
+ uint8_t use_simple_rxtx = 0;
+ struct virtio_hw *hw = dev->data->dev_private;
+
+#if defined RTE_ARCH_X86
+ if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE3))
+ use_simple_rxtx = 1;
+#endif
+ /* Use simple rx/tx func if single segment and no offloads */
+ if (use_simple_rxtx &&
+ (tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
+ !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
The alignment here is not consistent, something like following is what
I'd suggest:

if (use_simple_rxtx &&
(tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
!vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {

--yliu
Jerin Jacob
2016-07-04 08:38:14 UTC
Permalink
Post by Yuanhan Liu
Post by Jerin Jacob
* Introduced cpuflag based run-time detection to
select the SSE based simple Rx handler
* Split out SSE instruction based virtio simple Rx
implementation to a separate file
As your commit log says, it does two things, therefore, I'd suggest you
to do it in two patches, with each just does one thing as you mentioned.
OK. Will fix it in next revision.
Post by Yuanhan Liu
Post by Jerin Jacob
+static void
+virtio_update_rxtx_handler(struct rte_eth_dev *dev,
+ const struct rte_eth_txconf *tx_conf)
+{
+ uint8_t use_simple_rxtx = 0;
+ struct virtio_hw *hw = dev->data->dev_private;
+
+#if defined RTE_ARCH_X86
+ if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE3))
+ use_simple_rxtx = 1;
+#endif
+ /* Use simple rx/tx func if single segment and no offloads */
+ if (use_simple_rxtx &&
+ (tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
+ !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
The alignment here is not consistent, something like following is what
if (use_simple_rxtx &&
(tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
!vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
OK. Will fix it in next revision.
Post by Yuanhan Liu
--yliu
Jerin Jacob
2016-07-01 11:16:38 UTC
Permalink
Added neon based Rx vector implementation.
Selection of the new handler based neon availability at runtime.
Updated the release notes and MAINTAINERS file.

Signed-off-by: Jerin Jacob <***@caviumnetworks.com>
---
MAINTAINERS | 1 +
doc/guides/rel_notes/release_16_07.rst | 2 +
drivers/net/virtio/Makefile | 2 +
drivers/net/virtio/virtio_rxtx.c | 3 +
drivers/net/virtio/virtio_rxtx_simple_neon.c | 235 +++++++++++++++++++++++++++
5 files changed, 243 insertions(+)
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.c

diff --git a/MAINTAINERS b/MAINTAINERS
index a59191e..ab04cee 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -143,6 +143,7 @@ F: lib/librte_acl/acl_run_neon.*
F: lib/librte_lpm/rte_lpm_neon.h
F: lib/librte_hash/rte*_arm64.h
F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
+F: drivers/net/virtio/virtio_rxtx_simple_neon.c

EZchip TILE-Gx
M: Zhigang Lu <***@ezchip.com>
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index 9e2a817..3a5add5 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -174,6 +174,8 @@ New Features
section of the "Network Interface Controller Drivers" document.


+* **Virtio NEON support for ARM.**
+
Resolved Issues
---------------

diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
index c4103b7..97972a6 100644
--- a/drivers/net/virtio/Makefile
+++ b/drivers/net/virtio/Makefile
@@ -54,6 +54,8 @@ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple.c

ifeq ($(CONFIG_RTE_ARCH_X86),y)
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple_sse.c
+else ifneq ($(filter y,$(CONFIG_RTE_ARCH_ARM) $(CONFIG_RTE_ARCH_ARM64)),)
+SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple_neon.c
endif

ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index a4d4a57..19d1742 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -481,6 +481,9 @@ virtio_update_rxtx_handler(struct rte_eth_dev *dev,
#if defined RTE_ARCH_X86
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE3))
use_simple_rxtx = 1;
+#elif defined RTE_ARCH_ARM64 || defined CONFIG_RTE_ARCH_ARM
+ if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+ use_simple_rxtx = 1;
#endif
/* Use simple rx/tx func if single segment and no offloads */
if (use_simple_rxtx &&
diff --git a/drivers/net/virtio/virtio_rxtx_simple_neon.c b/drivers/net/virtio/virtio_rxtx_simple_neon.c
new file mode 100644
index 0000000..793eefb
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_simple_neon.c
@@ -0,0 +1,235 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <rte_byteorder.h>
+#include <rte_branch_prediction.h>
+#include <rte_cycles.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+#include <rte_prefetch.h>
+#include <rte_string_fns.h>
+#include <rte_vect.h>
+
+#include "virtio_rxtx_simple.h"
+
+#define RTE_VIRTIO_VPMD_RX_BURST 32
+#define RTE_VIRTIO_DESC_PER_LOOP 8
+#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST
+
+/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
+ *
+ * This routine is for non-mergeable RX, one desc for each guest buffer.
+ * This routine is based on the RX ring layout optimization. Each entry in the
+ * avail ring points to the desc with the same index in the desc ring and this
+ * will never be changed in the driver.
+ *
+ * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet
+ */
+uint16_t
+virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ struct virtnet_rx *rxvq = rx_queue;
+ struct virtqueue *vq = rxvq->vq;
+ uint16_t nb_used;
+ uint16_t desc_idx;
+ struct vring_used_elem *rused;
+ struct rte_mbuf **sw_ring;
+ struct rte_mbuf **sw_ring_end;
+ uint16_t nb_pkts_received;
+
+ uint8x16_t shuf_msk1 = {
+ 0xFF, 0xFF, 0xFF, 0xFF, /* packet type */
+ 4, 5, 0xFF, 0xFF, /* pkt len */
+ 4, 5, /* dat len */
+ 0xFF, 0xFF, /* vlan tci */
+ 0xFF, 0xFF, 0xFF, 0xFF
+ };
+
+ uint8x16_t shuf_msk2 = {
+ 0xFF, 0xFF, 0xFF, 0xFF, /* packet type */
+ 12, 13, 0xFF, 0xFF, /* pkt len */
+ 12, 13, /* dat len */
+ 0xFF, 0xFF, /* vlan tci */
+ 0xFF, 0xFF, 0xFF, 0xFF
+ };
+
+ /* Subtract the header length.
+ * In which case do we need the header length in used->len ?
+ */
+ uint16x8_t len_adjust = {
+ 0, 0,
+ (uint16_t)vq->hw->vtnet_hdr_size, 0,
+ (uint16_t)vq->hw->vtnet_hdr_size,
+ 0,
+ 0, 0
+ };
+
+ if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
+ return 0;
+
+ nb_used = VIRTQUEUE_NUSED(vq);
+
+ rte_rmb();
+
+ if (unlikely(nb_used == 0))
+ return 0;
+
+ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP);
+ nb_used = RTE_MIN(nb_used, nb_pkts);
+
+ desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
+ rused = &vq->vq_ring.used->ring[desc_idx];
+ sw_ring = &vq->sw_ring[desc_idx];
+ sw_ring_end = &vq->sw_ring[vq->vq_nentries];
+
+ rte_prefetch_non_temporal(rused);
+
+ if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
+ virtio_rxq_rearm_vec(rxvq);
+ if (unlikely(virtqueue_kick_prepare(vq)))
+ virtqueue_notify(vq);
+ }
+
+ for (nb_pkts_received = 0;
+ nb_pkts_received < nb_used;) {
+ uint64x2_t desc[RTE_VIRTIO_DESC_PER_LOOP / 2];
+ uint64x2_t mbp[RTE_VIRTIO_DESC_PER_LOOP / 2];
+ uint64x2_t pkt_mb[RTE_VIRTIO_DESC_PER_LOOP];
+
+ mbp[0] = vld1q_u64((uint64_t *)(sw_ring + 0));
+ desc[0] = vld1q_u64((uint64_t *)(rused + 0));
+ vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0]);
+
+ mbp[1] = vld1q_u64((uint64_t *)(sw_ring + 2));
+ desc[1] = vld1q_u64((uint64_t *)(rused + 2));
+ vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1]);
+
+ mbp[2] = vld1q_u64((uint64_t *)(sw_ring + 4));
+ desc[2] = vld1q_u64((uint64_t *)(rused + 4));
+ vst1q_u64((uint64_t *)&rx_pkts[4], mbp[2]);
+
+ mbp[3] = vld1q_u64((uint64_t *)(sw_ring + 6));
+ desc[3] = vld1q_u64((uint64_t *)(rused + 6));
+ vst1q_u64((uint64_t *)&rx_pkts[6], mbp[3]);
+
+ pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[0]), shuf_msk2));
+ pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[0]), shuf_msk1));
+ pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
+ pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
+ vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1,
+ pkt_mb[1]);
+ vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1,
+ pkt_mb[0]);
+
+ pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[1]), shuf_msk2));
+ pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[1]), shuf_msk1));
+ pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
+ pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
+ vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1,
+ pkt_mb[3]);
+ vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1,
+ pkt_mb[2]);
+
+ pkt_mb[5] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[2]), shuf_msk2));
+ pkt_mb[4] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[2]), shuf_msk1));
+ pkt_mb[5] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[5]), len_adjust));
+ pkt_mb[4] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[4]), len_adjust));
+ vst1q_u64((void *)&rx_pkts[5]->rx_descriptor_fields1,
+ pkt_mb[5]);
+ vst1q_u64((void *)&rx_pkts[4]->rx_descriptor_fields1,
+ pkt_mb[4]);
+
+ pkt_mb[7] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[3]), shuf_msk2));
+ pkt_mb[6] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[3]), shuf_msk1));
+ pkt_mb[7] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[7]), len_adjust));
+ pkt_mb[6] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[6]), len_adjust));
+ vst1q_u64((void *)&rx_pkts[7]->rx_descriptor_fields1,
+ pkt_mb[7]);
+ vst1q_u64((void *)&rx_pkts[6]->rx_descriptor_fields1,
+ pkt_mb[6]);
+
+ if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) {
+ if (sw_ring + nb_used <= sw_ring_end)
+ nb_pkts_received += nb_used;
+ else
+ nb_pkts_received += sw_ring_end - sw_ring;
+ break;
+ } else {
+ if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >=
+ sw_ring_end)) {
+ nb_pkts_received += sw_ring_end - sw_ring;
+ break;
+ } else {
+ nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP;
+
+ rx_pkts += RTE_VIRTIO_DESC_PER_LOOP;
+ sw_ring += RTE_VIRTIO_DESC_PER_LOOP;
+ rused += RTE_VIRTIO_DESC_PER_LOOP;
+ nb_used -= RTE_VIRTIO_DESC_PER_LOOP;
+ }
+ }
+ }
+
+ vq->vq_used_cons_idx += nb_pkts_received;
+ vq->vq_free_cnt += nb_pkts_received;
+ rxvq->stats.packets += nb_pkts_received;
+ return nb_pkts_received;
+}
--
2.5.5
Yuanhan Liu
2016-07-04 07:53:22 UTC
Permalink
Post by Jerin Jacob
Added neon based Rx vector implementation.
Selection of the new handler based neon availability at runtime.
Updated the release notes and MAINTAINERS file.
---
MAINTAINERS | 1 +
doc/guides/rel_notes/release_16_07.rst | 2 +
drivers/net/virtio/Makefile | 2 +
drivers/net/virtio/virtio_rxtx.c | 3 +
drivers/net/virtio/virtio_rxtx_simple_neon.c | 235 +++++++++++++++++++++++++++
5 files changed, 243 insertions(+)
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.c
diff --git a/MAINTAINERS b/MAINTAINERS
index a59191e..ab04cee 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -143,6 +143,7 @@ F: lib/librte_acl/acl_run_neon.*
F: lib/librte_lpm/rte_lpm_neon.h
F: lib/librte_hash/rte*_arm64.h
F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
+F: drivers/net/virtio/virtio_rxtx_simple_neon.c
EZchip TILE-Gx
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index 9e2a817..3a5add5 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
This series basically looks good to me, but I don't think we can make it
for v16.07: you missed v1 deadline; it's also too late: rc1 was already out.

--yliu
Post by Jerin Jacob
@@ -174,6 +174,8 @@ New Features
section of the "Network Interface Controller Drivers" document.
+* **Virtio NEON support for ARM.**
+
Resolved Issues
---------------
Jerin Jacob
2016-07-04 08:55:55 UTC
Permalink
Post by Yuanhan Liu
Post by Jerin Jacob
Added neon based Rx vector implementation.
Selection of the new handler based neon availability at runtime.
Updated the release notes and MAINTAINERS file.
---
MAINTAINERS | 1 +
doc/guides/rel_notes/release_16_07.rst | 2 +
drivers/net/virtio/Makefile | 2 +
drivers/net/virtio/virtio_rxtx.c | 3 +
drivers/net/virtio/virtio_rxtx_simple_neon.c | 235 +++++++++++++++++++++++++++
5 files changed, 243 insertions(+)
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.c
diff --git a/MAINTAINERS b/MAINTAINERS
index a59191e..ab04cee 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -143,6 +143,7 @@ F: lib/librte_acl/acl_run_neon.*
F: lib/librte_lpm/rte_lpm_neon.h
F: lib/librte_hash/rte*_arm64.h
F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
+F: drivers/net/virtio/virtio_rxtx_simple_neon.c
EZchip TILE-Gx
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index 9e2a817..3a5add5 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
This series basically looks good to me, but I don't think we can make it
for v16.07: you missed v1 deadline; it's also too late: rc1 was already out.
OK. But I thought, Thomas hasn't pulled the changes from dpdk-next-virtio.

Even if didn't make it for v16.07, I would suggest you to consider taking
the changes to dpdk-next-virtio as this change involves file restructuring
(Will have issue with re-basing in future) without having any functional impact.
Post by Yuanhan Liu
--yliu
Post by Jerin Jacob
@@ -174,6 +174,8 @@ New Features
section of the "Network Interface Controller Drivers" document.
+* **Virtio NEON support for ARM.**
+
Resolved Issues
---------------
Yuanhan Liu
2016-07-04 09:02:27 UTC
Permalink
Post by Jerin Jacob
Post by Yuanhan Liu
Post by Jerin Jacob
Added neon based Rx vector implementation.
Selection of the new handler based neon availability at runtime.
Updated the release notes and MAINTAINERS file.
---
MAINTAINERS | 1 +
doc/guides/rel_notes/release_16_07.rst | 2 +
drivers/net/virtio/Makefile | 2 +
drivers/net/virtio/virtio_rxtx.c | 3 +
drivers/net/virtio/virtio_rxtx_simple_neon.c | 235 +++++++++++++++++++++++++++
5 files changed, 243 insertions(+)
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.c
diff --git a/MAINTAINERS b/MAINTAINERS
index a59191e..ab04cee 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -143,6 +143,7 @@ F: lib/librte_acl/acl_run_neon.*
F: lib/librte_lpm/rte_lpm_neon.h
F: lib/librte_hash/rte*_arm64.h
F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
+F: drivers/net/virtio/virtio_rxtx_simple_neon.c
EZchip TILE-Gx
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index 9e2a817..3a5add5 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
This series basically looks good to me, but I don't think we can make it
for v16.07: you missed v1 deadline; it's also too late: rc1 was already out.
OK. But I thought, Thomas hasn't pulled the changes from dpdk-next-virtio.
Even if didn't make it for v16.07, I would suggest you to consider taking
the changes to dpdk-next-virtio as this change involves file restructuring
(Will have issue with re-basing in future) without having any functional impact.
Yes, that's my plan. I will do the merge ASAP, when

- v16.07 is out.

- your patches are ready.


--yliu
Jerin Jacob
2016-07-05 12:49:22 UTC
Permalink
This patch-set includes,

1) General cleanup of compile time dependency.
2) made vector handler section based on run-time cpuflags
2) Added NEON support for optimized Rx handling

This patch-set is based on dpdk-next-virtio/master

v3:
Address Yuanhan's review comments
http://dpdk.org/dev/patchwork/patch/14495/
http://dpdk.org/dev/patchwork/patch/14496/

v2:
- made vector handler selection based on run-time cpuflags (Suggested by Thomas)
- moved vector implementations to .c file instead of .h file(Suggested by Jianbo)

Jerin Jacob (4):
virtio: conditional compilation cleanup
virtio: move SSE based Rx implementation to separate file
virtio: add cpuflag based vector handler selection
virtio: add neon support

MAINTAINERS | 1 +
doc/guides/rel_notes/release_16_07.rst | 2 +
drivers/net/virtio/Makefile | 7 +-
drivers/net/virtio/virtio_pci.h | 1 +
drivers/net/virtio/virtio_rxtx.c | 63 ++++---
drivers/net/virtio/virtio_rxtx.h | 3 +-
drivers/net/virtio/virtio_rxtx_simple.c | 269 ++-------------------------
drivers/net/virtio/virtio_rxtx_simple.h | 133 +++++++++++++
drivers/net/virtio/virtio_rxtx_simple_neon.c | 235 +++++++++++++++++++++++
drivers/net/virtio/virtio_rxtx_simple_sse.c | 222 ++++++++++++++++++++++
drivers/net/virtio/virtio_user_ethdev.c | 1 +
11 files changed, 646 insertions(+), 291 deletions(-)
create mode 100644 drivers/net/virtio/virtio_rxtx_simple.h
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.c
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.c
--
2.5.5
Jerin Jacob
2016-07-05 12:49:23 UTC
Permalink
Removed unnecessary compile time dependency on "use_simple_rxtx".

Signed-off-by: Jerin Jacob <***@caviumnetworks.com>
---
drivers/net/virtio/Makefile | 3 ---
drivers/net/virtio/virtio_pci.h | 1 +
drivers/net/virtio/virtio_rxtx.c | 24 ++++++++----------------
drivers/net/virtio/virtio_rxtx.h | 3 +--
drivers/net/virtio/virtio_rxtx_simple.c | 8 ++++++--
drivers/net/virtio/virtio_user_ethdev.c | 1 +
6 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
index 3020b68..b9b0d8d 100644
--- a/drivers/net/virtio/Makefile
+++ b/drivers/net/virtio/Makefile
@@ -50,10 +50,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtqueue.c
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_pci.c
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx.c
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_ethdev.c
-
-ifeq ($(findstring RTE_MACHINE_CPUFLAG_SSSE3,$(CFLAGS)),RTE_MACHINE_CPUFLAG_SSSE3)
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple.c
-endif

ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c
diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h
index dd7693f..b8295a7 100644
--- a/drivers/net/virtio/virtio_pci.h
+++ b/drivers/net/virtio/virtio_pci.h
@@ -253,6 +253,7 @@ struct virtio_hw {
uint8_t use_msix;
uint8_t started;
uint8_t modern;
+ uint8_t use_simple_rxtx;
uint8_t mac_addr[ETHER_ADDR_LEN];
uint32_t notify_off_multiplier;
uint8_t *isr;
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index a27208e..e707954 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -67,10 +67,6 @@
#define VIRTIO_SIMPLE_FLAGS ((uint32_t)ETH_TXQ_FLAGS_NOMULTSEGS | \
ETH_TXQ_FLAGS_NOOFFLOADS)

-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-static int use_simple_rxtx;
-#endif
-
static void
vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx)
{
@@ -333,6 +329,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
*/
uint16_t i;
uint16_t desc_idx;
+ struct virtio_hw *hw = dev->data->dev_private;

PMD_INIT_FUNC_TRACE();

@@ -353,8 +350,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
nbufs = 0;
error = ENOSPC;

-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- if (use_simple_rxtx) {
+ if (hw->use_simple_rxtx) {
for (desc_idx = 0; desc_idx < vq->vq_nentries;
desc_idx++) {
vq->vq_ring.avail->ring[desc_idx] = desc_idx;
@@ -362,7 +358,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
VRING_DESC_F_WRITE;
}
}
-#endif
+
memset(&rxvq->fake_mbuf, 0, sizeof(rxvq->fake_mbuf));
for (desc_idx = 0; desc_idx < RTE_PMD_VIRTIO_RX_MAX_BURST;
desc_idx++) {
@@ -378,12 +374,11 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
/******************************************
* Enqueue allocated buffers *
*******************************************/
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- if (use_simple_rxtx)
+ if (hw->use_simple_rxtx)
error = virtqueue_enqueue_recv_refill_simple(vq, m);
else
-#endif
error = virtqueue_enqueue_recv_refill(vq, m);
+
if (error) {
rte_pktmbuf_free(m);
break;
@@ -404,8 +399,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
struct virtqueue *vq = txvq->vq;

virtio_dev_vring_start(vq);
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- if (use_simple_rxtx) {
+ if (hw->use_simple_rxtx) {
uint16_t mid_idx = vq->vq_nentries >> 1;

for (desc_idx = 0; desc_idx < mid_idx; desc_idx++) {
@@ -426,7 +420,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
desc_idx++)
vq->vq_ring.avail->ring[desc_idx] = desc_idx;
}
-#endif
+
VIRTQUEUE_DUMP(vq);
}
}
@@ -456,9 +450,7 @@ virtio_dev_rx_queue_setup(struct rte_eth_dev *dev,

dev->data->rx_queues[queue_idx] = rxvq;

-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
virtio_rxq_vec_setup(rxvq);
-#endif

return 0;
}
@@ -517,7 +509,7 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
PMD_INIT_LOG(INFO, "Using simple rx/tx path");
dev->tx_pkt_burst = virtio_xmit_pkts_simple;
dev->rx_pkt_burst = virtio_recv_pkts_vec;
- use_simple_rxtx = 1;
+ hw->use_simple_rxtx = 1;
}
#endif

diff --git a/drivers/net/virtio/virtio_rxtx.h b/drivers/net/virtio/virtio_rxtx.h
index 058b56a..28f82d6 100644
--- a/drivers/net/virtio/virtio_rxtx.h
+++ b/drivers/net/virtio/virtio_rxtx.h
@@ -86,10 +86,9 @@ struct virtnet_ctl {
const struct rte_memzone *mz; /**< mem zone to populate RX ring. */
};

-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
int virtio_rxq_vec_setup(struct virtnet_rx *rxvq);

int virtqueue_enqueue_recv_refill_simple(struct virtqueue *vq,
struct rte_mbuf *m);
-#endif
+
#endif /* _VIRTIO_RXTX_H_ */
diff --git a/drivers/net/virtio/virtio_rxtx_simple.c b/drivers/net/virtio/virtio_rxtx_simple.c
index 242ad90..67430da 100644
--- a/drivers/net/virtio/virtio_rxtx_simple.c
+++ b/drivers/net/virtio/virtio_rxtx_simple.c
@@ -37,8 +37,6 @@
#include <string.h>
#include <errno.h>

-#include <tmmintrin.h>
-
#include <rte_cycles.h>
#include <rte_memory.h>
#include <rte_memzone.h>
@@ -131,6 +129,10 @@ virtio_rxq_rearm_vec(struct virtnet_rx *rxvq)
vq_update_avail_idx(vq);
}

+#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+
+#include <tmmintrin.h>
+
/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
*
* This routine is for non-mergeable RX, one desc for each guest buffer.
@@ -293,6 +295,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
return nb_pkts_received;
}

+#endif
+
#define VIRTIO_TX_FREE_THRESH 32
#define VIRTIO_TX_MAX_FREE_BUF_SZ 32
#define VIRTIO_TX_FREE_NR 32
diff --git a/drivers/net/virtio/virtio_user_ethdev.c b/drivers/net/virtio/virtio_user_ethdev.c
index 3d3c9da..3185a4c 100644
--- a/drivers/net/virtio/virtio_user_ethdev.c
+++ b/drivers/net/virtio/virtio_user_ethdev.c
@@ -303,6 +303,7 @@ virtio_user_eth_dev_alloc(const char *name)
hw->vtpci_ops = &virtio_user_ops;
hw->use_msix = 0;
hw->modern = 0;
+ hw->use_simple_rxtx = 0;
hw->virtio_user_dev = dev;
data->dev_private = hw;
data->numa_node = SOCKET_ID_ANY;
--
2.5.5
Jerin Jacob
2016-07-05 12:49:24 UTC
Permalink
Split out SSE instruction based virtio simple Rx
implementation to a separate file

Signed-off-by: Jerin Jacob <***@caviumnetworks.com>
---
drivers/net/virtio/Makefile | 4 +
drivers/net/virtio/virtio_rxtx_simple.c | 273 ++--------------------------
drivers/net/virtio/virtio_rxtx_simple.h | 133 ++++++++++++++
drivers/net/virtio/virtio_rxtx_simple_sse.c | 222 ++++++++++++++++++++++
4 files changed, 370 insertions(+), 262 deletions(-)
create mode 100644 drivers/net/virtio/virtio_rxtx_simple.h
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.c

diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
index b9b0d8d..c4103b7 100644
--- a/drivers/net/virtio/Makefile
+++ b/drivers/net/virtio/Makefile
@@ -52,6 +52,10 @@ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx.c
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_ethdev.c
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple.c

+ifeq ($(CONFIG_RTE_ARCH_X86),y)
+SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple_sse.c
+endif
+
ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/virtio_user_dev.c
diff --git a/drivers/net/virtio/virtio_rxtx_simple.c b/drivers/net/virtio/virtio_rxtx_simple.c
index 67430da..485ddce 100644
--- a/drivers/net/virtio/virtio_rxtx_simple.c
+++ b/drivers/net/virtio/virtio_rxtx_simple.c
@@ -51,14 +51,7 @@
#include <rte_errno.h>
#include <rte_byteorder.h>

-#include "virtio_logs.h"
-#include "virtio_ethdev.h"
-#include "virtqueue.h"
-#include "virtio_rxtx.h"
-
-#define RTE_VIRTIO_VPMD_RX_BURST 32
-#define RTE_VIRTIO_DESC_PER_LOOP 8
-#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST
+#include "virtio_rxtx_simple.h"

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wcast-qual"
@@ -89,260 +82,6 @@ virtqueue_enqueue_recv_refill_simple(struct virtqueue *vq,
return 0;
}

-static inline void
-virtio_rxq_rearm_vec(struct virtnet_rx *rxvq)
-{
- int i;
- uint16_t desc_idx;
- struct rte_mbuf **sw_ring;
- struct vring_desc *start_dp;
- int ret;
- struct virtqueue *vq = rxvq->vq;
-
- desc_idx = vq->vq_avail_idx & (vq->vq_nentries - 1);
- sw_ring = &vq->sw_ring[desc_idx];
- start_dp = &vq->vq_ring.desc[desc_idx];
-
- ret = rte_mempool_get_bulk(rxvq->mpool, (void **)sw_ring,
- RTE_VIRTIO_VPMD_RX_REARM_THRESH);
- if (unlikely(ret)) {
- rte_eth_devices[rxvq->port_id].data->rx_mbuf_alloc_failed +=
- RTE_VIRTIO_VPMD_RX_REARM_THRESH;
- return;
- }
-
- for (i = 0; i < RTE_VIRTIO_VPMD_RX_REARM_THRESH; i++) {
- uintptr_t p;
-
- p = (uintptr_t)&sw_ring[i]->rearm_data;
- *(uint64_t *)p = rxvq->mbuf_initializer;
-
- start_dp[i].addr =
- MBUF_DATA_DMA_ADDR(sw_ring[i], vq->offset) -
- vq->hw->vtnet_hdr_size;
- start_dp[i].len = sw_ring[i]->buf_len -
- RTE_PKTMBUF_HEADROOM + vq->hw->vtnet_hdr_size;
- }
-
- vq->vq_avail_idx += RTE_VIRTIO_VPMD_RX_REARM_THRESH;
- vq->vq_free_cnt -= RTE_VIRTIO_VPMD_RX_REARM_THRESH;
- vq_update_avail_idx(vq);
-}
-
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-
-#include <tmmintrin.h>
-
-/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
- *
- * This routine is for non-mergeable RX, one desc for each guest buffer.
- * This routine is based on the RX ring layout optimization. Each entry in the
- * avail ring points to the desc with the same index in the desc ring and this
- * will never be changed in the driver.
- *
- * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet
- */
-uint16_t
-virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
- uint16_t nb_pkts)
-{
- struct virtnet_rx *rxvq = rx_queue;
- struct virtqueue *vq = rxvq->vq;
- uint16_t nb_used;
- uint16_t desc_idx;
- struct vring_used_elem *rused;
- struct rte_mbuf **sw_ring;
- struct rte_mbuf **sw_ring_end;
- uint16_t nb_pkts_received;
- __m128i shuf_msk1, shuf_msk2, len_adjust;
-
- shuf_msk1 = _mm_set_epi8(
- 0xFF, 0xFF, 0xFF, 0xFF,
- 0xFF, 0xFF, /* vlan tci */
- 5, 4, /* dat len */
- 0xFF, 0xFF, 5, 4, /* pkt len */
- 0xFF, 0xFF, 0xFF, 0xFF /* packet type */
-
- );
-
- shuf_msk2 = _mm_set_epi8(
- 0xFF, 0xFF, 0xFF, 0xFF,
- 0xFF, 0xFF, /* vlan tci */
- 13, 12, /* dat len */
- 0xFF, 0xFF, 13, 12, /* pkt len */
- 0xFF, 0xFF, 0xFF, 0xFF /* packet type */
- );
-
- /* Subtract the header length.
- * In which case do we need the header length in used->len ?
- */
- len_adjust = _mm_set_epi16(
- 0, 0,
- 0,
- (uint16_t)-vq->hw->vtnet_hdr_size,
- 0, (uint16_t)-vq->hw->vtnet_hdr_size,
- 0, 0);
-
- if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
- return 0;
-
- nb_used = VIRTQUEUE_NUSED(vq);
-
- rte_compiler_barrier();
-
- if (unlikely(nb_used == 0))
- return 0;
-
- nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP);
- nb_used = RTE_MIN(nb_used, nb_pkts);
-
- desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
- rused = &vq->vq_ring.used->ring[desc_idx];
- sw_ring = &vq->sw_ring[desc_idx];
- sw_ring_end = &vq->sw_ring[vq->vq_nentries];
-
- _mm_prefetch((const void *)rused, _MM_HINT_T0);
-
- if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
- virtio_rxq_rearm_vec(rxvq);
- if (unlikely(virtqueue_kick_prepare(vq)))
- virtqueue_notify(vq);
- }
-
- for (nb_pkts_received = 0;
- nb_pkts_received < nb_used;) {
- __m128i desc[RTE_VIRTIO_DESC_PER_LOOP / 2];
- __m128i mbp[RTE_VIRTIO_DESC_PER_LOOP / 2];
- __m128i pkt_mb[RTE_VIRTIO_DESC_PER_LOOP];
-
- mbp[0] = _mm_loadu_si128((__m128i *)(sw_ring + 0));
- desc[0] = _mm_loadu_si128((__m128i *)(rused + 0));
- _mm_storeu_si128((__m128i *)&rx_pkts[0], mbp[0]);
-
- mbp[1] = _mm_loadu_si128((__m128i *)(sw_ring + 2));
- desc[1] = _mm_loadu_si128((__m128i *)(rused + 2));
- _mm_storeu_si128((__m128i *)&rx_pkts[2], mbp[1]);
-
- mbp[2] = _mm_loadu_si128((__m128i *)(sw_ring + 4));
- desc[2] = _mm_loadu_si128((__m128i *)(rused + 4));
- _mm_storeu_si128((__m128i *)&rx_pkts[4], mbp[2]);
-
- mbp[3] = _mm_loadu_si128((__m128i *)(sw_ring + 6));
- desc[3] = _mm_loadu_si128((__m128i *)(rused + 6));
- _mm_storeu_si128((__m128i *)&rx_pkts[6], mbp[3]);
-
- pkt_mb[1] = _mm_shuffle_epi8(desc[0], shuf_msk2);
- pkt_mb[0] = _mm_shuffle_epi8(desc[0], shuf_msk1);
- pkt_mb[1] = _mm_add_epi16(pkt_mb[1], len_adjust);
- pkt_mb[0] = _mm_add_epi16(pkt_mb[0], len_adjust);
- _mm_storeu_si128((void *)&rx_pkts[1]->rx_descriptor_fields1,
- pkt_mb[1]);
- _mm_storeu_si128((void *)&rx_pkts[0]->rx_descriptor_fields1,
- pkt_mb[0]);
-
- pkt_mb[3] = _mm_shuffle_epi8(desc[1], shuf_msk2);
- pkt_mb[2] = _mm_shuffle_epi8(desc[1], shuf_msk1);
- pkt_mb[3] = _mm_add_epi16(pkt_mb[3], len_adjust);
- pkt_mb[2] = _mm_add_epi16(pkt_mb[2], len_adjust);
- _mm_storeu_si128((void *)&rx_pkts[3]->rx_descriptor_fields1,
- pkt_mb[3]);
- _mm_storeu_si128((void *)&rx_pkts[2]->rx_descriptor_fields1,
- pkt_mb[2]);
-
- pkt_mb[5] = _mm_shuffle_epi8(desc[2], shuf_msk2);
- pkt_mb[4] = _mm_shuffle_epi8(desc[2], shuf_msk1);
- pkt_mb[5] = _mm_add_epi16(pkt_mb[5], len_adjust);
- pkt_mb[4] = _mm_add_epi16(pkt_mb[4], len_adjust);
- _mm_storeu_si128((void *)&rx_pkts[5]->rx_descriptor_fields1,
- pkt_mb[5]);
- _mm_storeu_si128((void *)&rx_pkts[4]->rx_descriptor_fields1,
- pkt_mb[4]);
-
- pkt_mb[7] = _mm_shuffle_epi8(desc[3], shuf_msk2);
- pkt_mb[6] = _mm_shuffle_epi8(desc[3], shuf_msk1);
- pkt_mb[7] = _mm_add_epi16(pkt_mb[7], len_adjust);
- pkt_mb[6] = _mm_add_epi16(pkt_mb[6], len_adjust);
- _mm_storeu_si128((void *)&rx_pkts[7]->rx_descriptor_fields1,
- pkt_mb[7]);
- _mm_storeu_si128((void *)&rx_pkts[6]->rx_descriptor_fields1,
- pkt_mb[6]);
-
- if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) {
- if (sw_ring + nb_used <= sw_ring_end)
- nb_pkts_received += nb_used;
- else
- nb_pkts_received += sw_ring_end - sw_ring;
- break;
- } else {
- if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >=
- sw_ring_end)) {
- nb_pkts_received += sw_ring_end - sw_ring;
- break;
- } else {
- nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP;
-
- rx_pkts += RTE_VIRTIO_DESC_PER_LOOP;
- sw_ring += RTE_VIRTIO_DESC_PER_LOOP;
- rused += RTE_VIRTIO_DESC_PER_LOOP;
- nb_used -= RTE_VIRTIO_DESC_PER_LOOP;
- }
- }
- }
-
- vq->vq_used_cons_idx += nb_pkts_received;
- vq->vq_free_cnt += nb_pkts_received;
- rxvq->stats.packets += nb_pkts_received;
- return nb_pkts_received;
-}
-
-#endif
-
-#define VIRTIO_TX_FREE_THRESH 32
-#define VIRTIO_TX_MAX_FREE_BUF_SZ 32
-#define VIRTIO_TX_FREE_NR 32
-/* TODO: vq->tx_free_cnt could mean num of free slots so we could avoid shift */
-static inline void
-virtio_xmit_cleanup(struct virtqueue *vq)
-{
- uint16_t i, desc_idx;
- int nb_free = 0;
- struct rte_mbuf *m, *free[VIRTIO_TX_MAX_FREE_BUF_SZ];
-
- desc_idx = (uint16_t)(vq->vq_used_cons_idx &
- ((vq->vq_nentries >> 1) - 1));
- m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
- m = __rte_pktmbuf_prefree_seg(m);
- if (likely(m != NULL)) {
- free[0] = m;
- nb_free = 1;
- for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
- m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
- m = __rte_pktmbuf_prefree_seg(m);
- if (likely(m != NULL)) {
- if (likely(m->pool == free[0]->pool))
- free[nb_free++] = m;
- else {
- rte_mempool_put_bulk(free[0]->pool,
- (void **)free, nb_free);
- free[0] = m;
- nb_free = 1;
- }
- }
- }
- rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
- } else {
- for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
- m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
- m = __rte_pktmbuf_prefree_seg(m);
- if (m != NULL)
- rte_mempool_put(m->pool, m);
- }
- }
-
- vq->vq_used_cons_idx += VIRTIO_TX_FREE_NR;
- vq->vq_free_cnt += (VIRTIO_TX_FREE_NR << 1);
-}
-
uint16_t
virtio_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts)
@@ -423,3 +162,13 @@ virtio_rxq_vec_setup(struct virtnet_rx *rxq)

return 0;
}
+
+/* Stub for linkage when arch specific implementation is not available */
+uint16_t __attribute__((weak))
+virtio_recv_pkts_vec(void *rx_queue __rte_unused,
+ struct rte_mbuf **rx_pkts __rte_unused,
+ uint16_t nb_pkts __rte_unused)
+{
+ rte_panic("Wrong weak function linked by linker\n");
+ return 0;
+}
diff --git a/drivers/net/virtio/virtio_rxtx_simple.h b/drivers/net/virtio/virtio_rxtx_simple.h
new file mode 100644
index 0000000..8cb43c0
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_simple.h
@@ -0,0 +1,133 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VIRTIO_RXTX_SIMPLE_H_
+#define _VIRTIO_RXTX_SIMPLE_H_
+
+#include <stdint.h>
+
+#include "virtio_logs.h"
+#include "virtio_ethdev.h"
+#include "virtqueue.h"
+#include "virtio_rxtx.h"
+
+#define RTE_VIRTIO_VPMD_RX_BURST 32
+#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST
+
+static inline void
+virtio_rxq_rearm_vec(struct virtnet_rx *rxvq)
+{
+ int i;
+ uint16_t desc_idx;
+ struct rte_mbuf **sw_ring;
+ struct vring_desc *start_dp;
+ int ret;
+ struct virtqueue *vq = rxvq->vq;
+
+ desc_idx = vq->vq_avail_idx & (vq->vq_nentries - 1);
+ sw_ring = &vq->sw_ring[desc_idx];
+ start_dp = &vq->vq_ring.desc[desc_idx];
+
+ ret = rte_mempool_get_bulk(rxvq->mpool, (void **)sw_ring,
+ RTE_VIRTIO_VPMD_RX_REARM_THRESH);
+ if (unlikely(ret)) {
+ rte_eth_devices[rxvq->port_id].data->rx_mbuf_alloc_failed +=
+ RTE_VIRTIO_VPMD_RX_REARM_THRESH;
+ return;
+ }
+
+ for (i = 0; i < RTE_VIRTIO_VPMD_RX_REARM_THRESH; i++) {
+ uintptr_t p;
+
+ p = (uintptr_t)&sw_ring[i]->rearm_data;
+ *(uint64_t *)p = rxvq->mbuf_initializer;
+
+ start_dp[i].addr =
+ MBUF_DATA_DMA_ADDR(sw_ring[i], vq->offset) -
+ vq->hw->vtnet_hdr_size;
+ start_dp[i].len = sw_ring[i]->buf_len -
+ RTE_PKTMBUF_HEADROOM + vq->hw->vtnet_hdr_size;
+ }
+
+ vq->vq_avail_idx += RTE_VIRTIO_VPMD_RX_REARM_THRESH;
+ vq->vq_free_cnt -= RTE_VIRTIO_VPMD_RX_REARM_THRESH;
+ vq_update_avail_idx(vq);
+}
+
+#define VIRTIO_TX_FREE_THRESH 32
+#define VIRTIO_TX_MAX_FREE_BUF_SZ 32
+#define VIRTIO_TX_FREE_NR 32
+/* TODO: vq->tx_free_cnt could mean num of free slots so we could avoid shift */
+static inline void
+virtio_xmit_cleanup(struct virtqueue *vq)
+{
+ uint16_t i, desc_idx;
+ int nb_free = 0;
+ struct rte_mbuf *m, *free[VIRTIO_TX_MAX_FREE_BUF_SZ];
+
+ desc_idx = (uint16_t)(vq->vq_used_cons_idx &
+ ((vq->vq_nentries >> 1) - 1));
+ m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
+ m = __rte_pktmbuf_prefree_seg(m);
+ if (likely(m != NULL)) {
+ free[0] = m;
+ nb_free = 1;
+ for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
+ m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
+ m = __rte_pktmbuf_prefree_seg(m);
+ if (likely(m != NULL)) {
+ if (likely(m->pool == free[0]->pool))
+ free[nb_free++] = m;
+ else {
+ rte_mempool_put_bulk(free[0]->pool,
+ (void **)free, nb_free);
+ free[0] = m;
+ nb_free = 1;
+ }
+ }
+ }
+ rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
+ } else {
+ for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
+ m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
+ m = __rte_pktmbuf_prefree_seg(m);
+ if (m != NULL)
+ rte_mempool_put(m->pool, m);
+ }
+ }
+
+ vq->vq_used_cons_idx += VIRTIO_TX_FREE_NR;
+ vq->vq_free_cnt += (VIRTIO_TX_FREE_NR << 1);
+}
+
+#endif /* _VIRTIO_RXTX_SIMPLE_H_ */
diff --git a/drivers/net/virtio/virtio_rxtx_simple_sse.c b/drivers/net/virtio/virtio_rxtx_simple_sse.c
new file mode 100644
index 0000000..39000e8
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_simple_sse.c
@@ -0,0 +1,222 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <tmmintrin.h>
+
+#include <rte_byteorder.h>
+#include <rte_branch_prediction.h>
+#include <rte_cycles.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+#include <rte_prefetch.h>
+#include <rte_string_fns.h>
+
+#include "virtio_rxtx_simple.h"
+
+#define RTE_VIRTIO_VPMD_RX_BURST 32
+#define RTE_VIRTIO_DESC_PER_LOOP 8
+#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST
+
+/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
+ *
+ * This routine is for non-mergeable RX, one desc for each guest buffer.
+ * This routine is based on the RX ring layout optimization. Each entry in the
+ * avail ring points to the desc with the same index in the desc ring and this
+ * will never be changed in the driver.
+ *
+ * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet
+ */
+uint16_t
+virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ struct virtnet_rx *rxvq = rx_queue;
+ struct virtqueue *vq = rxvq->vq;
+ uint16_t nb_used;
+ uint16_t desc_idx;
+ struct vring_used_elem *rused;
+ struct rte_mbuf **sw_ring;
+ struct rte_mbuf **sw_ring_end;
+ uint16_t nb_pkts_received;
+ __m128i shuf_msk1, shuf_msk2, len_adjust;
+
+ shuf_msk1 = _mm_set_epi8(
+ 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, /* vlan tci */
+ 5, 4, /* dat len */
+ 0xFF, 0xFF, 5, 4, /* pkt len */
+ 0xFF, 0xFF, 0xFF, 0xFF /* packet type */
+
+ );
+
+ shuf_msk2 = _mm_set_epi8(
+ 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, /* vlan tci */
+ 13, 12, /* dat len */
+ 0xFF, 0xFF, 13, 12, /* pkt len */
+ 0xFF, 0xFF, 0xFF, 0xFF /* packet type */
+ );
+
+ /* Subtract the header length.
+ * In which case do we need the header length in used->len ?
+ */
+ len_adjust = _mm_set_epi16(
+ 0, 0,
+ 0,
+ (uint16_t)-vq->hw->vtnet_hdr_size,
+ 0, (uint16_t)-vq->hw->vtnet_hdr_size,
+ 0, 0);
+
+ if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
+ return 0;
+
+ nb_used = VIRTQUEUE_NUSED(vq);
+
+ rte_compiler_barrier();
+
+ if (unlikely(nb_used == 0))
+ return 0;
+
+ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP);
+ nb_used = RTE_MIN(nb_used, nb_pkts);
+
+ desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
+ rused = &vq->vq_ring.used->ring[desc_idx];
+ sw_ring = &vq->sw_ring[desc_idx];
+ sw_ring_end = &vq->sw_ring[vq->vq_nentries];
+
+ _mm_prefetch((const void *)rused, _MM_HINT_T0);
+
+ if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
+ virtio_rxq_rearm_vec(rxvq);
+ if (unlikely(virtqueue_kick_prepare(vq)))
+ virtqueue_notify(vq);
+ }
+
+ for (nb_pkts_received = 0;
+ nb_pkts_received < nb_used;) {
+ __m128i desc[RTE_VIRTIO_DESC_PER_LOOP / 2];
+ __m128i mbp[RTE_VIRTIO_DESC_PER_LOOP / 2];
+ __m128i pkt_mb[RTE_VIRTIO_DESC_PER_LOOP];
+
+ mbp[0] = _mm_loadu_si128((__m128i *)(sw_ring + 0));
+ desc[0] = _mm_loadu_si128((__m128i *)(rused + 0));
+ _mm_storeu_si128((__m128i *)&rx_pkts[0], mbp[0]);
+
+ mbp[1] = _mm_loadu_si128((__m128i *)(sw_ring + 2));
+ desc[1] = _mm_loadu_si128((__m128i *)(rused + 2));
+ _mm_storeu_si128((__m128i *)&rx_pkts[2], mbp[1]);
+
+ mbp[2] = _mm_loadu_si128((__m128i *)(sw_ring + 4));
+ desc[2] = _mm_loadu_si128((__m128i *)(rused + 4));
+ _mm_storeu_si128((__m128i *)&rx_pkts[4], mbp[2]);
+
+ mbp[3] = _mm_loadu_si128((__m128i *)(sw_ring + 6));
+ desc[3] = _mm_loadu_si128((__m128i *)(rused + 6));
+ _mm_storeu_si128((__m128i *)&rx_pkts[6], mbp[3]);
+
+ pkt_mb[1] = _mm_shuffle_epi8(desc[0], shuf_msk2);
+ pkt_mb[0] = _mm_shuffle_epi8(desc[0], shuf_msk1);
+ pkt_mb[1] = _mm_add_epi16(pkt_mb[1], len_adjust);
+ pkt_mb[0] = _mm_add_epi16(pkt_mb[0], len_adjust);
+ _mm_storeu_si128((void *)&rx_pkts[1]->rx_descriptor_fields1,
+ pkt_mb[1]);
+ _mm_storeu_si128((void *)&rx_pkts[0]->rx_descriptor_fields1,
+ pkt_mb[0]);
+
+ pkt_mb[3] = _mm_shuffle_epi8(desc[1], shuf_msk2);
+ pkt_mb[2] = _mm_shuffle_epi8(desc[1], shuf_msk1);
+ pkt_mb[3] = _mm_add_epi16(pkt_mb[3], len_adjust);
+ pkt_mb[2] = _mm_add_epi16(pkt_mb[2], len_adjust);
+ _mm_storeu_si128((void *)&rx_pkts[3]->rx_descriptor_fields1,
+ pkt_mb[3]);
+ _mm_storeu_si128((void *)&rx_pkts[2]->rx_descriptor_fields1,
+ pkt_mb[2]);
+
+ pkt_mb[5] = _mm_shuffle_epi8(desc[2], shuf_msk2);
+ pkt_mb[4] = _mm_shuffle_epi8(desc[2], shuf_msk1);
+ pkt_mb[5] = _mm_add_epi16(pkt_mb[5], len_adjust);
+ pkt_mb[4] = _mm_add_epi16(pkt_mb[4], len_adjust);
+ _mm_storeu_si128((void *)&rx_pkts[5]->rx_descriptor_fields1,
+ pkt_mb[5]);
+ _mm_storeu_si128((void *)&rx_pkts[4]->rx_descriptor_fields1,
+ pkt_mb[4]);
+
+ pkt_mb[7] = _mm_shuffle_epi8(desc[3], shuf_msk2);
+ pkt_mb[6] = _mm_shuffle_epi8(desc[3], shuf_msk1);
+ pkt_mb[7] = _mm_add_epi16(pkt_mb[7], len_adjust);
+ pkt_mb[6] = _mm_add_epi16(pkt_mb[6], len_adjust);
+ _mm_storeu_si128((void *)&rx_pkts[7]->rx_descriptor_fields1,
+ pkt_mb[7]);
+ _mm_storeu_si128((void *)&rx_pkts[6]->rx_descriptor_fields1,
+ pkt_mb[6]);
+
+ if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) {
+ if (sw_ring + nb_used <= sw_ring_end)
+ nb_pkts_received += nb_used;
+ else
+ nb_pkts_received += sw_ring_end - sw_ring;
+ break;
+ } else {
+ if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >=
+ sw_ring_end)) {
+ nb_pkts_received += sw_ring_end - sw_ring;
+ break;
+ } else {
+ nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP;
+
+ rx_pkts += RTE_VIRTIO_DESC_PER_LOOP;
+ sw_ring += RTE_VIRTIO_DESC_PER_LOOP;
+ rused += RTE_VIRTIO_DESC_PER_LOOP;
+ nb_used -= RTE_VIRTIO_DESC_PER_LOOP;
+ }
+ }
+ }
+
+ vq->vq_used_cons_idx += nb_pkts_received;
+ vq->vq_free_cnt += nb_pkts_received;
+ rxvq->stats.packets += nb_pkts_received;
+ return nb_pkts_received;
+}
--
2.5.5
Yuanhan Liu
2016-08-18 06:52:31 UTC
Permalink
Post by Jerin Jacob
Split out SSE instruction based virtio simple Rx
implementation to a separate file
Hi,

I was about to apply this set. I then did some build test and found a
weird issue: it breaks the build with clang (ubuntu 16.04).

drivers/net/virtio/virtio_rxtx_simple_sse.c:130:2: error: cast from 'const void *' to 'void *' drops const qualifier [-Werror,-Wcast-qual]
_mm_prefetch((const void *)rused, _MM_HINT_T0);
^
/usr/lib/llvm-3.8/bin/../lib/clang/3.8.0/include/xmmintrin.h:684:58: note: expanded from macro '_mm_prefetch'
#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
^
1 error generated.

Weird enough I don't see this issue before this commit: the error
line is exactly the same before and after this commit.

Another note is that _mm_prefetch() is actually with different prototype
for gcc and clang. For gcc, we have:

_mm_prefetch (const void *__P, enum _mm_hint __I)

Any thoughts?

--yliu
Jerin Jacob
2016-08-19 03:24:00 UTC
Permalink
Post by Yuanhan Liu
Post by Jerin Jacob
Split out SSE instruction based virtio simple Rx
implementation to a separate file
Hi,
I was about to apply this set. I then did some build test and found a
weird issue: it breaks the build with clang (ubuntu 16.04).
drivers/net/virtio/virtio_rxtx_simple_sse.c:130:2: error: cast from 'const void *' to 'void *' drops const qualifier [-Werror,-Wcast-qual]
_mm_prefetch((const void *)rused, _MM_HINT_T0);
^
/usr/lib/llvm-3.8/bin/../lib/clang/3.8.0/include/xmmintrin.h:684:58: note: expanded from macro '_mm_prefetch'
#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
^
1 error generated.
Weird enough I don't see this issue before this commit: the error
line is exactly the same before and after this commit.
Yes, I looked at the pre processed output as well, it comes as same before and
after this commit.
Post by Yuanhan Liu
Another note is that _mm_prefetch() is actually with different prototype
_mm_prefetch (const void *__P, enum _mm_hint __I)
Any thoughts?
How about replacing "_mm_prefetch((const void *)rused, _MM_HINT_T0)"
with "rte_prefetch0(rused)" to have same prototype and fix the issue
with clang?
Post by Yuanhan Liu
--yliu
Yuanhan Liu
2016-08-23 07:43:05 UTC
Permalink
Post by Jerin Jacob
Post by Yuanhan Liu
Post by Jerin Jacob
Split out SSE instruction based virtio simple Rx
implementation to a separate file
Hi,
I was about to apply this set. I then did some build test and found a
weird issue: it breaks the build with clang (ubuntu 16.04).
drivers/net/virtio/virtio_rxtx_simple_sse.c:130:2: error: cast from 'const void *' to 'void *' drops const qualifier [-Werror,-Wcast-qual]
_mm_prefetch((const void *)rused, _MM_HINT_T0);
^
/usr/lib/llvm-3.8/bin/../lib/clang/3.8.0/include/xmmintrin.h:684:58: note: expanded from macro '_mm_prefetch'
#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
^
1 error generated.
Weird enough I don't see this issue before this commit: the error
line is exactly the same before and after this commit.
Yes, I looked at the pre processed output as well, it comes as same before and
after this commit.
Post by Yuanhan Liu
Another note is that _mm_prefetch() is actually with different prototype
_mm_prefetch (const void *__P, enum _mm_hint __I)
Any thoughts?
How about replacing "_mm_prefetch((const void *)rused, _MM_HINT_T0)"
with "rte_prefetch0(rused)" to have same prototype and fix the issue
with clang?
Yes, it should work. BTW, I have just sent out a patch for that. I will
apply yours when that patch has been applied.

--yliu
Jerin Jacob
2016-07-05 12:49:25 UTC
Permalink
Introduced cpuflag based run-time detection to select the
SSE based simple Rx handler

Signed-off-by: Jerin Jacob <***@caviumnetworks.com>
---
drivers/net/virtio/virtio_rxtx.c | 38 ++++++++++++++++++++++++--------------
1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index e707954..adc3457 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -50,6 +50,7 @@
#include <rte_string_fns.h>
#include <rte_errno.h>
#include <rte_byteorder.h>
+#include <rte_cpuflags.h>

#include "virtio_logs.h"
#include "virtio_ethdev.h"
@@ -470,6 +471,28 @@ virtio_dev_rx_queue_release(void *rxq)
rte_memzone_free(mz);
}

+static void
+virtio_update_rxtx_handler(struct rte_eth_dev *dev,
+ const struct rte_eth_txconf *tx_conf)
+{
+ uint8_t use_simple_rxtx = 0;
+ struct virtio_hw *hw = dev->data->dev_private;
+
+#if defined RTE_ARCH_X86
+ if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE3))
+ use_simple_rxtx = 1;
+#endif
+ /* Use simple rx/tx func if single segment and no offloads */
+ if (use_simple_rxtx &&
+ (tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
+ !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
+ PMD_INIT_LOG(INFO, "Using simple rx/tx path");
+ dev->tx_pkt_burst = virtio_xmit_pkts_simple;
+ dev->rx_pkt_burst = virtio_recv_pkts_vec;
+ hw->use_simple_rxtx = use_simple_rxtx;
+ }
+}
+
/*
* struct rte_eth_dev *dev: Used to update dev
* uint16_t nb_desc: Defaults to values read from config space
@@ -485,10 +508,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
const struct rte_eth_txconf *tx_conf)
{
uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
-
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- struct virtio_hw *hw = dev->data->dev_private;
-#endif
struct virtnet_tx *txvq;
struct virtqueue *vq;
uint16_t tx_free_thresh;
@@ -502,16 +521,7 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
return -EINVAL;
}

-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
- /* Use simple rx/tx func if single segment and no offloads */
- if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
- !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
- PMD_INIT_LOG(INFO, "Using simple rx/tx path");
- dev->tx_pkt_burst = virtio_xmit_pkts_simple;
- dev->rx_pkt_burst = virtio_recv_pkts_vec;
- hw->use_simple_rxtx = 1;
- }
-#endif
+ virtio_update_rxtx_handler(dev, tx_conf);

ret = virtio_dev_queue_setup(dev, VTNET_TQ, queue_idx, vtpci_queue_idx,
nb_desc, socket_id, (void **)&txvq);
--
2.5.5
Jerin Jacob
2016-07-05 12:49:26 UTC
Permalink
Added neon based Rx vector implementation.
Selection of the new handler based neon availability at runtime.
Updated the release notes and MAINTAINERS file.

Signed-off-by: Jerin Jacob <***@caviumnetworks.com>
---
MAINTAINERS | 1 +
doc/guides/rel_notes/release_16_07.rst | 2 +
drivers/net/virtio/Makefile | 2 +
drivers/net/virtio/virtio_rxtx.c | 3 +
drivers/net/virtio/virtio_rxtx_simple_neon.c | 235 +++++++++++++++++++++++++++
5 files changed, 243 insertions(+)
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.c

diff --git a/MAINTAINERS b/MAINTAINERS
index a59191e..ab04cee 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -143,6 +143,7 @@ F: lib/librte_acl/acl_run_neon.*
F: lib/librte_lpm/rte_lpm_neon.h
F: lib/librte_hash/rte*_arm64.h
F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
+F: drivers/net/virtio/virtio_rxtx_simple_neon.c

EZchip TILE-Gx
M: Zhigang Lu <***@ezchip.com>
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index 569f562..57f3d28 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -180,6 +180,8 @@ New Features
section of the "Network Interface Controller Drivers" document.


+* **Virtio NEON support for ARM.**
+
Resolved Issues
---------------

diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
index c4103b7..97972a6 100644
--- a/drivers/net/virtio/Makefile
+++ b/drivers/net/virtio/Makefile
@@ -54,6 +54,8 @@ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple.c

ifeq ($(CONFIG_RTE_ARCH_X86),y)
SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple_sse.c
+else ifneq ($(filter y,$(CONFIG_RTE_ARCH_ARM) $(CONFIG_RTE_ARCH_ARM64)),)
+SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple_neon.c
endif

ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index adc3457..8f6cad8 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -481,6 +481,9 @@ virtio_update_rxtx_handler(struct rte_eth_dev *dev,
#if defined RTE_ARCH_X86
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE3))
use_simple_rxtx = 1;
+#elif defined RTE_ARCH_ARM64 || defined CONFIG_RTE_ARCH_ARM
+ if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+ use_simple_rxtx = 1;
#endif
/* Use simple rx/tx func if single segment and no offloads */
if (use_simple_rxtx &&
diff --git a/drivers/net/virtio/virtio_rxtx_simple_neon.c b/drivers/net/virtio/virtio_rxtx_simple_neon.c
new file mode 100644
index 0000000..793eefb
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_simple_neon.c
@@ -0,0 +1,235 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <rte_byteorder.h>
+#include <rte_branch_prediction.h>
+#include <rte_cycles.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+#include <rte_prefetch.h>
+#include <rte_string_fns.h>
+#include <rte_vect.h>
+
+#include "virtio_rxtx_simple.h"
+
+#define RTE_VIRTIO_VPMD_RX_BURST 32
+#define RTE_VIRTIO_DESC_PER_LOOP 8
+#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST
+
+/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
+ *
+ * This routine is for non-mergeable RX, one desc for each guest buffer.
+ * This routine is based on the RX ring layout optimization. Each entry in the
+ * avail ring points to the desc with the same index in the desc ring and this
+ * will never be changed in the driver.
+ *
+ * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet
+ */
+uint16_t
+virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ struct virtnet_rx *rxvq = rx_queue;
+ struct virtqueue *vq = rxvq->vq;
+ uint16_t nb_used;
+ uint16_t desc_idx;
+ struct vring_used_elem *rused;
+ struct rte_mbuf **sw_ring;
+ struct rte_mbuf **sw_ring_end;
+ uint16_t nb_pkts_received;
+
+ uint8x16_t shuf_msk1 = {
+ 0xFF, 0xFF, 0xFF, 0xFF, /* packet type */
+ 4, 5, 0xFF, 0xFF, /* pkt len */
+ 4, 5, /* dat len */
+ 0xFF, 0xFF, /* vlan tci */
+ 0xFF, 0xFF, 0xFF, 0xFF
+ };
+
+ uint8x16_t shuf_msk2 = {
+ 0xFF, 0xFF, 0xFF, 0xFF, /* packet type */
+ 12, 13, 0xFF, 0xFF, /* pkt len */
+ 12, 13, /* dat len */
+ 0xFF, 0xFF, /* vlan tci */
+ 0xFF, 0xFF, 0xFF, 0xFF
+ };
+
+ /* Subtract the header length.
+ * In which case do we need the header length in used->len ?
+ */
+ uint16x8_t len_adjust = {
+ 0, 0,
+ (uint16_t)vq->hw->vtnet_hdr_size, 0,
+ (uint16_t)vq->hw->vtnet_hdr_size,
+ 0,
+ 0, 0
+ };
+
+ if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
+ return 0;
+
+ nb_used = VIRTQUEUE_NUSED(vq);
+
+ rte_rmb();
+
+ if (unlikely(nb_used == 0))
+ return 0;
+
+ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP);
+ nb_used = RTE_MIN(nb_used, nb_pkts);
+
+ desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
+ rused = &vq->vq_ring.used->ring[desc_idx];
+ sw_ring = &vq->sw_ring[desc_idx];
+ sw_ring_end = &vq->sw_ring[vq->vq_nentries];
+
+ rte_prefetch_non_temporal(rused);
+
+ if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
+ virtio_rxq_rearm_vec(rxvq);
+ if (unlikely(virtqueue_kick_prepare(vq)))
+ virtqueue_notify(vq);
+ }
+
+ for (nb_pkts_received = 0;
+ nb_pkts_received < nb_used;) {
+ uint64x2_t desc[RTE_VIRTIO_DESC_PER_LOOP / 2];
+ uint64x2_t mbp[RTE_VIRTIO_DESC_PER_LOOP / 2];
+ uint64x2_t pkt_mb[RTE_VIRTIO_DESC_PER_LOOP];
+
+ mbp[0] = vld1q_u64((uint64_t *)(sw_ring + 0));
+ desc[0] = vld1q_u64((uint64_t *)(rused + 0));
+ vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0]);
+
+ mbp[1] = vld1q_u64((uint64_t *)(sw_ring + 2));
+ desc[1] = vld1q_u64((uint64_t *)(rused + 2));
+ vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1]);
+
+ mbp[2] = vld1q_u64((uint64_t *)(sw_ring + 4));
+ desc[2] = vld1q_u64((uint64_t *)(rused + 4));
+ vst1q_u64((uint64_t *)&rx_pkts[4], mbp[2]);
+
+ mbp[3] = vld1q_u64((uint64_t *)(sw_ring + 6));
+ desc[3] = vld1q_u64((uint64_t *)(rused + 6));
+ vst1q_u64((uint64_t *)&rx_pkts[6], mbp[3]);
+
+ pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[0]), shuf_msk2));
+ pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[0]), shuf_msk1));
+ pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
+ pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
+ vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1,
+ pkt_mb[1]);
+ vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1,
+ pkt_mb[0]);
+
+ pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[1]), shuf_msk2));
+ pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[1]), shuf_msk1));
+ pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
+ pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
+ vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1,
+ pkt_mb[3]);
+ vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1,
+ pkt_mb[2]);
+
+ pkt_mb[5] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[2]), shuf_msk2));
+ pkt_mb[4] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[2]), shuf_msk1));
+ pkt_mb[5] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[5]), len_adjust));
+ pkt_mb[4] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[4]), len_adjust));
+ vst1q_u64((void *)&rx_pkts[5]->rx_descriptor_fields1,
+ pkt_mb[5]);
+ vst1q_u64((void *)&rx_pkts[4]->rx_descriptor_fields1,
+ pkt_mb[4]);
+
+ pkt_mb[7] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[3]), shuf_msk2));
+ pkt_mb[6] = vreinterpretq_u64_u8(vqtbl1q_u8(
+ vreinterpretq_u8_u64(desc[3]), shuf_msk1));
+ pkt_mb[7] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[7]), len_adjust));
+ pkt_mb[6] = vreinterpretq_u64_u16(vsubq_u16(
+ vreinterpretq_u16_u64(pkt_mb[6]), len_adjust));
+ vst1q_u64((void *)&rx_pkts[7]->rx_descriptor_fields1,
+ pkt_mb[7]);
+ vst1q_u64((void *)&rx_pkts[6]->rx_descriptor_fields1,
+ pkt_mb[6]);
+
+ if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) {
+ if (sw_ring + nb_used <= sw_ring_end)
+ nb_pkts_received += nb_used;
+ else
+ nb_pkts_received += sw_ring_end - sw_ring;
+ break;
+ } else {
+ if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >=
+ sw_ring_end)) {
+ nb_pkts_received += sw_ring_end - sw_ring;
+ break;
+ } else {
+ nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP;
+
+ rx_pkts += RTE_VIRTIO_DESC_PER_LOOP;
+ sw_ring += RTE_VIRTIO_DESC_PER_LOOP;
+ rused += RTE_VIRTIO_DESC_PER_LOOP;
+ nb_used -= RTE_VIRTIO_DESC_PER_LOOP;
+ }
+ }
+ }
+
+ vq->vq_used_cons_idx += nb_pkts_received;
+ vq->vq_free_cnt += nb_pkts_received;
+ rxvq->stats.packets += nb_pkts_received;
+ return nb_pkts_received;
+}
--
2.5.5
Jianbo Liu
2016-07-06 03:11:06 UTC
Permalink
Post by Jerin Jacob
Added neon based Rx vector implementation.
Selection of the new handler based neon availability at runtime.
Updated the release notes and MAINTAINERS file.
---
MAINTAINERS | 1 +
doc/guides/rel_notes/release_16_07.rst | 2 +
drivers/net/virtio/Makefile | 2 +
drivers/net/virtio/virtio_rxtx.c | 3 +
drivers/net/virtio/virtio_rxtx_simple_neon.c | 235 +++++++++++++++++++++++++++
5 files changed, 243 insertions(+)
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.c
Acked-by: Jianbo Liu <***@linaro.org>
Yuanhan Liu
2016-09-28 00:37:32 UTC
Permalink
Post by Jerin Jacob
This patch-set includes,
1) General cleanup of compile time dependency.
2) made vector handler section based on run-time cpuflags
2) Added NEON support for optimized Rx handling
This patch-set is based on dpdk-next-virtio/master
Now the weird build issue is gone, this series is applied to
dpdk-next-virtio.

I addressed some conflict issues; you might want to have a
simple check to make sure I don't mess something up. Or even
better, if you could do a test.

Thanks.

--yliu
Post by Jerin Jacob
Address Yuanhan's review comments
http://dpdk.org/dev/patchwork/patch/14495/
http://dpdk.org/dev/patchwork/patch/14496/
- made vector handler selection based on run-time cpuflags (Suggested by Thomas)
- moved vector implementations to .c file instead of .h file(Suggested by Jianbo)
virtio: conditional compilation cleanup
virtio: move SSE based Rx implementation to separate file
virtio: add cpuflag based vector handler selection
virtio: add neon support
MAINTAINERS | 1 +
doc/guides/rel_notes/release_16_07.rst | 2 +
drivers/net/virtio/Makefile | 7 +-
drivers/net/virtio/virtio_pci.h | 1 +
drivers/net/virtio/virtio_rxtx.c | 63 ++++---
drivers/net/virtio/virtio_rxtx.h | 3 +-
drivers/net/virtio/virtio_rxtx_simple.c | 269 ++-------------------------
drivers/net/virtio/virtio_rxtx_simple.h | 133 +++++++++++++
drivers/net/virtio/virtio_rxtx_simple_neon.c | 235 +++++++++++++++++++++++
drivers/net/virtio/virtio_rxtx_simple_sse.c | 222 ++++++++++++++++++++++
drivers/net/virtio/virtio_user_ethdev.c | 1 +
11 files changed, 646 insertions(+), 291 deletions(-)
create mode 100644 drivers/net/virtio/virtio_rxtx_simple.h
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.c
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.c
--
2.5.5
Jerin Jacob
2016-07-01 11:19:54 UTC
Permalink
This patch-set includes,

1) General cleanup of compile time dependency.
2) made vector handler section based on run-time cpuflags
2) Added NEON support for optimized Rx handling

This patch-set is based on dpdk-next-virtio/master

v2:
- made vector handler selection based on run-time cpuflags (Suggested by Thomas)
- moved vector implementations to .c file instead of .h file(Suggested by Jianbo)

Jerin Jacob (3):
virtio: conditional compilation cleanup
virtio: move SSE based Rx implementation to separate file
virtio: add neon support

MAINTAINERS | 1 +
doc/guides/rel_notes/release_16_07.rst | 2 +
drivers/net/vision/Wakefully | 7 +-
drivers/net/virtio/virtio_pci.h | 1 +
drivers/net/virtio/virtio_rxtx.c | 62 +++---
drivers/net/virtio/virtio_rxtx.h | 3 +-
drivers/net/virtio/virtio_rxtx_simple.c | 269 ++-------------------------
drivers/net/virtio/virtio_rxtx_simple.h | 133 +++++++++++++
drivers/net/virtio/virtio_rxtx_simple_neon.c | 235 +++++++++++++++++++++++
drivers/net/virtio/virtio_rxtx_simple_sse.c | 222 ++++++++++++++++++++++
drivers/net/virtio/virtio_user_ethdev.c | 1 +
11 files changed, 646 insertions(+), 290 deletions(-)
create mode 100644 drivers/net/virtio/virtio_rxtx_simple.h
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.c
create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.c
--
2.5.5
Continue reading on narkive:
Loading...