Discussion:
[dpdk-dev] [PATCH 0/3] improve MAC swap performance.
(too old to reply)
Qi Zhang
2018-11-22 17:26:29 UTC
Permalink
**The patchset is targeted for 19.02**

Improve testpmd macswap performance on x86 by taking advantage
of SSE instructions.
On a Broadwell 1.6GHz server with an i40e 25G NIC,
we observe a 17.7% performance improvement in testpmd's macswap
test.

Qi Zhang (3):
app/testpmd: code refactory for macswap
app/testpmd: improve MAC swap performance for x86
app/testpmd: further improve MAC swap performance for x86

app/test-pmd/l2fwd.h | 40 +++++++++++++++++++
app/test-pmd/l2fwd_common.h | 36 +++++++++++++++++
app/test-pmd/macswap.c | 36 +++--------------
app/test-pmd/macswap.h | 40 +++++++++++++++++++
app/test-pmd/macswap_common.h | 36 +++++++++++++++++
app/test-pmd/macswap_sse.h | 90 +++++++++++++++++++++++++++++++++++++++++++
6 files changed, 248 insertions(+), 30 deletions(-)
create mode 100644 app/test-pmd/l2fwd.h
create mode 100644 app/test-pmd/l2fwd_common.h
create mode 100644 app/test-pmd/macswap.h
create mode 100644 app/test-pmd/macswap_common.h
create mode 100644 app/test-pmd/macswap_sse.h
--
2.13.6
Qi Zhang
2018-11-22 17:26:30 UTC
Permalink
Move the macswap workload to a dedicated function, so we can further enable
platform-specific optimized versions.

Signed-off-by: Qi Zhang <***@intel.com>
---
app/test-pmd/l2fwd.h | 40 ++++++++++++++++++++++++++++++++++++++++
app/test-pmd/l2fwd_common.h | 36 ++++++++++++++++++++++++++++++++++++
app/test-pmd/macswap.c | 32 ++------------------------------
app/test-pmd/macswap.h | 40 ++++++++++++++++++++++++++++++++++++++++
app/test-pmd/macswap_common.h | 36 ++++++++++++++++++++++++++++++++++++
5 files changed, 154 insertions(+), 30 deletions(-)
create mode 100644 app/test-pmd/l2fwd.h
create mode 100644 app/test-pmd/l2fwd_common.h
create mode 100644 app/test-pmd/macswap.h
create mode 100644 app/test-pmd/macswap_common.h

diff --git a/app/test-pmd/l2fwd.h b/app/test-pmd/l2fwd.h
new file mode 100644
index 000000000..6fcad4d75
--- /dev/null
+++ b/app/test-pmd/l2fwd.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _L2FWD_H_
+#define _L2FWD_H_
+
+#include "l2fwd_common.h"
+
+static inline void
+do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
+ struct rte_port *txp)
+{
+ struct ether_hdr *eth_hdr;
+ struct rte_mbuf *mb;
+ struct ether_addr addr;
+ uint64_t ol_flags;
+ int i;
+
+ ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
+
+ for (i = 0; i < nb; i++) {
+ if (likely(i < nb - 1))
+ rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
+ mb = pkts[i];
+
+ eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
+
+ /* Swap dest and src mac addresses. */
+ ether_addr_copy(&eth_hdr->d_addr, &addr);
+ ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
+ ether_addr_copy(&addr, &eth_hdr->s_addr);
+
+ mbuf_field_set(mb, ol_flags, txp->tx_vlan_id,
+ txp->tx_vlan_id_outer);
+ }
+}
+
+#endif /* _L2FWD_H_ */
+
diff --git a/app/test-pmd/l2fwd_common.h b/app/test-pmd/l2fwd_common.h
new file mode 100644
index 000000000..2c01cbc8f
--- /dev/null
+++ b/app/test-pmd/l2fwd_common.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _L2FWD_COMMON_H_
+#define _L2FWD_COMMON_H_
+
+static inline uint64_t
+ol_flags_init(uint64_t tx_offload)
+{
+ uint64_t ol_flags = 0;
+
+ ol_flags |= (tx_offload & DEV_TX_OFFLOAD_VLAN_INSERT) ?
+ PKT_TX_VLAN_PKT : 0;
+ ol_flags |= (tx_offload & DEV_TX_OFFLOAD_QINQ_INSERT) ?
+ PKT_TX_QINQ_PKT : 0;
+ ol_flags |= (tx_offload & DEV_TX_OFFLOAD_MACSEC_INSERT) ?
+ PKT_TX_MACSEC : 0;
+
+ return ol_flags;
+}
+
+static inline void
+mbuf_field_set(struct rte_mbuf *mb, uint64_t ol_flags,
+ uint16_t vlan, uint16_t vlan_outer)
+{
+ mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
+ mb->ol_flags |= ol_flags;
+ mb->l2_len = sizeof(struct ether_hdr);
+ mb->l3_len = sizeof(struct ipv4_hdr);
+ mb->vlan_tci = vlan;
+ mb->vlan_tci_outer = vlan_outer;
+}
+
+#endif /* _L2FWD_COMMON_H_ */
+
diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
index a8384d5b8..849194fe2 100644
--- a/app/test-pmd/macswap.c
+++ b/app/test-pmd/macswap.c
@@ -66,6 +66,7 @@
#include <rte_flow.h>

#include "testpmd.h"
+#include "macswap.h"

/*
* MAC swap forwarding mode: Swap the source and the destination Ethernet
@@ -76,15 +77,9 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
{
struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
struct rte_port *txp;
- struct rte_mbuf *mb;
- struct ether_hdr *eth_hdr;
- struct ether_addr addr;
uint16_t nb_rx;
uint16_t nb_tx;
- uint16_t i;
uint32_t retry;
- uint64_t ol_flags = 0;
- uint64_t tx_offloads;
#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
uint64_t start_tsc;
uint64_t end_tsc;
@@ -108,32 +103,9 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
#endif
fs->rx_packets += nb_rx;
txp = &ports[fs->tx_port];
- tx_offloads = txp->dev_conf.txmode.offloads;
- if (tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT)
- ol_flags = PKT_TX_VLAN_PKT;
- if (tx_offloads & DEV_TX_OFFLOAD_QINQ_INSERT)
- ol_flags |= PKT_TX_QINQ_PKT;
- if (tx_offloads & DEV_TX_OFFLOAD_MACSEC_INSERT)
- ol_flags |= PKT_TX_MACSEC;
- for (i = 0; i < nb_rx; i++) {
- if (likely(i < nb_rx - 1))
- rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i + 1],
- void *));
- mb = pkts_burst[i];
- eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);

- /* Swap dest and src mac addresses. */
- ether_addr_copy(&eth_hdr->d_addr, &addr);
- ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
- ether_addr_copy(&addr, &eth_hdr->s_addr);
+ do_macswap(pkts_burst, nb_rx, txp);

- mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
- mb->ol_flags |= ol_flags;
- mb->l2_len = sizeof(struct ether_hdr);
- mb->l3_len = sizeof(struct ipv4_hdr);
- mb->vlan_tci = txp->tx_vlan_id;
- mb->vlan_tci_outer = txp->tx_vlan_id_outer;
- }
nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
/*
* Retry if necessary
diff --git a/app/test-pmd/macswap.h b/app/test-pmd/macswap.h
new file mode 100644
index 000000000..bc8a95626
--- /dev/null
+++ b/app/test-pmd/macswap.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _L2FWD_H_
+#define _L2FWD_H_
+
+#include "macswap_common.h"
+
+static inline void
+do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
+ struct rte_port *txp)
+{
+ struct ether_hdr *eth_hdr;
+ struct rte_mbuf *mb;
+ struct ether_addr addr;
+ uint64_t ol_flags;
+ int i;
+
+ ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
+
+ for (i = 0; i < nb; i++) {
+ if (likely(i < nb - 1))
+ rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
+ mb = pkts[i];
+
+ eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
+
+ /* Swap dest and src mac addresses. */
+ ether_addr_copy(&eth_hdr->d_addr, &addr);
+ ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
+ ether_addr_copy(&addr, &eth_hdr->s_addr);
+
+ mbuf_field_set(mb, ol_flags, txp->tx_vlan_id,
+ txp->tx_vlan_id_outer);
+ }
+}
+
+#endif /* _L2FWD_H_ */
+
diff --git a/app/test-pmd/macswap_common.h b/app/test-pmd/macswap_common.h
new file mode 100644
index 000000000..2c01cbc8f
--- /dev/null
+++ b/app/test-pmd/macswap_common.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _L2FWD_COMMON_H_
+#define _L2FWD_COMMON_H_
+
+static inline uint64_t
+ol_flags_init(uint64_t tx_offload)
+{
+ uint64_t ol_flags = 0;
+
+ ol_flags |= (tx_offload & DEV_TX_OFFLOAD_VLAN_INSERT) ?
+ PKT_TX_VLAN_PKT : 0;
+ ol_flags |= (tx_offload & DEV_TX_OFFLOAD_QINQ_INSERT) ?
+ PKT_TX_QINQ_PKT : 0;
+ ol_flags |= (tx_offload & DEV_TX_OFFLOAD_MACSEC_INSERT) ?
+ PKT_TX_MACSEC : 0;
+
+ return ol_flags;
+}
+
+static inline void
+mbuf_field_set(struct rte_mbuf *mb, uint64_t ol_flags,
+ uint16_t vlan, uint16_t vlan_outer)
+{
+ mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
+ mb->ol_flags |= ol_flags;
+ mb->l2_len = sizeof(struct ether_hdr);
+ mb->l3_len = sizeof(struct ipv4_hdr);
+ mb->vlan_tci = vlan;
+ mb->vlan_tci_outer = vlan_outer;
+}
+
+#endif /* _L2FWD_COMMON_H_ */
+
--
2.13.6
Qi Zhang
2018-11-22 17:26:31 UTC
Permalink
This patch optimizes the MAC swap operation by taking advantage
of SSE instructions; it only impacts the x86 platform.

Signed-off-by: Qi Zhang <***@intel.com>
---
app/test-pmd/macswap.c | 4 ++++
app/test-pmd/macswap_sse.h | 43 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 47 insertions(+)
create mode 100644 app/test-pmd/macswap_sse.h

diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
index 849194fe2..cbb41b728 100644
--- a/app/test-pmd/macswap.c
+++ b/app/test-pmd/macswap.c
@@ -66,7 +66,11 @@
#include <rte_flow.h>

#include "testpmd.h"
+#ifdef RTE_ARCH_X86
+#include "macswap_sse.h"
+#else
#include "macswap.h"
+#endif

/*
* MAC swap forwarding mode: Swap the source and the destination Ethernet
diff --git a/app/test-pmd/macswap_sse.h b/app/test-pmd/macswap_sse.h
new file mode 100644
index 000000000..d5b0f6a21
--- /dev/null
+++ b/app/test-pmd/macswap_sse.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _L2FWD_SSE_H_
+#define _L2FWD_SSE_H_
+
+#include "macswap_common.h"
+static inline void
+do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
+ struct rte_port *txp)
+{
+ struct ether_hdr *eth_hdr;
+ struct rte_mbuf *mb;
+ uint64_t ol_flags;
+ int i;
+ __m128i addr;
+ __m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
+ 5, 4, 3, 2,
+ 1, 0, 11, 10,
+ 9, 8, 7, 6);
+
+ ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
+
+ for (i = 0; i < nb; i++) {
+ if (likely(i < nb - 1))
+ rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
+ mb = pkts[i];
+
+ eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
+
+ /* Swap dest and src mac addresses. */
+ addr = _mm_loadu_si128((__m128i *)eth_hdr);
+ addr = _mm_shuffle_epi8(addr, shfl_msk);
+ _mm_storeu_si128((__m128i *)eth_hdr, addr);
+
+ mbuf_field_set(mb, ol_flags, txp->tx_vlan_id,
+ txp->tx_vlan_id_outer);
+ }
+}
+
+#endif /* _L2FWD_SSE_H_ */
+
--
2.13.6
Qi Zhang
2018-11-22 17:26:32 UTC
Permalink
Swap MACs for four packets in the same loop iteration to save more
CPU cycles.

Signed-off-by: Qi Zhang <***@intel.com>
---
app/test-pmd/macswap_sse.h | 65 ++++++++++++++++++++++++++++++++++++++--------
1 file changed, 54 insertions(+), 11 deletions(-)

diff --git a/app/test-pmd/macswap_sse.h b/app/test-pmd/macswap_sse.h
index d5b0f6a21..0649539c2 100644
--- a/app/test-pmd/macswap_sse.h
+++ b/app/test-pmd/macswap_sse.h
@@ -10,11 +10,12 @@ static inline void
do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
struct rte_port *txp)
{
- struct ether_hdr *eth_hdr;
- struct rte_mbuf *mb;
+ struct ether_hdr *eth_hdr[4];
+ struct rte_mbuf *mb[4];
uint64_t ol_flags;
int i;
- __m128i addr;
+ int r;
+ __m128i addr0, addr1, addr2, addr3;
__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
5, 4, 3, 2,
1, 0, 11, 10,
@@ -22,19 +23,61 @@ do_macswap(struct rte_mbuf *pkts[], uint16_t nb,

ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);

- for (i = 0; i < nb; i++) {
- if (likely(i < nb - 1))
+ i = 0;
+ r = nb;
+
+ while (r >= 4) {
+ mb[0] = pkts[i++];
+ eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct ether_hdr *);
+ addr0 = _mm_loadu_si128((__m128i *)eth_hdr[0]);
+
+ mb[1] = pkts[i++];
+ eth_hdr[1] = rte_pktmbuf_mtod(mb[1], struct ether_hdr *);
+ addr1 = _mm_loadu_si128((__m128i *)eth_hdr[1]);
+
+
+ mb[2] = pkts[i++];
+ eth_hdr[2] = rte_pktmbuf_mtod(mb[2], struct ether_hdr *);
+ addr2 = _mm_loadu_si128((__m128i *)eth_hdr[2]);
+
+ mb[3] = pkts[i++];
+ eth_hdr[3] = rte_pktmbuf_mtod(mb[3], struct ether_hdr *);
+ addr3 = _mm_loadu_si128((__m128i *)eth_hdr[3]);
+
+ addr0 = _mm_shuffle_epi8(addr0, shfl_msk);
+ addr1 = _mm_shuffle_epi8(addr1, shfl_msk);
+ addr2 = _mm_shuffle_epi8(addr2, shfl_msk);
+ addr3 = _mm_shuffle_epi8(addr3, shfl_msk);
+
+ _mm_storeu_si128((__m128i *)eth_hdr[0], addr0);
+ _mm_storeu_si128((__m128i *)eth_hdr[1], addr1);
+ _mm_storeu_si128((__m128i *)eth_hdr[2], addr2);
+ _mm_storeu_si128((__m128i *)eth_hdr[3], addr3);
+
+ mbuf_field_set(mb[0], ol_flags, txp->tx_vlan_id,
+ txp->tx_vlan_id_outer);
+ mbuf_field_set(mb[1], ol_flags, txp->tx_vlan_id,
+ txp->tx_vlan_id_outer);
+ mbuf_field_set(mb[2], ol_flags, txp->tx_vlan_id,
+ txp->tx_vlan_id_outer);
+ mbuf_field_set(mb[3], ol_flags, txp->tx_vlan_id,
+ txp->tx_vlan_id_outer);
+ r -= 4;
+ }
+
+ for ( ; i < nb; i++) {
+ if (i < nb - 1)
rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
- mb = pkts[i];
+ mb[0] = pkts[i];

- eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
+ eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct ether_hdr *);

/* Swap dest and src mac addresses. */
- addr = _mm_loadu_si128((__m128i *)eth_hdr);
- addr = _mm_shuffle_epi8(addr, shfl_msk);
- _mm_storeu_si128((__m128i *)eth_hdr, addr);
+ addr0 = _mm_loadu_si128((__m128i *)eth_hdr[0]);
+ addr0 = _mm_shuffle_epi8(addr0, shfl_msk);
+ _mm_storeu_si128((__m128i *)eth_hdr[0], addr0);

- mbuf_field_set(mb, ol_flags, txp->tx_vlan_id,
+ mbuf_field_set(mb[0], ol_flags, txp->tx_vlan_id,
txp->tx_vlan_id_outer);
}
}
--
2.13.6
Qi Zhang
2018-11-22 17:38:02 UTC
Permalink
**The patchset is targeted for 19.02**

Improve testpmd macswap performance on x86 by taking advantage of SSE
instructions.
On a Broadwell 1.6GHz server with an i40e 25G NIC,
we observe a 17.7% performance improvement in testpmd's macswap test.

v2:
- remove unnecessary files

Qi Zhang (3):
app/testpmd: code refactory for macswap
app/testpmd: improve MAC swap performance for x86
app/testpmd: further improve MAC swap performance for x86

app/test-pmd/macswap.c | 36 +++---------------
app/test-pmd/macswap.h | 40 ++++++++++++++++++++
app/test-pmd/macswap_common.h | 36 ++++++++++++++++++
app/test-pmd/macswap_sse.h | 86 +++++++++++++++++++++++++++++++++++++++++++
4 files changed, 168 insertions(+), 30 deletions(-)
create mode 100644 app/test-pmd/macswap.h
create mode 100644 app/test-pmd/macswap_common.h
create mode 100644 app/test-pmd/macswap_sse.h
--
2.13.6
Qi Zhang
2018-11-22 17:38:03 UTC
Permalink
Move the macswap workload to a dedicated function, so we can further enable
platform-specific optimized versions.

Signed-off-by: Qi Zhang <***@intel.com>
---
app/test-pmd/macswap.c | 32 ++------------------------------
app/test-pmd/macswap.h | 40 ++++++++++++++++++++++++++++++++++++++++
app/test-pmd/macswap_common.h | 36 ++++++++++++++++++++++++++++++++++++
3 files changed, 78 insertions(+), 30 deletions(-)
create mode 100644 app/test-pmd/macswap.h
create mode 100644 app/test-pmd/macswap_common.h

diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
index a8384d5b8..849194fe2 100644
--- a/app/test-pmd/macswap.c
+++ b/app/test-pmd/macswap.c
@@ -66,6 +66,7 @@
#include <rte_flow.h>

#include "testpmd.h"
+#include "macswap.h"

/*
* MAC swap forwarding mode: Swap the source and the destination Ethernet
@@ -76,15 +77,9 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
{
struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
struct rte_port *txp;
- struct rte_mbuf *mb;
- struct ether_hdr *eth_hdr;
- struct ether_addr addr;
uint16_t nb_rx;
uint16_t nb_tx;
- uint16_t i;
uint32_t retry;
- uint64_t ol_flags = 0;
- uint64_t tx_offloads;
#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
uint64_t start_tsc;
uint64_t end_tsc;
@@ -108,32 +103,9 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
#endif
fs->rx_packets += nb_rx;
txp = &ports[fs->tx_port];
- tx_offloads = txp->dev_conf.txmode.offloads;
- if (tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT)
- ol_flags = PKT_TX_VLAN_PKT;
- if (tx_offloads & DEV_TX_OFFLOAD_QINQ_INSERT)
- ol_flags |= PKT_TX_QINQ_PKT;
- if (tx_offloads & DEV_TX_OFFLOAD_MACSEC_INSERT)
- ol_flags |= PKT_TX_MACSEC;
- for (i = 0; i < nb_rx; i++) {
- if (likely(i < nb_rx - 1))
- rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i + 1],
- void *));
- mb = pkts_burst[i];
- eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);

- /* Swap dest and src mac addresses. */
- ether_addr_copy(&eth_hdr->d_addr, &addr);
- ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
- ether_addr_copy(&addr, &eth_hdr->s_addr);
+ do_macswap(pkts_burst, nb_rx, txp);

- mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
- mb->ol_flags |= ol_flags;
- mb->l2_len = sizeof(struct ether_hdr);
- mb->l3_len = sizeof(struct ipv4_hdr);
- mb->vlan_tci = txp->tx_vlan_id;
- mb->vlan_tci_outer = txp->tx_vlan_id_outer;
- }
nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
/*
* Retry if necessary
diff --git a/app/test-pmd/macswap.h b/app/test-pmd/macswap.h
new file mode 100644
index 000000000..bc8a95626
--- /dev/null
+++ b/app/test-pmd/macswap.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _L2FWD_H_
+#define _L2FWD_H_
+
+#include "macswap_common.h"
+
+static inline void
+do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
+ struct rte_port *txp)
+{
+ struct ether_hdr *eth_hdr;
+ struct rte_mbuf *mb;
+ struct ether_addr addr;
+ uint64_t ol_flags;
+ int i;
+
+ ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
+
+ for (i = 0; i < nb; i++) {
+ if (likely(i < nb - 1))
+ rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
+ mb = pkts[i];
+
+ eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
+
+ /* Swap dest and src mac addresses. */
+ ether_addr_copy(&eth_hdr->d_addr, &addr);
+ ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
+ ether_addr_copy(&addr, &eth_hdr->s_addr);
+
+ mbuf_field_set(mb, ol_flags, txp->tx_vlan_id,
+ txp->tx_vlan_id_outer);
+ }
+}
+
+#endif /* _L2FWD_H_ */
+
diff --git a/app/test-pmd/macswap_common.h b/app/test-pmd/macswap_common.h
new file mode 100644
index 000000000..2c01cbc8f
--- /dev/null
+++ b/app/test-pmd/macswap_common.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _L2FWD_COMMON_H_
+#define _L2FWD_COMMON_H_
+
+static inline uint64_t
+ol_flags_init(uint64_t tx_offload)
+{
+ uint64_t ol_flags = 0;
+
+ ol_flags |= (tx_offload & DEV_TX_OFFLOAD_VLAN_INSERT) ?
+ PKT_TX_VLAN_PKT : 0;
+ ol_flags |= (tx_offload & DEV_TX_OFFLOAD_QINQ_INSERT) ?
+ PKT_TX_QINQ_PKT : 0;
+ ol_flags |= (tx_offload & DEV_TX_OFFLOAD_MACSEC_INSERT) ?
+ PKT_TX_MACSEC : 0;
+
+ return ol_flags;
+}
+
+static inline void
+mbuf_field_set(struct rte_mbuf *mb, uint64_t ol_flags,
+ uint16_t vlan, uint16_t vlan_outer)
+{
+ mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
+ mb->ol_flags |= ol_flags;
+ mb->l2_len = sizeof(struct ether_hdr);
+ mb->l3_len = sizeof(struct ipv4_hdr);
+ mb->vlan_tci = vlan;
+ mb->vlan_tci_outer = vlan_outer;
+}
+
+#endif /* _L2FWD_COMMON_H_ */
+
--
2.13.6
Ferruh Yigit
2018-12-10 17:44:24 UTC
Permalink
Post by Qi Zhang
Move macswap workload to dedicate function, so we can further enable
platform specific optimized version.
<...>
Post by Qi Zhang
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _L2FWD_H_
+#define _L2FWD_H_
Looks like a copy-paste artifact; there are a few more in the patchset.

<...>
Post by Qi Zhang
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _L2FWD_COMMON_H_
+#define _L2FWD_COMMON_H_
+
+static inline uint64_t
+ol_flags_init(uint64_t tx_offload)
+{
+ uint64_t ol_flags = 0;
+
+ ol_flags |= (tx_offload & DEV_TX_OFFLOAD_VLAN_INSERT) ?
+ PKT_TX_VLAN_PKT : 0;
'PKT_TX_VLAN_PKT' is deprecated and replaced with 'PKT_TX_VLAN'. I think it is
better to keep it as it is in this patch, since it mainly copies code from one place to
another, but can you update this in a new patch in this patchset?
Post by Qi Zhang
+ ol_flags |= (tx_offload & DEV_TX_OFFLOAD_QINQ_INSERT) ?
+ PKT_TX_QINQ_PKT : 0;
Same here, 'PKT_TX_QINQ_PKT' replaced with 'PKT_TX_QINQ'.
Post by Qi Zhang
+ ol_flags |= (tx_offload & DEV_TX_OFFLOAD_MACSEC_INSERT) ?
+ PKT_TX_MACSEC : 0;
+
+ return ol_flags;
+}
+
+static inline void
+mbuf_field_set(struct rte_mbuf *mb, uint64_t ol_flags,
+ uint16_t vlan, uint16_t vlan_outer)
+{
+ mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
I guess the above line is to prevent those bits from being overwritten, but with the '|='
assignment below I think they will be preserved already; do we need the above line?
cc'ed Yongseok.
Post by Qi Zhang
+ mb->ol_flags |= ol_flags;
+ mb->l2_len = sizeof(struct ether_hdr);
+ mb->l3_len = sizeof(struct ipv4_hdr);
+ mb->vlan_tci = vlan;
+ mb->vlan_tci_outer = vlan_outer;
Setting 'vlan_tci' or 'vlan_tci_outer' makes sense only if 'PKT_TX_VLAN' and
'PKT_TX_QINQ' are set. Since there is already a check for them above, does it make
sense to do these assignments inside those checks, for better performance?

Qi Zhang
2018-11-22 17:38:04 UTC
Permalink
This patch optimizes the MAC swap operation by taking advantage
of SSE instructions; it only impacts the x86 platform.

Signed-off-by: Qi Zhang <***@intel.com>
---
app/test-pmd/macswap.c | 4 ++++
app/test-pmd/macswap_sse.h | 43 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 47 insertions(+)
create mode 100644 app/test-pmd/macswap_sse.h

diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
index 849194fe2..cbb41b728 100644
--- a/app/test-pmd/macswap.c
+++ b/app/test-pmd/macswap.c
@@ -66,7 +66,11 @@
#include <rte_flow.h>

#include "testpmd.h"
+#ifdef RTE_ARCH_X86
+#include "macswap_sse.h"
+#else
#include "macswap.h"
+#endif

/*
* MAC swap forwarding mode: Swap the source and the destination Ethernet
diff --git a/app/test-pmd/macswap_sse.h b/app/test-pmd/macswap_sse.h
new file mode 100644
index 000000000..d5b0f6a21
--- /dev/null
+++ b/app/test-pmd/macswap_sse.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _L2FWD_SSE_H_
+#define _L2FWD_SSE_H_
+
+#include "macswap_common.h"
+static inline void
+do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
+ struct rte_port *txp)
+{
+ struct ether_hdr *eth_hdr;
+ struct rte_mbuf *mb;
+ uint64_t ol_flags;
+ int i;
+ __m128i addr;
+ __m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
+ 5, 4, 3, 2,
+ 1, 0, 11, 10,
+ 9, 8, 7, 6);
+
+ ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
+
+ for (i = 0; i < nb; i++) {
+ if (likely(i < nb - 1))
+ rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
+ mb = pkts[i];
+
+ eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
+
+ /* Swap dest and src mac addresses. */
+ addr = _mm_loadu_si128((__m128i *)eth_hdr);
+ addr = _mm_shuffle_epi8(addr, shfl_msk);
+ _mm_storeu_si128((__m128i *)eth_hdr, addr);
+
+ mbuf_field_set(mb, ol_flags, txp->tx_vlan_id,
+ txp->tx_vlan_id_outer);
+ }
+}
+
+#endif /* _L2FWD_SSE_H_ */
+
--
2.13.6
Ferruh Yigit
2018-12-10 17:44:36 UTC
Permalink
Post by Qi Zhang
The patch optimizes the mac swap operation by taking advantage
of SSE instructions, it only impacts x86 platform.
<...>
Post by Qi Zhang
+
+#include "macswap_common.h"
An empty line after the include would be good.
Qi Zhang
2018-11-22 17:38:05 UTC
Permalink
Swap MACs for four packets in the same loop iteration to save more
CPU cycles.

Signed-off-by: Qi Zhang <***@intel.com>
---
app/test-pmd/macswap_sse.h | 65 ++++++++++++++++++++++++++++++++++++++--------
1 file changed, 54 insertions(+), 11 deletions(-)

diff --git a/app/test-pmd/macswap_sse.h b/app/test-pmd/macswap_sse.h
index d5b0f6a21..0649539c2 100644
--- a/app/test-pmd/macswap_sse.h
+++ b/app/test-pmd/macswap_sse.h
@@ -10,11 +10,12 @@ static inline void
do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
struct rte_port *txp)
{
- struct ether_hdr *eth_hdr;
- struct rte_mbuf *mb;
+ struct ether_hdr *eth_hdr[4];
+ struct rte_mbuf *mb[4];
uint64_t ol_flags;
int i;
- __m128i addr;
+ int r;
+ __m128i addr0, addr1, addr2, addr3;
__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
5, 4, 3, 2,
1, 0, 11, 10,
@@ -22,19 +23,61 @@ do_macswap(struct rte_mbuf *pkts[], uint16_t nb,

ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);

- for (i = 0; i < nb; i++) {
- if (likely(i < nb - 1))
+ i = 0;
+ r = nb;
+
+ while (r >= 4) {
+ mb[0] = pkts[i++];
+ eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct ether_hdr *);
+ addr0 = _mm_loadu_si128((__m128i *)eth_hdr[0]);
+
+ mb[1] = pkts[i++];
+ eth_hdr[1] = rte_pktmbuf_mtod(mb[1], struct ether_hdr *);
+ addr1 = _mm_loadu_si128((__m128i *)eth_hdr[1]);
+
+
+ mb[2] = pkts[i++];
+ eth_hdr[2] = rte_pktmbuf_mtod(mb[2], struct ether_hdr *);
+ addr2 = _mm_loadu_si128((__m128i *)eth_hdr[2]);
+
+ mb[3] = pkts[i++];
+ eth_hdr[3] = rte_pktmbuf_mtod(mb[3], struct ether_hdr *);
+ addr3 = _mm_loadu_si128((__m128i *)eth_hdr[3]);
+
+ addr0 = _mm_shuffle_epi8(addr0, shfl_msk);
+ addr1 = _mm_shuffle_epi8(addr1, shfl_msk);
+ addr2 = _mm_shuffle_epi8(addr2, shfl_msk);
+ addr3 = _mm_shuffle_epi8(addr3, shfl_msk);
+
+ _mm_storeu_si128((__m128i *)eth_hdr[0], addr0);
+ _mm_storeu_si128((__m128i *)eth_hdr[1], addr1);
+ _mm_storeu_si128((__m128i *)eth_hdr[2], addr2);
+ _mm_storeu_si128((__m128i *)eth_hdr[3], addr3);
+
+ mbuf_field_set(mb[0], ol_flags, txp->tx_vlan_id,
+ txp->tx_vlan_id_outer);
+ mbuf_field_set(mb[1], ol_flags, txp->tx_vlan_id,
+ txp->tx_vlan_id_outer);
+ mbuf_field_set(mb[2], ol_flags, txp->tx_vlan_id,
+ txp->tx_vlan_id_outer);
+ mbuf_field_set(mb[3], ol_flags, txp->tx_vlan_id,
+ txp->tx_vlan_id_outer);
+ r -= 4;
+ }
+
+ for ( ; i < nb; i++) {
+ if (i < nb - 1)
rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
- mb = pkts[i];
+ mb[0] = pkts[i];

- eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
+ eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct ether_hdr *);

/* Swap dest and src mac addresses. */
- addr = _mm_loadu_si128((__m128i *)eth_hdr);
- addr = _mm_shuffle_epi8(addr, shfl_msk);
- _mm_storeu_si128((__m128i *)eth_hdr, addr);
+ addr0 = _mm_loadu_si128((__m128i *)eth_hdr[0]);
+ addr0 = _mm_shuffle_epi8(addr0, shfl_msk);
+ _mm_storeu_si128((__m128i *)eth_hdr[0], addr0);

- mbuf_field_set(mb, ol_flags, txp->tx_vlan_id,
+ mbuf_field_set(mb[0], ol_flags, txp->tx_vlan_id,
txp->tx_vlan_id_outer);
}
}
--
2.13.6
Loading...