Discussion:
[dpdk-dev] [PATCH 0/5] Support TCP/IPv4, VxLAN and GRE GSO in DPDK
Jiayu Hu
2017-08-24 14:15:39 UTC
Generic Segmentation Offload (GSO) is a SW technique to split large
packets into small ones. Akin to TSO, GSO enables applications to
operate on large packets, thus reducing per-packet processing overhead.

To provide applications with greater flexibility, DPDK GSO is implemented
as a standalone library. Applications explicitly use the GSO library
to segment packets. This patch series adds GSO support to DPDK for
specific packet types: TCP/IPv4, VxLAN, and GRE.

The first patch introduces the GSO API framework. The second patch
adds GSO support for TCP/IPv4 packets (containing an optional VLAN
tag). The third patch adds GSO support for VxLAN packets that contain
outer IPv4, and inner TCP/IPv4 headers (plus optional inner and/or
outer VLAN tags). The fourth patch adds GSO support for GRE packets
that contain outer IPv4, and inner TCP/IPv4 headers (with optional
outer VLAN tag). The last patch in the series enables TCP/IPv4, VxLAN,
and GRE GSO in testpmd's checksum forwarding engine.

The performance of TCP/IPv4 GSO on a 10Gbps link is demonstrated using
iperf. The test setup is as follows:

a. Connect two 10Gbps physical ports (P0, P1) back-to-back.
b. Launch testpmd with P0 and a vhost-user port, and use the csum
forwarding engine.
c. Select IP and TCP HW checksum calculation for P0; select TCP HW
checksum calculation for the vhost-user port.
d. Launch a VM with csum and TSO offloading enabled.
e. Run iperf-client on the virtio-net port in the VM to send TCP packets.

With GSO enabled for P0 in testpmd, the observed iperf throughput is
~9Gbps. Experimental data for VxLAN and GRE will be presented later.

Jiayu Hu (3):
lib: add Generic Segmentation Offload API framework
gso/lib: add TCP/IPv4 GSO support
app/testpmd: enable TCP/IPv4, VxLAN and GRE GSO

Mark Kavanagh (2):
lib/gso: add VxLAN GSO support
lib/gso: add GRE GSO support

app/test-pmd/cmdline.c | 121 +++++++++
app/test-pmd/config.c | 25 ++
app/test-pmd/csumonly.c | 68 ++++-
app/test-pmd/testpmd.c | 9 +
app/test-pmd/testpmd.h | 10 +
config/common_base | 5 +
lib/Makefile | 2 +
lib/librte_eal/common/include/rte_log.h | 1 +
lib/librte_gso/Makefile | 52 ++++
lib/librte_gso/gso_common.c | 431 ++++++++++++++++++++++++++++++++
lib/librte_gso/gso_common.h | 180 +++++++++++++
lib/librte_gso/gso_tcp.c | 82 ++++++
lib/librte_gso/gso_tcp.h | 73 ++++++
lib/librte_gso/gso_tunnel.c | 62 +++++
lib/librte_gso/gso_tunnel.h | 46 ++++
lib/librte_gso/rte_gso.c | 100 ++++++++
lib/librte_gso/rte_gso.h | 122 +++++++++
lib/librte_gso/rte_gso_version.map | 7 +
mk/rte.app.mk | 1 +
19 files changed, 1392 insertions(+), 5 deletions(-)
create mode 100644 lib/librte_gso/Makefile
create mode 100644 lib/librte_gso/gso_common.c
create mode 100644 lib/librte_gso/gso_common.h
create mode 100644 lib/librte_gso/gso_tcp.c
create mode 100644 lib/librte_gso/gso_tcp.h
create mode 100644 lib/librte_gso/gso_tunnel.c
create mode 100644 lib/librte_gso/gso_tunnel.h
create mode 100644 lib/librte_gso/rte_gso.c
create mode 100644 lib/librte_gso/rte_gso.h
create mode 100644 lib/librte_gso/rte_gso_version.map
--
2.7.4
Jiayu Hu
2017-08-24 14:15:40 UTC
Generic Segmentation Offload (GSO) is a SW technique to split large
packets into small ones. Akin to TSO, GSO enables applications to
operate on large packets, thus reducing per-packet processing overhead.

To provide applications with greater flexibility, DPDK GSO is implemented
as a standalone library. Applications explicitly use the GSO library
to segment packets. This patch introduces the GSO API framework to DPDK.

The GSO library provides a segmentation API, rte_gso_segment(), for
applications. It splits an input packet into small ones in each
invocation. The GSO library refers to these small packets generated
by rte_gso_segment() as GSO segments. When all GSO segments are freed,
the input packet is freed automatically.
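
For illustration, a minimal usage sketch against this API (not part of
the patch): pool names and sizes are arbitrary, error checks on pool
creation are omitted, and gso_types is left at zero here since the GSO
type flags are introduced by later patches in this series.

#include <rte_lcore.h>
#include <rte_mbuf.h>
#include <rte_ethdev.h>
#include "rte_gso.h"

#define GSO_OUT_MAX 64

/* One-time setup of the GSO context (example values). */
static struct rte_gso_ctx
setup_gso_ctx(void)
{
	struct rte_gso_ctx ctx;

	/* direct buffers store copies of packet headers */
	ctx.direct_pool = rte_pktmbuf_pool_create("gso_direct", 8191,
			256, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
	/* indirect buffers carry no data of their own; they only
	 * reference payload inside the packet to segment */
	ctx.indirect_pool = rte_pktmbuf_pool_create("gso_indirect", 8191,
			256, 0, 0, rte_socket_id());
	ctx.gso_types = 0;	/* type flags come with later patches */
	ctx.gso_size = 1518;	/* max length of one GSO segment */
	return ctx;
}

/* Segment one packet and transmit the result. */
static int
gso_and_send(struct rte_mbuf *pkt, struct rte_gso_ctx ctx,
		uint8_t port, uint16_t queue)
{
	struct rte_mbuf *segs[GSO_OUT_MAX];
	int ret;

	ret = rte_gso_segment(pkt, ctx, segs, GSO_OUT_MAX);
	if (ret < 0)
		return ret;	/* -EINVAL or -ENOMEM */
	if (ret == 1)		/* too small to segment; send as-is */
		return rte_eth_tx_burst(port, queue, &pkt, 1);
	/* freeing all GSO segments also frees pkt (refcnt semantics) */
	return rte_eth_tx_burst(port, queue, segs, (uint16_t)ret);
}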

Signed-off-by: Jiayu Hu <***@intel.com>
Signed-off-by: Mark Kavanagh <***@intel.com>
---
config/common_base | 5 ++
lib/Makefile | 2 +
lib/librte_gso/Makefile | 49 ++++++++++++++++
lib/librte_gso/rte_gso.c | 47 ++++++++++++++++
lib/librte_gso/rte_gso.h | 111 +++++++++++++++++++++++++++++++++++++
lib/librte_gso/rte_gso_version.map | 7 +++
mk/rte.app.mk | 1 +
7 files changed, 222 insertions(+)
create mode 100644 lib/librte_gso/Makefile
create mode 100644 lib/librte_gso/rte_gso.c
create mode 100644 lib/librte_gso/rte_gso.h
create mode 100644 lib/librte_gso/rte_gso_version.map

diff --git a/config/common_base b/config/common_base
index 5e97a08..603e340 100644
--- a/config/common_base
+++ b/config/common_base
@@ -652,6 +652,11 @@ CONFIG_RTE_LIBRTE_IP_FRAG_TBL_STAT=n
CONFIG_RTE_LIBRTE_GRO=y

#
+# Compile GSO library
+#
+CONFIG_RTE_LIBRTE_GSO=y
+
+#
# Compile librte_meter
#
CONFIG_RTE_LIBRTE_METER=y
diff --git a/lib/Makefile b/lib/Makefile
index 86caba1..3d123f4 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -108,6 +108,8 @@ DIRS-$(CONFIG_RTE_LIBRTE_REORDER) += librte_reorder
DEPDIRS-librte_reorder := librte_eal librte_mempool librte_mbuf
DIRS-$(CONFIG_RTE_LIBRTE_PDUMP) += librte_pdump
DEPDIRS-librte_pdump := librte_eal librte_mempool librte_mbuf librte_ether
+DIRS-$(CONFIG_RTE_LIBRTE_GSO) += librte_gso
+DEPDIRS-librte_gso := librte_eal librte_mbuf librte_ether librte_net

ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y)
DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni
diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile
new file mode 100644
index 0000000..aeaacbc
--- /dev/null
+++ b/lib/librte_gso/Makefile
@@ -0,0 +1,49 @@
+# BSD LICENSE
+#
+# Copyright(c) 2017 Intel Corporation. All rights reserved.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = librte_gso.a
+
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3
+
+EXPORT_MAP := rte_gso_version.map
+
+LIBABIVER := 1
+
+#source files
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c
+
+# install this header file
+SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
new file mode 100644
index 0000000..b81afce
--- /dev/null
+++ b/lib/librte_gso/rte_gso.c
@@ -0,0 +1,47 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "rte_gso.h"
+
+int
+rte_gso_segment(struct rte_mbuf *pkt,
+ struct rte_gso_ctx gso_ctx,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out __rte_unused)
+{
+ if (pkt == NULL || pkts_out == NULL || gso_ctx.direct_pool ==
+ NULL || gso_ctx.indirect_pool == NULL)
+ return -EINVAL;
+
+ return 1;
+}
diff --git a/lib/librte_gso/rte_gso.h b/lib/librte_gso/rte_gso.h
new file mode 100644
index 0000000..5a8389a
--- /dev/null
+++ b/lib/librte_gso/rte_gso.h
@@ -0,0 +1,111 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_GSO_H_
+#define _RTE_GSO_H_
+
+/**
+ * @file
+ * Interface to GSO library
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+/**
+ * GSO context structure.
+ */
+struct rte_gso_ctx {
+ struct rte_mempool *direct_pool;
+ /**< MBUF pool for allocating direct buffers, which are used
+ * to store packet headers for GSO segments.
+ */
+ struct rte_mempool *indirect_pool;
+ /**< MBUF pool for allocating indirect buffers, which are used
+ * to locate packet payloads for GSO segments. The indirect
+ * buffer doesn't contain any data, but simply points to an
+ * offset within the packet to segment.
+ */
+ uint64_t gso_types;
+ /**< GSO types to perform */
+ uint16_t gso_size;
+ /**< maximum size of a GSO segment, measured in bytes */
+};
+
+/**
+ * Segmentation function, which supports processing of both single- and
+ * multi- segment packets. rte_gso_segment() assumes the input packet
+ * has correct checksums, and it doesn't process IP fragment packets.
+ * Additionally, it assumes that 'pkts_out' is large enough to hold all GSO
+ * segments.
+ *
+ * We refer to the packets that are segmented from the input packet as 'GSO
+ * segments'. If the input packet is GSOed, its mbuf refcnt reduces by 1.
+ * Therefore, when all GSO segments are freed, the input packet is freed
+ * automatically. If the input packet doesn't match the criteria for GSO
+ * (e.g. 'pkt's length is small and doesn't need segmentation), the packet
+ * is skipped and this function returns 1. If the available memory space
+ * in MBUF pools is insufficient, the packet is skipped and -ENOMEM is returned.
+ *
+ * @param pkt
+ * The packet mbuf to segment.
+ * @param ctx
+ * GSO context object.
+ * @param pkts_out
+ * Pointer array used to store the mbuf addresses of GSO segments.
+ * Applications must ensure pkts_out is large enough to hold all GSO
+ * segments. If the memory space in pkts_out is insufficient, the input
+ * packet is skipped and -EINVAL is returned.
+ * @param nb_pkts_out
+ * The max number of items that pkts_out can keep.
+ *
+ * @return
+ * - The number of GSO segments created on success.
+ * - Return 1 if no GSO is performed.
+ * - Return -ENOMEM if run out of memory in MBUF pools.
+ * - Return -EINVAL for invalid parameters.
+ */
+int rte_gso_segment(struct rte_mbuf *pkt,
+ struct rte_gso_ctx ctx,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_GSO_H_ */
diff --git a/lib/librte_gso/rte_gso_version.map b/lib/librte_gso/rte_gso_version.map
new file mode 100644
index 0000000..e1fd453
--- /dev/null
+++ b/lib/librte_gso/rte_gso_version.map
@@ -0,0 +1,7 @@
+DPDK_17.11 {
+ global:
+
+ rte_gso_segment;
+
+ local: *;
+};
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index c25fdd9..d4c9873 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -66,6 +66,7 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_PDUMP) += -lrte_pdump
_LDLIBS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) += -lrte_distributor
_LDLIBS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += -lrte_ip_frag
_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO) += -lrte_gro
+_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO) += -lrte_gso
_LDLIBS-$(CONFIG_RTE_LIBRTE_METER) += -lrte_meter
_LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED) += -lrte_sched
_LDLIBS-$(CONFIG_RTE_LIBRTE_LPM) += -lrte_lpm
--
2.7.4
Ananyev, Konstantin
2017-08-30 01:38:02 UTC
Hi Jiayu,
Post by Jiayu Hu
Generic Segmentation Offload (GSO) is a SW technique to split large
packets into small ones. Akin to TSO, GSO enables applications to
operate on large packets, thus reducing per-packet processing overhead.
To enable more flexibility to applications, DPDK GSO is implemented
as a standalone library. Applications explicitly use the GSO library
to segment packets. This patch introduces the GSO API framework to DPDK.
The GSO library provides a segmentation API, rte_gso_segment(), for
applications. It splits an input packet into small ones in each
invocation. The GSO library refers to these small packets generated
by rte_gso_segment() as GSO segments. When all GSO segments are freed,
the input packet is freed automatically.
---
config/common_base | 5 ++
lib/Makefile | 2 +
lib/librte_gso/Makefile | 49 ++++++++++++++++
lib/librte_gso/rte_gso.c | 47 ++++++++++++++++
lib/librte_gso/rte_gso.h | 111 +++++++++++++++++++++++++++++++++++++
lib/librte_gso/rte_gso_version.map | 7 +++
mk/rte.app.mk | 1 +
7 files changed, 222 insertions(+)
create mode 100644 lib/librte_gso/Makefile
create mode 100644 lib/librte_gso/rte_gso.c
create mode 100644 lib/librte_gso/rte_gso.h
create mode 100644 lib/librte_gso/rte_gso_version.map
diff --git a/config/common_base b/config/common_base
index 5e97a08..603e340 100644
--- a/config/common_base
+++ b/config/common_base
@@ -652,6 +652,11 @@ CONFIG_RTE_LIBRTE_IP_FRAG_TBL_STAT=n
CONFIG_RTE_LIBRTE_GRO=y
#
+# Compile GSO library
+#
+CONFIG_RTE_LIBRTE_GSO=y
+
+#
# Compile librte_meter
#
CONFIG_RTE_LIBRTE_METER=y
diff --git a/lib/Makefile b/lib/Makefile
index 86caba1..3d123f4 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -108,6 +108,8 @@ DIRS-$(CONFIG_RTE_LIBRTE_REORDER) += librte_reorder
DEPDIRS-librte_reorder := librte_eal librte_mempool librte_mbuf
DIRS-$(CONFIG_RTE_LIBRTE_PDUMP) += librte_pdump
DEPDIRS-librte_pdump := librte_eal librte_mempool librte_mbuf librte_ether
+DIRS-$(CONFIG_RTE_LIBRTE_GSO) += librte_gso
+DEPDIRS-librte_gso := librte_eal librte_mbuf librte_ether librte_net
ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y)
DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni
diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile
new file mode 100644
index 0000000..aeaacbc
--- /dev/null
+++ b/lib/librte_gso/Makefile
@@ -0,0 +1,49 @@
+# BSD LICENSE
+#
+# Copyright(c) 2017 Intel Corporation. All rights reserved.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = librte_gso.a
+
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3
+
+EXPORT_MAP := rte_gso_version.map
+
+LIBABIVER := 1
+
+#source files
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c
+
+# install this header file
+SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
new file mode 100644
index 0000000..b81afce
--- /dev/null
+++ b/lib/librte_gso/rte_gso.c
@@ -0,0 +1,47 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "rte_gso.h"
+
+int
+rte_gso_segment(struct rte_mbuf *pkt,
+ struct rte_gso_ctx gso_ctx,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out __rte_unused)
+{
+ if (pkt == NULL || pkts_out == NULL || gso_ctx.direct_pool ==
+ NULL || gso_ctx.indirect_pool == NULL)
+ return -EINVAL;
+
+ return 1;
+}
diff --git a/lib/librte_gso/rte_gso.h b/lib/librte_gso/rte_gso.h
new file mode 100644
index 0000000..5a8389a
--- /dev/null
+++ b/lib/librte_gso/rte_gso.h
@@ -0,0 +1,111 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_GSO_H_
+#define _RTE_GSO_H_
+
+/**
+ * Interface to GSO library
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+/**
+ * GSO context structure.
+ */
+struct rte_gso_ctx {
+ struct rte_mempool *direct_pool;
+ /**< MBUF pool for allocating direct buffers, which are used
+ * to store packet headers for GSO segments.
+ */
+ struct rte_mempool *indirect_pool;
+ /**< MBUF pool for allocating indirect buffers, which are used
+ * to locate packet payloads for GSO segments. The indirect
+ * buffer doesn't contain any data, but simply points to an
+ * offset within the packet to segment.
+ */
+ uint64_t gso_types;
+ /**< GSO types to perform */
Looking at the way it is used right now - there seems not much value in it...
Why not make it a mask of ptypes for which GSO should be performed?
Let's say, for a gso_ctx that supports only ip4/tcp, it would be:
gso_types = (RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP)
and then in rte_gso_segment() we can perform GSO only on packets of the requested ptype:

if ((pkt->packet_type & gso_ctx->gso_types) == pkt->packet_type) {
/* do segmentation */
} else {
/* skip segmentation for that packet */
}
Post by Jiayu Hu
+ uint16_t gso_size;
+ /**< maximum size of a GSO segment, measured in bytes */
Is that MSS or MTU?
Post by Jiayu Hu
+};
+
+/**
+ * Segmentation function, which supports processing of both single- and
+ * multi- segment packets. rte_gso_segment() assumes the input packet
+ * has correct checksums, and it doesn't process IP fragment packets.
+ * Additionally, it assumes that 'pkts_out' is large enough to hold all GSO
+ * segments.
+ *
+ * We refer to the packets that are segmented from the input packet as 'GSO
+ * segments'. If the input packet is GSOed, its mbuf refcnt reduces by 1.
+ * Therefore, when all GSO segments are freed, the input packet is freed
+ * automatically. If the input packet doesn't match the criteria for GSO
+ * (e.g. 'pkt's length is small and doesn't need segmentation), the packet
+ * is skipped and this function returns 1. If the available memory space
+ * in MBUF pools is insufficient, the packet is skipped and return -ENOMEM.
+ *
+ * The packet mbuf to segment.
+ * GSO context object.
+ * Pointer array used to stores the mbuf addresses of GSO segments.
+ * Applications must ensure pkts_out is large enough to hold all GSO
+ * segments. If the memory space in pkts_out is insufficient, the input
+ * packet is skipped and return -EINVAL.
+ * The max number of items that pkts_out can keep.
+ *
+ * - The number of GSO segments created on success.
+ * - Return 1 if no GSO is performed.
Wouldn't it be better to return the number of elements filled in pkts_out[] on success?
Post by Jiayu Hu
+ * - Return -ENOMEM if run out of memory in MBUF pools.
+ * - Return -EINVAL for invalid parameters.
+ */
+int rte_gso_segment(struct rte_mbuf *pkt,
+ struct rte_gso_ctx ctx,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_GSO_H_ */
diff --git a/lib/librte_gso/rte_gso_version.map b/lib/librte_gso/rte_gso_version.map
new file mode 100644
index 0000000..e1fd453
--- /dev/null
+++ b/lib/librte_gso/rte_gso_version.map
@@ -0,0 +1,7 @@
+DPDK_17.11 {
+
+ rte_gso_segment;
+
+ local: *;
+};
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index c25fdd9..d4c9873 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -66,6 +66,7 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_PDUMP) += -lrte_pdump
_LDLIBS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) += -lrte_distributor
_LDLIBS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += -lrte_ip_frag
_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO) += -lrte_gro
+_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO) += -lrte_gso
_LDLIBS-$(CONFIG_RTE_LIBRTE_METER) += -lrte_meter
_LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED) += -lrte_sched
_LDLIBS-$(CONFIG_RTE_LIBRTE_LPM) += -lrte_lpm
--
2.7.4
Jiayu Hu
2017-08-30 07:57:19 UTC
Hi Konstantin,
Post by Ananyev, Konstantin
Hi Jiayu,
Post by Jiayu Hu
Generic Segmentation Offload (GSO) is a SW technique to split large
packets into small ones. Akin to TSO, GSO enables applications to
operate on large packets, thus reducing per-packet processing overhead.
To enable more flexibility to applications, DPDK GSO is implemented
as a standalone library. Applications explicitly use the GSO library
to segment packets. This patch introduces the GSO API framework to DPDK.
The GSO library provides a segmentation API, rte_gso_segment(), for
applications. It splits an input packet into small ones in each
invocation. The GSO library refers to these small packets generated
by rte_gso_segment() as GSO segments. When all GSO segments are freed,
the input packet is freed automatically.
---
config/common_base | 5 ++
lib/Makefile | 2 +
lib/librte_gso/Makefile | 49 ++++++++++++++++
lib/librte_gso/rte_gso.c | 47 ++++++++++++++++
lib/librte_gso/rte_gso.h | 111 +++++++++++++++++++++++++++++++++++++
lib/librte_gso/rte_gso_version.map | 7 +++
mk/rte.app.mk | 1 +
7 files changed, 222 insertions(+)
create mode 100644 lib/librte_gso/Makefile
create mode 100644 lib/librte_gso/rte_gso.c
create mode 100644 lib/librte_gso/rte_gso.h
create mode 100644 lib/librte_gso/rte_gso_version.map
diff --git a/config/common_base b/config/common_base
index 5e97a08..603e340 100644
--- a/config/common_base
+++ b/config/common_base
@@ -652,6 +652,11 @@ CONFIG_RTE_LIBRTE_IP_FRAG_TBL_STAT=n
CONFIG_RTE_LIBRTE_GRO=y
#
+# Compile GSO library
+#
+CONFIG_RTE_LIBRTE_GSO=y
+
+#
# Compile librte_meter
#
CONFIG_RTE_LIBRTE_METER=y
diff --git a/lib/Makefile b/lib/Makefile
index 86caba1..3d123f4 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -108,6 +108,8 @@ DIRS-$(CONFIG_RTE_LIBRTE_REORDER) += librte_reorder
DEPDIRS-librte_reorder := librte_eal librte_mempool librte_mbuf
DIRS-$(CONFIG_RTE_LIBRTE_PDUMP) += librte_pdump
DEPDIRS-librte_pdump := librte_eal librte_mempool librte_mbuf librte_ether
+DIRS-$(CONFIG_RTE_LIBRTE_GSO) += librte_gso
+DEPDIRS-librte_gso := librte_eal librte_mbuf librte_ether librte_net
ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y)
DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni
diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile
new file mode 100644
index 0000000..aeaacbc
--- /dev/null
+++ b/lib/librte_gso/Makefile
@@ -0,0 +1,49 @@
+# BSD LICENSE
+#
+# Copyright(c) 2017 Intel Corporation. All rights reserved.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = librte_gso.a
+
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3
+
+EXPORT_MAP := rte_gso_version.map
+
+LIBABIVER := 1
+
+#source files
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c
+
+# install this header file
+SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
new file mode 100644
index 0000000..b81afce
--- /dev/null
+++ b/lib/librte_gso/rte_gso.c
@@ -0,0 +1,47 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "rte_gso.h"
+
+int
+rte_gso_segment(struct rte_mbuf *pkt,
+ struct rte_gso_ctx gso_ctx,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out __rte_unused)
+{
+ if (pkt == NULL || pkts_out == NULL || gso_ctx.direct_pool ==
+ NULL || gso_ctx.indirect_pool == NULL)
+ return -EINVAL;
+
+ return 1;
+}
diff --git a/lib/librte_gso/rte_gso.h b/lib/librte_gso/rte_gso.h
new file mode 100644
index 0000000..5a8389a
--- /dev/null
+++ b/lib/librte_gso/rte_gso.h
@@ -0,0 +1,111 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_GSO_H_
+#define _RTE_GSO_H_
+
+/**
+ * Interface to GSO library
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+/**
+ * GSO context structure.
+ */
+struct rte_gso_ctx {
+ struct rte_mempool *direct_pool;
+ /**< MBUF pool for allocating direct buffers, which are used
+ * to store packet headers for GSO segments.
+ */
+ struct rte_mempool *indirect_pool;
+ /**< MBUF pool for allocating indirect buffers, which are used
+ * to locate packet payloads for GSO segments. The indirect
+ * buffer doesn't contain any data, but simply points to an
+ * offset within the packet to segment.
+ */
+ uint64_t gso_types;
+ /**< GSO types to perform */
Looking at the way it is used right now - there seems not much value in it...
Why not make it a mask of ptypes for which GSO should be performed?
gso_types = (RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP)
if ((pkt->packet_type & gso_ctx->gso_types) == pkt->packet_type) {
/* do segmentation */
} else {
/* skip segmentation for that packet */
}
Yes, you are right. It's unnecessary to define GSO type macros. We
can reuse ptype. I will change it in the next version.
Post by Ananyev, Konstantin
Post by Jiayu Hu
+ uint16_t gso_size;
+ /**< maximum size of a GSO segment, measured in bytes */
Is that MSS or MTU?
MSS. It's the max length of a complete packet, including packet headers.
Post by Ananyev, Konstantin
Post by Jiayu Hu
+};
+
+/**
+ * Segmentation function, which supports processing of both single- and
+ * multi- segment packets. rte_gso_segment() assumes the input packet
+ * has correct checksums, and it doesn't process IP fragment packets.
+ * Additionally, it assumes that 'pkts_out' is large enough to hold all GSO
+ * segments.
+ *
+ * We refer to the packets that are segmented from the input packet as 'GSO
+ * segments'. If the input packet is GSOed, its mbuf refcnt reduces by 1.
+ * Therefore, when all GSO segments are freed, the input packet is freed
+ * automatically. If the input packet doesn't match the criteria for GSO
+ * (e.g. 'pkt's length is small and doesn't need segmentation), the packet
+ * is skipped and this function returns 1. If the available memory space
+ * in MBUF pools is insufficient, the packet is skipped and return -ENOMEM.
+ *
+ * The packet mbuf to segment.
+ * GSO context object.
+ * Pointer array used to stores the mbuf addresses of GSO segments.
+ * Applications must ensure pkts_out is large enough to hold all GSO
+ * segments. If the memory space in pkts_out is insufficient, the input
+ * packet is skipped and return -EINVAL.
+ * The max number of items that pkts_out can keep.
+ *
+ * - The number of GSO segments created on success.
+ * - Return 1 if no GSO is performed.
Wouldn't it be better to return the number of elements filled in pkts_out[] on success?
Agree. I will change it.
Post by Ananyev, Konstantin
Post by Jiayu Hu
+ * - Return -ENOMEM if run out of memory in MBUF pools.
+ * - Return -EINVAL for invalid parameters.
+ */
+int rte_gso_segment(struct rte_mbuf *pkt,
+ struct rte_gso_ctx ctx,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_GSO_H_ */
diff --git a/lib/librte_gso/rte_gso_version.map b/lib/librte_gso/rte_gso_version.map
new file mode 100644
index 0000000..e1fd453
--- /dev/null
+++ b/lib/librte_gso/rte_gso_version.map
@@ -0,0 +1,7 @@
+DPDK_17.11 {
+
+ rte_gso_segment;
+
+ local: *;
+};
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index c25fdd9..d4c9873 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -66,6 +66,7 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_PDUMP) += -lrte_pdump
_LDLIBS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) += -lrte_distributor
_LDLIBS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += -lrte_ip_frag
_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO) += -lrte_gro
+_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO) += -lrte_gso
_LDLIBS-$(CONFIG_RTE_LIBRTE_METER) += -lrte_meter
_LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED) += -lrte_sched
_LDLIBS-$(CONFIG_RTE_LIBRTE_LPM) += -lrte_lpm
--
2.7.4
Jiayu Hu
2017-08-24 14:15:41 UTC
This patch adds GSO support for TCP/IPv4 packets. Supported packets
may include a single VLAN tag. TCP/IPv4 GSO assumes that all input
packets have correct checksums, and doesn't update checksums for output
packets (the responsibility for this lies with the application).
Additionally, TCP/IPv4 GSO doesn't process IP fragmented packets.
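
Since checksums are left to the application, each output segment
typically needs its checksums handled before transmit. Below is a
hedged sketch of the HW-offload variant (as the csum engine in the last
patch uses), relying only on the existing rte_ip.h helpers; the helper
name and the plain Ethernet/IPv4/TCP framing assumption are ours, not
part of the patch.

#include <rte_mbuf.h>
#include <rte_ip.h>
#include <rte_tcp.h>

/* Ask HW to fill the IP/TCP checksums of one GSO segment; assumes a
 * plain Ethernet + IPv4 + TCP segment and a port with TX checksum
 * offload enabled. */
static void
request_hw_cksum(struct rte_mbuf *seg)
{
	struct ipv4_hdr *ipv4_hdr = rte_pktmbuf_mtod_offset(seg,
			struct ipv4_hdr *, seg->l2_len);
	struct tcp_hdr *tcp_hdr = (struct tcp_hdr *)
			((char *)ipv4_hdr + seg->l3_len);

	seg->ol_flags |= PKT_TX_IPV4 | PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM;
	ipv4_hdr->hdr_checksum = 0;
	/* HW expects the TCP pseudo-header checksum to be pre-filled */
	tcp_hdr->cksum = rte_ipv4_phdr_cksum(ipv4_hdr, seg->ol_flags);
}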

TCP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one indirect
MBUF, to organize an output packet. Note that we refer to these two
chained MBUFs as a two-segment MBUF. The direct MBUF stores the packet
header, while the indirect mbuf simply points to a location within the
original packet's payload. Consequently, use of the GSO library requires
multi-segment MBUF support in the TX functions of the NIC driver.

If a packet is GSOed, TCP/IPv4 GSO reduces its MBUF refcnt by 1. As a
result, when all of its GSOed segments are freed, the packet is freed
automatically.
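
In essence, each output segment is assembled as in the following
simplified sketch of what gso_common.c below does (an illustrative
helper, not part of the patch; NULL checks and the walk over
multi-segment inputs are omitted):

#include <string.h>
#include <rte_mbuf.h>

/* Build one two-segment output: a direct mbuf holding a copy of the
 * header, chained to an indirect mbuf that references pkt's payload. */
static struct rte_mbuf *
two_seg_output(struct rte_mbuf *pkt, uint16_t hdr_len,
		uint16_t pyld_off, uint16_t pyld_len,
		struct rte_mempool *direct_pool,
		struct rte_mempool *indirect_pool)
{
	struct rte_mbuf *hdr = rte_pktmbuf_alloc(direct_pool);
	struct rte_mbuf *pyld = rte_pktmbuf_alloc(indirect_pool);

	/* direct mbuf: copy the original packet header */
	memcpy(rte_pktmbuf_mtod(hdr, char *),
			rte_pktmbuf_mtod(pkt, char *), hdr_len);
	hdr->data_len = hdr_len;

	/* indirect mbuf: share pkt's buffer; this bumps pkt's refcnt */
	rte_pktmbuf_attach(pyld, pkt);
	pyld->data_off = pyld_off;	/* offset of this payload chunk */
	pyld->data_len = pyld_len;

	/* chain: header + payload form one GSO segment */
	hdr->next = pyld;
	hdr->nb_segs = 2;
	hdr->pkt_len = (uint32_t)hdr_len + pyld_len;
	return hdr;
}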

Signed-off-by: Jiayu Hu <***@intel.com>
Signed-off-by: Mark Kavanagh <***@intel.com>
---
lib/librte_eal/common/include/rte_log.h | 1 +
lib/librte_gso/Makefile | 2 +
lib/librte_gso/gso_common.c | 270 ++++++++++++++++++++++++++++++++
lib/librte_gso/gso_common.h | 120 ++++++++++++++
lib/librte_gso/gso_tcp.c | 82 ++++++++++
lib/librte_gso/gso_tcp.h | 73 +++++++++
lib/librte_gso/rte_gso.c | 44 +++++-
lib/librte_gso/rte_gso.h | 3 +
8 files changed, 593 insertions(+), 2 deletions(-)
create mode 100644 lib/librte_gso/gso_common.c
create mode 100644 lib/librte_gso/gso_common.h
create mode 100644 lib/librte_gso/gso_tcp.c
create mode 100644 lib/librte_gso/gso_tcp.h

diff --git a/lib/librte_eal/common/include/rte_log.h b/lib/librte_eal/common/include/rte_log.h
index ec8dba7..2fa1199 100644
--- a/lib/librte_eal/common/include/rte_log.h
+++ b/lib/librte_eal/common/include/rte_log.h
@@ -87,6 +87,7 @@ extern struct rte_logs rte_logs;
#define RTE_LOGTYPE_CRYPTODEV 17 /**< Log related to cryptodev. */
#define RTE_LOGTYPE_EFD 18 /**< Log related to EFD. */
#define RTE_LOGTYPE_EVENTDEV 19 /**< Log related to eventdev. */
+#define RTE_LOGTYPE_GSO 20 /**< Log related to GSO. */

/* these log types can be used in an application */
#define RTE_LOGTYPE_USER1 24 /**< User-defined log type 1. */
diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile
index aeaacbc..0f8e38f 100644
--- a/lib/librte_gso/Makefile
+++ b/lib/librte_gso/Makefile
@@ -42,6 +42,8 @@ LIBABIVER := 1

#source files
SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_common.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tcp.c

# install this header file
SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h
diff --git a/lib/librte_gso/gso_common.c b/lib/librte_gso/gso_common.c
new file mode 100644
index 0000000..2b54fbd
--- /dev/null
+++ b/lib/librte_gso/gso_common.c
@@ -0,0 +1,270 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdbool.h>
+#include <string.h>
+
+#include <rte_malloc.h>
+
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+
+#include "gso_common.h"
+
+static inline void
+hdr_segment_init(struct rte_mbuf *hdr_segment, struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset)
+{
+ /* copy mbuf metadata */
+ hdr_segment->nb_segs = 1;
+ hdr_segment->port = pkt->port;
+ hdr_segment->ol_flags = pkt->ol_flags;
+ hdr_segment->packet_type = pkt->packet_type;
+ hdr_segment->pkt_len = pkt_hdr_offset;
+ hdr_segment->data_len = pkt_hdr_offset;
+ hdr_segment->tx_offload = pkt->tx_offload;
+ /* copy packet header */
+ rte_memcpy(rte_pktmbuf_mtod(hdr_segment, char *),
+ rte_pktmbuf_mtod(pkt, char *),
+ pkt_hdr_offset);
+}
+
+static inline void
+free_gso_segment(struct rte_mbuf **pkts, uint16_t nb_pkts)
+{
+ uint16_t i;
+
+ for (i = 0; i < nb_pkts; i++) {
+ rte_pktmbuf_detach(pkts[i]->next);
+ rte_pktmbuf_free(pkts[i]);
+ pkts[i] = NULL;
+ }
+}
+
+int
+gso_do_segment(struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset,
+ uint16_t pyld_unit_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ struct rte_mbuf *pkt_in;
+ struct rte_mbuf *hdr_segment, *pyld_segment;
+ uint32_t pkt_in_pyld_off;
+ uint16_t pkt_in_segment_len, pkt_out_segment_len;
+ uint16_t nb_segs;
+ bool pkt_in_segment_processed;
+
+ pkt_in_pyld_off = pkt->data_off + pkt_hdr_offset;
+ pkt_in = pkt;
+ nb_segs = 0;
+
+ while (pkt_in) {
+ pkt_in_segment_processed = false;
+ pkt_in_segment_len = pkt_in->data_off + pkt_in->data_len;
+
+ while (!pkt_in_segment_processed) {
+ if (unlikely(nb_segs >= nb_pkts_out)) {
+ free_gso_segment(pkts_out, nb_segs);
+ return -EINVAL;
+ }
+
+ /* allocate direct mbuf */
+ hdr_segment = rte_pktmbuf_alloc(direct_pool);
+ if (unlikely(hdr_segment == NULL)) {
+ free_gso_segment(pkts_out, nb_segs);
+ return -ENOMEM;
+ }
+
+ /* allocate indirect mbuf */
+ pyld_segment = rte_pktmbuf_alloc(indirect_pool);
+ if (unlikely(pyld_segment == NULL)) {
+ rte_pktmbuf_free(hdr_segment);
+ free_gso_segment(pkts_out, nb_segs);
+ return -ENOMEM;
+ }
+
+ /* copy packet header */
+ hdr_segment_init(hdr_segment, pkt, pkt_hdr_offset);
+
+ /* attach payload mbuf to current packet segment */
+ rte_pktmbuf_attach(pyld_segment, pkt_in);
+
+ hdr_segment->next = pyld_segment;
+ pkts_out[nb_segs++] = hdr_segment;
+
+ /* calculate payload length */
+ pkt_out_segment_len = pyld_unit_size;
+ if (pkt_in_pyld_off + pkt_out_segment_len >
+ pkt_in_segment_len) {
+ pkt_out_segment_len = pkt_in_segment_len -
+ pkt_in_pyld_off;
+ }
+
+ /* update payload segment */
+ pyld_segment->data_off = pkt_in_pyld_off;
+ pyld_segment->data_len = pkt_out_segment_len;
+
+ /* update header segment */
+ hdr_segment->pkt_len += pyld_segment->data_len;
+ hdr_segment->nb_segs++;
+
+ /* update pkt_in_pyld_off */
+ pkt_in_pyld_off += pkt_out_segment_len;
+ if (pkt_in_pyld_off == pkt_in_segment_len)
+ pkt_in_segment_processed = true;
+ }
+
+ /* 'pkt_in' may contain numerous segments */
+ pkt_in = pkt_in->next;
+ if (pkt_in != NULL)
+ pkt_in_pyld_off = pkt_in->data_off;
+ }
+ return nb_segs;
+}
+
+static inline void
+parse_ipv4(struct ipv4_hdr *ipv4_hdr, struct rte_mbuf *pkt)
+{
+ struct tcp_hdr *tcp_hdr;
+
+ switch (ipv4_hdr->next_proto_id) {
+ case IPPROTO_TCP:
+ pkt->packet_type |= RTE_PTYPE_L4_TCP;
+ pkt->l3_len = IPv4_HDR_LEN(ipv4_hdr);
+ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
+ pkt->l4_len = TCP_HDR_LEN(tcp_hdr);
+ break;
+ }
+}
+
+static inline void
+parse_ethernet(struct ether_hdr *eth_hdr, struct rte_mbuf *pkt)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ struct vlan_hdr *vlan_hdr;
+ uint16_t ethertype;
+
+ ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
+ if (ethertype == ETHER_TYPE_VLAN) {
+ vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
+ pkt->l2_len = sizeof(struct vlan_hdr);
+ pkt->packet_type |= RTE_PTYPE_L2_ETHER_VLAN;
+ ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
+ }
+
+ switch (ethertype) {
+ case ETHER_TYPE_IPv4:
+ if (IS_VLAN_PKT(pkt)) {
+ pkt->packet_type |= RTE_PTYPE_L3_IPV4;
+ } else {
+ pkt->packet_type |= RTE_PTYPE_L2_ETHER;
+ pkt->packet_type |= RTE_PTYPE_L3_IPV4;
+ }
+ pkt->l2_len += sizeof(struct ether_hdr);
+ ipv4_hdr = (struct ipv4_hdr *) ((char *)eth_hdr +
+ pkt->l2_len);
+ parse_ipv4(ipv4_hdr, pkt);
+ break;
+ }
+}
+
+void
+gso_parse_packet(struct rte_mbuf *pkt)
+{
+ struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+
+ pkt->packet_type = pkt->tx_offload = 0;
+ parse_ethernet(eth_hdr, pkt);
+}
+
+static inline void
+update_ipv4_header(char *base, uint16_t offset, uint16_t length, uint16_t id)
+{
+ struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)(base + offset);
+
+ ipv4_hdr->total_length = rte_cpu_to_be_16(length - offset);
+ ipv4_hdr->packet_id = rte_cpu_to_be_16(id);
+}
+
+static inline void
+update_tcp_header(char *base, uint16_t offset, uint32_t sent_seq,
+ uint8_t non_tail)
+{
+ struct tcp_hdr *tcp_hdr = (struct tcp_hdr *)(base + offset);
+
+ tcp_hdr->sent_seq = rte_cpu_to_be_32(sent_seq);
+ /* clean FIN and PSH for non-tail segments */
+ if (non_tail)
+ tcp_hdr->tcp_flags &= (~(TCP_HDR_PSH_MASK | TCP_HDR_FIN_MASK));
+}
+
+void
+gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
+ struct rte_mbuf **out_segments)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ struct tcp_hdr *tcp_hdr;
+ struct rte_mbuf *seg;
+ uint32_t sent_seq;
+ uint16_t offset, i;
+ uint16_t tail_seg_idx = nb_segments - 1, id;
+
+ switch (pkt->packet_type) {
+ case ETHER_VLAN_IPv4_TCP_PKT:
+ case ETHER_IPv4_TCP_PKT:
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ pkt->l2_len);
+ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
+ id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
+ sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
+
+ for (i = 0; i < nb_segments; i++) {
+ seg = out_segments[i];
+
+ offset = seg->l2_len;
+ update_ipv4_header(rte_pktmbuf_mtod(seg, char *),
+ offset, seg->pkt_len, id);
+ id++;
+
+ offset += seg->l3_len;
+ update_tcp_header(rte_pktmbuf_mtod(seg, char *),
+ offset, sent_seq, i < tail_seg_idx);
+ sent_seq += seg->next->data_len;
+ }
+ break;
+ }
+}
diff --git a/lib/librte_gso/gso_common.h b/lib/librte_gso/gso_common.h
new file mode 100644
index 0000000..d750041
--- /dev/null
+++ b/lib/librte_gso/gso_common.h
@@ -0,0 +1,120 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_COMMON_H_
+#define _GSO_COMMON_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+#define IPV4_HDR_DF_SHIFT 14
+#define IPV4_HDR_DF_MASK (1 << IPV4_HDR_DF_SHIFT)
+#define IPv4_HDR_LEN(iph) ((iph->version_ihl & 0x0f) * 4)
+
+#define TCP_HDR_PSH_MASK ((uint8_t)0x08)
+#define TCP_HDR_FIN_MASK ((uint8_t)0x01)
+#define TCP_HDR_LEN(tcph) ((tcph->data_off & 0xf0) >> 2)
+
+#define ETHER_IPv4_PKT (RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4)
+/* Supported packet types */
+/* TCP/IPv4 packet. */
+#define ETHER_IPv4_TCP_PKT (ETHER_IPv4_PKT | RTE_PTYPE_L4_TCP)
+
+/* TCP/IPv4 packet with VLAN tag. */
+#define ETHER_VLAN_IPv4_TCP_PKT (RTE_PTYPE_L2_ETHER_VLAN | \
+ RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP)
+
+#define IS_VLAN_PKT(pkt) ((pkt->packet_type & RTE_PTYPE_L2_ETHER_VLAN) == \
+ RTE_PTYPE_L2_ETHER_VLAN)
+
+/**
+ * Internal function which parses a packet, setting outer_l2/l3_len and
+ * l2/l3/l4_len and packet_type.
+ *
+ * @param pkt
+ * Packet to parse.
+ */
+void gso_parse_packet(struct rte_mbuf *pkt);
+
+/**
+ * Internal function which updates relevant packet headers, following
+ * segmentation. This is required to update, for example, the IPv4
+ * 'total_length' field, to reflect the reduced length of the now-
+ * segmented packet.
+ *
+ * @param pkt
+ * The original packet.
+ * @param nb_segments
+ * The number of GSO segments into which pkt was split.
+ * @param out_segments
+ * Pointer array used for storing mbuf addresses for GSO segments.
+ */
+void gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
+ struct rte_mbuf **out_segments);
+
+/**
+ * Internal function which divides the input packet into small segments.
+ * Each of the newly-created segments is organized as a two-segment mbuf,
+ * where the first segment is a standard mbuf, which stores a copy of
+ * packet header, and the second is an indirect mbuf which points to a
+ * section of data in the input packet.
+ *
+ * @param pkt
+ * Packet to segment.
+ * @param pkt_hdr_offset
+ * Packet header offset, measured in bytes.
+ * @param pyld_unit_size
+ * The max payload length of a GSO segment.
+ * @param direct_pool
+ * MBUF pool used for allocating direct buffers for output segments.
+ * @param indirect_pool
+ * MBUF pool used for allocating indirect buffers for output segments.
+ * @param pkts_out
+ * Pointer array used to keep the mbuf addresses of output segments.
+ * @param nb_pkts_out
+ * The max number of items that pkts_out can keep.
+ *
+ * @return
+ * - The number of segments created in the event of success.
+ * - If no GSO is performed, return 1.
+ * - If available memory in mempools is insufficient, return -ENOMEM.
+ * - -EINVAL for invalid parameters
+ */
+int gso_do_segment(struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset,
+ uint16_t pyld_unit_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+#endif
diff --git a/lib/librte_gso/gso_tcp.c b/lib/librte_gso/gso_tcp.c
new file mode 100644
index 0000000..9d5fc30
--- /dev/null
+++ b/lib/librte_gso/gso_tcp.c
@@ -0,0 +1,82 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+
+#include "gso_common.h"
+#include "gso_tcp.h"
+
+int
+gso_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ struct ether_hdr *eth_hdr;
+ struct ipv4_hdr *ipv4_hdr;
+ uint16_t tcp_dl;
+ uint16_t pyld_unit_size;
+ uint16_t hdr_offset;
+ int ret = 1;
+
+ eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+ ipv4_hdr = (struct ipv4_hdr *)((char *)eth_hdr + pkt->l2_len);
+
+ /* don't process fragmented packet */
+ if ((ipv4_hdr->fragment_offset &
+ rte_cpu_to_be_16(IPV4_HDR_DF_MASK)) == 0)
+ return ret;
+
+ tcp_dl = rte_be_to_cpu_16(ipv4_hdr->total_length) -
+ pkt->l3_len - pkt->l4_len;
+ /* don't process packet without data */
+ if (tcp_dl == 0)
+ return ret;
+
+ hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
+
+ /* segment the payload */
+ ret = gso_do_segment(pkt, hdr_offset, pyld_unit_size, direct_pool,
+ indirect_pool, pkts_out, nb_pkts_out);
+
+ if (ret > 1)
+ gso_update_pkt_headers(pkt, ret, pkts_out);
+
+ return ret;
+}
diff --git a/lib/librte_gso/gso_tcp.h b/lib/librte_gso/gso_tcp.h
new file mode 100644
index 0000000..f291ccb
--- /dev/null
+++ b/lib/librte_gso/gso_tcp.h
@@ -0,0 +1,73 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_TCP_H_
+#define _GSO_TCP_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+/**
+ * Segment an IPv4/TCP packet. This function assumes the input packet has
+ * correct checksums, and doesn't update checksums for GSO segments.
+ * Furthermore, it doesn't process IP fragmented packets.
+ *
+ * @param pkt
+ * The packet mbuf to segment.
+ * @param gso_size
+ * The max length of a GSO segment, measured in bytes.
+ * @param direct_pool
+ * MBUF pool used for allocating direct buffers for output segments.
+ * @param indirect_pool
+ * MBUF pool used for allocating indirect buffers for output segments.
+ * @param pkts_out
+ * Pointer array, which is used to store mbuf addresses of GSO segments.
+ * Caller should guarantee that 'pkts_out' is sufficiently large to store
+ * all GSO segments.
+ * @param nb_pkts_out
+ * The max number of items that 'pkts_out' can keep.
+ *
+ * @return
+ * - The number of GSO segments on success.
+ * - Return 1 if no GSO is performed.
+ * - Return -ENOMEM if available memory in mempools is insufficient.
+ * - Return -EINVAL for invalid parameters.
+ */
+int gso_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+
+#endif
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index b81afce..fac95f2 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -31,17 +31,57 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

+#include <rte_log.h>
+
#include "rte_gso.h"
+#include "gso_common.h"
+#include "gso_tcp.h"

int
rte_gso_segment(struct rte_mbuf *pkt,
struct rte_gso_ctx gso_ctx,
struct rte_mbuf **pkts_out,
- uint16_t nb_pkts_out __rte_unused)
+ uint16_t nb_pkts_out)
{
+ struct rte_mempool *direct_pool, *indirect_pool;
+ struct rte_mbuf *pkt_seg;
+ uint16_t nb_segments, gso_size;
+
if (pkt == NULL || pkts_out == NULL || gso_ctx.direct_pool ==
NULL || gso_ctx.indirect_pool == NULL)
return -EINVAL;

- return 1;
+ if ((gso_ctx.gso_types & RTE_GSO_TCP_IPV4) == 0 ||
+ gso_ctx.gso_size >= pkt->pkt_len ||
+ gso_ctx.gso_size == 0)
+ return 1;
+
+ pkt_seg = pkt;
+ gso_size = gso_ctx.gso_size;
+ direct_pool = gso_ctx.direct_pool;
+ indirect_pool = gso_ctx.indirect_pool;
+
+ /* Parse packet headers to determine how to segment 'pkt' */
+ gso_parse_packet(pkt);
+
+ switch (pkt->packet_type) {
+ case ETHER_VLAN_IPv4_TCP_PKT:
+ case ETHER_IPv4_TCP_PKT:
+ nb_segments = gso_tcp4_segment(pkt, gso_size,
+ direct_pool, indirect_pool,
+ pkts_out, nb_pkts_out);
+ break;
+ default:
+ RTE_LOG(WARNING, GSO, "Unsupported packet type\n");
+ nb_segments = 1;
+ }
+
+ if (nb_segments > 1) {
+ while (pkt_seg) {
+ rte_mbuf_refcnt_update(pkt_seg, -1);
+ pkt_seg = pkt_seg->next;
+ }
+ }
+
+ return nb_segments;
}
diff --git a/lib/librte_gso/rte_gso.h b/lib/librte_gso/rte_gso.h
index 5a8389a..77853fa 100644
--- a/lib/librte_gso/rte_gso.h
+++ b/lib/librte_gso/rte_gso.h
@@ -46,6 +46,9 @@ extern "C" {
#include <stdint.h>
#include <rte_mbuf.h>

+#define RTE_GSO_TCP_IPV4 (1ULL << 0)
+/**< GSO flag for TCP/IPv4 packets (containing optional VLAN tag) */
+
/**
* GSO context structure.
*/
--
2.7.4
Ananyev, Konstantin
2017-08-30 01:38:33 UTC
Permalink
-----Original Message-----
From: Hu, Jiayu
Sent: Thursday, August 24, 2017 3:16 PM
Subject: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
This patch adds GSO support for TCP/IPv4 packets. Supported packets
may include a single VLAN tag. TCP/IPv4 GSO assumes that all input
packets have correct checksums, and doesn't update checksums for output
packets (the responsibility for this lies with the application).
Additionally, TCP/IPv4 GSO doesn't process IP fragmented packets.
TCP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one indirect
MBUF, to organize an output packet. Note that we refer to these two
chained MBUFs as a two-segment MBUF. The direct MBUF stores the packet
header, while the indirect mbuf simply points to a location within the
original packet's payload. Consequently, use of the GSO library requires
multi-segment MBUF support in the TX functions of the NIC driver.
If a packet is GSOed, TCP/IPv4 GSO reduces its MBUF refcnt by 1. As a
result, when all of its GSOed segments are freed, the packet is freed
automatically.
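
To illustrate the two-segment layout described above, a minimal sketch
follows (illustrative only: the helper name and the omitted metadata
copy and error handling are not part of the patch):

#include <rte_mbuf.h>
#include <rte_memcpy.h>

/* Sketch: build one GSO output segment for input packet 'pkt'.
 * 'hdr_len' is l2_len + l3_len + l4_len; 'pyld_off'/'pyld_len' select
 * the payload window. Metadata copy and NULL checks omitted. */
static struct rte_mbuf *
make_gso_segment(struct rte_mbuf *pkt, uint16_t hdr_len,
		uint16_t pyld_off, uint16_t pyld_len,
		struct rte_mempool *direct_pool,
		struct rte_mempool *indirect_pool)
{
	struct rte_mbuf *hdr = rte_pktmbuf_alloc(direct_pool);
	struct rte_mbuf *pyld = rte_pktmbuf_alloc(indirect_pool);

	/* direct mbuf: holds a copy of the packet header */
	rte_memcpy(rte_pktmbuf_mtod(hdr, char *),
			rte_pktmbuf_mtod(pkt, char *), hdr_len);
	hdr->data_len = hdr_len;
	hdr->pkt_len = hdr_len;

	/* indirect mbuf: points into the original payload */
	rte_pktmbuf_attach(pyld, pkt);
	pyld->data_off = pyld_off;
	pyld->data_len = pyld_len;

	/* chain them into a two-segment output packet */
	hdr->next = pyld;
	hdr->nb_segs = 2;
	hdr->pkt_len += pyld_len;
	return hdr;
}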
---
lib/librte_eal/common/include/rte_log.h | 1 +
lib/librte_gso/Makefile | 2 +
lib/librte_gso/gso_common.c | 270 ++++++++++++++++++++++++++++++++
lib/librte_gso/gso_common.h | 120 ++++++++++++++
lib/librte_gso/gso_tcp.c | 82 ++++++++++
lib/librte_gso/gso_tcp.h | 73 +++++++++
lib/librte_gso/rte_gso.c | 44 +++++-
lib/librte_gso/rte_gso.h | 3 +
8 files changed, 593 insertions(+), 2 deletions(-)
create mode 100644 lib/librte_gso/gso_common.c
create mode 100644 lib/librte_gso/gso_common.h
create mode 100644 lib/librte_gso/gso_tcp.c
create mode 100644 lib/librte_gso/gso_tcp.h
diff --git a/lib/librte_eal/common/include/rte_log.h b/lib/librte_eal/common/include/rte_log.h
index ec8dba7..2fa1199 100644
--- a/lib/librte_eal/common/include/rte_log.h
+++ b/lib/librte_eal/common/include/rte_log.h
@@ -87,6 +87,7 @@ extern struct rte_logs rte_logs;
#define RTE_LOGTYPE_CRYPTODEV 17 /**< Log related to cryptodev. */
#define RTE_LOGTYPE_EFD 18 /**< Log related to EFD. */
#define RTE_LOGTYPE_EVENTDEV 19 /**< Log related to eventdev. */
+#define RTE_LOGTYPE_GSO 20 /**< Log related to GSO. */
/* these log types can be used in an application */
#define RTE_LOGTYPE_USER1 24 /**< User-defined log type 1. */
diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile
index aeaacbc..0f8e38f 100644
--- a/lib/librte_gso/Makefile
+++ b/lib/librte_gso/Makefile
@@ -42,6 +42,8 @@ LIBABIVER := 1
#source files
SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_common.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tcp.c
# install this header file
SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h
diff --git a/lib/librte_gso/gso_common.c b/lib/librte_gso/gso_common.c
new file mode 100644
index 0000000..2b54fbd
--- /dev/null
+++ b/lib/librte_gso/gso_common.c
@@ -0,0 +1,270 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdbool.h>
+#include <string.h>
+
+#include <rte_malloc.h>
+
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+
+#include "gso_common.h"
+
+static inline void
+hdr_segment_init(struct rte_mbuf *hdr_segment, struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset)
+{
+ /* copy mbuf metadata */
+ hdr_segment->nb_segs = 1;
+ hdr_segment->port = pkt->port;
+ hdr_segment->ol_flags = pkt->ol_flags;
+ hdr_segment->packet_type = pkt->packet_type;
+ hdr_segment->pkt_len = pkt_hdr_offset;
+ hdr_segment->data_len = pkt_hdr_offset;
+ hdr_segment->tx_offload = pkt->tx_offload;
+ /* copy packet header */
+ rte_memcpy(rte_pktmbuf_mtod(hdr_segment, char *),
+ rte_pktmbuf_mtod(pkt, char *),
+ pkt_hdr_offset);
+}
+
+static inline void
+free_gso_segment(struct rte_mbuf **pkts, uint16_t nb_pkts)
+{
+ uint16_t i;
+
+ for (i = 0; i < nb_pkts; i++) {
+ rte_pktmbuf_detach(pkts[i]->next);
I don't think you need to call detach() here explicitly.
Just rte_pktmbuf_free(pkts[i]) should do, I think.
+ rte_pktmbuf_free(pkts[i]);
+ pkts[i] = NULL;
+ }
+}
+
+int
+gso_do_segment(struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset,
+ uint16_t pyld_unit_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ struct rte_mbuf *pkt_in;
+ struct rte_mbuf *hdr_segment, *pyld_segment;
+ uint32_t pkt_in_pyld_off;
+ uint16_t pkt_in_segment_len, pkt_out_segment_len;
+ uint16_t nb_segs;
+ bool pkt_in_segment_processed;
+
+ pkt_in_pyld_off = pkt->data_off + pkt_hdr_offset;
+ pkt_in = pkt;
+ nb_segs = 0;
+
+ while (pkt_in) {
+ pkt_in_segment_processed = false;
+ pkt_in_segment_len = pkt_in->data_off + pkt_in->data_len;
+
+ while (!pkt_in_segment_processed) {
+ if (unlikely(nb_segs >= nb_pkts_out)) {
+ free_gso_segment(pkts_out, nb_segs);
+ return -EINVAL;
+ }
+
+ /* allocate direct mbuf */
+ hdr_segment = rte_pktmbuf_alloc(direct_pool);
+ if (unlikely(hdr_segment == NULL)) {
+ free_gso_segment(pkts_out, nb_segs);
+ return -ENOMEM;
+ }
+
+ /* allocate indirect mbuf */
+ pyld_segment = rte_pktmbuf_alloc(indirect_pool);
+ if (unlikely(pyld_segment == NULL)) {
+ rte_pktmbuf_free(hdr_segment);
+ free_gso_segment(pkts_out, nb_segs);
+ return -ENOMEM;
+ }
So, if I understand correctly, each new packet would always contain just one data segment?
Why couldn't several data segments be chained together (if the sum of their data_len <= mss)?
In the same way as done here:
http://dpdk.org/browse/dpdk/tree/lib/librte_ip_frag/rte_ipv4_fragmentation.c#n93
or here:
https://gerrit.fd.io/r/gitweb?p=tldk.git;a=blob;f=lib/libtle_l4p/tcp_tx_seg.h;h=a8d2425597a7ad6f598aa4bb7fcd7f1da74305f0;hb=HEAD#l23
?
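For reference, a chained variant in the spirit of the code linked above
could look like the following sketch ('hdr', 'pkt_in', 'in_off', 'mss'
and 'indirect_pool' are assumed local state, not names from the patch;
allocation-failure handling omitted):

	uint16_t left = mss;
	struct rte_mbuf *tail = hdr;

	while (left > 0 && pkt_in != NULL) {
		struct rte_mbuf *pyld = rte_pktmbuf_alloc(indirect_pool);
		uint16_t take = RTE_MIN(left,
				(uint16_t)(pkt_in->data_len - in_off));

		/* window into the current input segment */
		rte_pktmbuf_attach(pyld, pkt_in);
		pyld->data_off = pkt_in->data_off + in_off;
		pyld->data_len = take;

		/* chain one more data segment onto the output packet */
		tail->next = pyld;
		tail = pyld;
		hdr->nb_segs++;
		hdr->pkt_len += take;

		left -= take;
		in_off += take;
		if (in_off == pkt_in->data_len) {
			/* input segment consumed, move to the next one */
			pkt_in = pkt_in->next;
			in_off = 0;
		}
	}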
+
+ /* copy packet header */
+ hdr_segment_init(hdr_segment, pkt, pkt_hdr_offset);
+
+ /* attach payload mbuf to current packet segment */
+ rte_pktmbuf_attach(pyld_segment, pkt_in);
+
+ hdr_segment->next = pyld_segment;
+ pkts_out[nb_segs++] = hdr_segment;
+
+ /* calculate payload length */
+ pkt_out_segment_len = pyld_unit_size;
+ if (pkt_in_pyld_off + pkt_out_segment_len >
+ pkt_in_segment_len) {
+ pkt_out_segment_len = pkt_in_segment_len -
+ pkt_in_pyld_off;
+ }
+
+ /* update payload segment */
+ pyld_segment->data_off = pkt_in_pyld_off;
+ pyld_segment->data_len = pkt_out_segment_len;
+
+ /* update header segment */
+ hdr_segment->pkt_len += pyld_segment->data_len;
+ hdr_segment->nb_segs++;
+
+ /* update pkt_in_pyld_off */
+ pkt_in_pyld_off += pkt_out_segment_len;
+ if (pkt_in_pyld_off == pkt_in_segment_len)
+ pkt_in_segment_processed = true;
+ }
+
+ /* 'pkt_in' may contain numerous segments */
+ pkt_in = pkt_in->next;
+ if (pkt_in != NULL)
+ pkt_in_pyld_off = pkt_in->data_off;
+ }
+ return nb_segs;
+}
+
+static inline void
+parse_ipv4(struct ipv4_hdr *ipv4_hdr, struct rte_mbuf *pkt)
+{
+ struct tcp_hdr *tcp_hdr;
+
+ switch (ipv4_hdr->next_proto_id) {
+ case IPPROTO_TCP:
+ pkt->packet_type |= RTE_PTYPE_L4_TCP;
+ pkt->l3_len = IPv4_HDR_LEN(ipv4_hdr);
+ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
+ pkt->l4_len = TCP_HDR_LEN(tcp_hdr);
+ break;
+ }
+}
+
+static inline void
+parse_ethernet(struct ether_hdr *eth_hdr, struct rte_mbuf *pkt)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ struct vlan_hdr *vlan_hdr;
+ uint16_t ethertype;
+
+ ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
+ if (ethertype == ETHER_TYPE_VLAN) {
+ vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
+ pkt->l2_len = sizeof(struct vlan_hdr);
+ pkt->packet_type |= RTE_PTYPE_L2_ETHER_VLAN;
+ ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
+ }
+
+ switch (ethertype) {
+ case ETHER_TYPE_IPv4:
+ if (IS_VLAN_PKT(pkt)) {
+ pkt->packet_type |= RTE_PTYPE_L3_IPV4;
+ } else {
+ pkt->packet_type |= RTE_PTYPE_L2_ETHER;
+ pkt->packet_type |= RTE_PTYPE_L3_IPV4;
+ }
+ pkt->l2_len += sizeof(struct ether_hdr);
+ ipv4_hdr = (struct ipv4_hdr *) ((char *)eth_hdr +
+ pkt->l2_len);
+ parse_ipv4(ipv4_hdr, pkt);
+ break;
+ }
+}
+
+void
+gso_parse_packet(struct rte_mbuf *pkt)
There is a function rte_net_get_ptype() that is supposed to provide similar functionality.
So we probably don't need to create a new SW parse function here; it would be better
to reuse (and update if needed) the existing one.
Also, the user might already have the l2/l3/l4_len fields and packet_type set up.
So it's better to keep SW packet parsing out of the scope of this library.
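For context, the application-side usage could be as simple as the sketch
below ('m' is the mbuf about to be segmented; caller code, not part of
this patch), run once before handing the mbuf to the GSO library:

#include <rte_net.h>

struct rte_net_hdr_lens hdr_lens;

/* fill packet_type and l2/l3/l4_len from the existing SW parser */
m->packet_type = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
m->l2_len = hdr_lens.l2_len;
m->l3_len = hdr_lens.l3_len;
m->l4_len = hdr_lens.l4_len;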
+{
+ struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+
+ pkt->packet_type = pkt->tx_offload = 0;
+ parse_ethernet(eth_hdr, pkt);
+}
+
+static inline void
+update_ipv4_header(char *base, uint16_t offset, uint16_t length, uint16_t id)
+{
+ struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)(base + offset);
+
+ ipv4_hdr->total_length = rte_cpu_to_be_16(length - offset);
+ ipv4_hdr->packet_id = rte_cpu_to_be_16(id);
+}
+
+static inline void
+update_tcp_header(char *base, uint16_t offset, uint32_t sent_seq,
+ uint8_t non_tail)
+{
+ struct tcp_hdr *tcp_hdr = (struct tcp_hdr *)(base + offset);
+
+ tcp_hdr->sent_seq = rte_cpu_to_be_32(sent_seq);
+ /* clean FIN and PSH for non-tail segments */
+ if (non_tail)
+ tcp_hdr->tcp_flags &= (~(TCP_HDR_PSH_MASK | TCP_HDR_FIN_MASK));
+}
+
+void
+gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
+ struct rte_mbuf **out_segments)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ struct tcp_hdr *tcp_hdr;
+ struct rte_mbuf *seg;
+ uint32_t sent_seq;
+ uint16_t offset, i;
+ uint16_t tail_seg_idx = nb_segments - 1, id;
+
+ switch (pkt->packet_type) {
It might be worth putting the code below in a separate function:
update_inner_tcp_hdr(..) or so.
Then you can reuse it for tunneled cases too.
+ case ETHER_VLAN_IPv4_TCP_PKT:
+ case ETHER_IPv4_TCP_PKT:
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ pkt->l2_len);
+ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
+ id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
+ sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
+
+ for (i = 0; i < nb_segments; i++) {
+ seg = out_segments[i];
+
+ offset = seg->l2_len;
+ update_ipv4_header(rte_pktmbuf_mtod(seg, char *),
+ offset, seg->pkt_len, id);
+ id++;
Who would be responsible for making sure that we don't have consecutive packets with the same IPv4 id?
Would it be the upper layer that forms the packet, or the GSO library, or ...?
+
+ offset += seg->l3_len;
+ update_tcp_header(rte_pktmbuf_mtod(seg, char *),
+ offset, sent_seq, i < tail_seg_idx);
+ sent_seq += seg->next->data_len;
+ }
+ break;
+ }
+}
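A possible shape for the helper suggested above, as a sketch only (the
name follows the comment; it reuses update_ipv4_header() and
update_tcp_header() from this patch, and a tunneled caller would pass
the inner L2 offset instead of seg->l2_len):

static inline void
update_inner_tcp4_hdr(struct rte_mbuf **segs, uint16_t nb_segs,
		uint16_t id, uint32_t sent_seq)
{
	uint16_t i, offset;

	for (i = 0; i < nb_segs; i++) {
		struct rte_mbuf *seg = segs[i];

		/* shrink total_length and set a fresh IP id */
		offset = seg->l2_len;
		update_ipv4_header(rte_pktmbuf_mtod(seg, char *),
				offset, seg->pkt_len, id++);

		/* advance the sequence number; PSH/FIN cleared on non-tail */
		offset += seg->l3_len;
		update_tcp_header(rte_pktmbuf_mtod(seg, char *),
				offset, sent_seq, i < nb_segs - 1);
		sent_seq += seg->next->data_len;
	}
}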
diff --git a/lib/librte_gso/gso_common.h b/lib/librte_gso/gso_common.h
new file mode 100644
index 0000000..d750041
--- /dev/null
+++ b/lib/librte_gso/gso_common.h
@@ -0,0 +1,120 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_COMMON_H_
+#define _GSO_COMMON_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+#define IPV4_HDR_DF_SHIFT 14
+#define IPV4_HDR_DF_MASK (1 << IPV4_HDR_DF_SHIFT)
+#define IPv4_HDR_LEN(iph) ((iph->version_ihl & 0x0f) * 4)
+
+#define TCP_HDR_PSH_MASK ((uint8_t)0x08)
+#define TCP_HDR_FIN_MASK ((uint8_t)0x01)
+#define TCP_HDR_LEN(tcph) ((tcph->data_off & 0xf0) >> 2)
+
+#define ETHER_IPv4_PKT (RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4)
+/* Supported packet types */
+/* TCP/IPv4 packet. */
+#define ETHER_IPv4_TCP_PKT (ETHER_IPv4_PKT | RTE_PTYPE_L4_TCP)
+
+/* TCP/IPv4 packet with VLAN tag. */
+#define ETHER_VLAN_IPv4_TCP_PKT (RTE_PTYPE_L2_ETHER_VLAN | \
+ RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP)
+
+#define IS_VLAN_PKT(pkt) ((pkt->packet_type & RTE_PTYPE_L2_ETHER_VLAN) == \
+ RTE_PTYPE_L2_ETHER_VLAN)
+
+/**
+ * Internal function which parses a packet, setting outer_l2/l3_len and
+ * l2/l3/l4_len and packet_type.
+ *
+ * @param pkt
+ * Packet to parse.
+ */
+void gso_parse_packet(struct rte_mbuf *pkt);
+
+/**
+ * Internal function which updates relevant packet headers, following
+ * segmentation. This is required to update, for example, the IPv4
+ * 'total_length' field, to reflect the reduced length of the now-
+ * segmented packet.
+ *
+ * @param pkt
+ * The original packet.
+ * @param nb_segments
+ * The number of GSO segments into which pkt was split.
+ * @param out_segments
+ * Pointer array used for storing mbuf addresses for GSO segments.
+ */
+void gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
+ struct rte_mbuf **out_segments);
+
+/**
+ * Internal function which divides the input packet into small segments.
+ * Each of the newly-created segments is organized as a two-segment mbuf,
+ * where the first segment is a standard mbuf, which stores a copy of
+ * packet header, and the second is an indirect mbuf which points to a
+ * section of data in the input packet.
+ *
+ * @param pkt
+ * Packet to segment.
+ * @param pkt_hdr_offset
+ * Packet header offset, measured in bytes.
+ * @param pyld_unit_size
+ * The max payload length of a GSO segment.
+ * @param direct_pool
+ * MBUF pool used for allocating direct buffers for output segments.
+ * @param indirect_pool
+ * MBUF pool used for allocating indirect buffers for output segments.
+ * @param pkts_out
+ * Pointer array used to keep the mbuf addresses of output segments.
+ * @param nb_pkts_out
+ * The max number of items that pkts_out can keep.
+ *
+ * @return
+ * - The number of segments created in the event of success.
+ * - If no GSO is performed, return 1.
+ * - If available memory in mempools is insufficient, return -ENOMEM.
+ * - -EINVAL for invalid parameters
+ */
+int gso_do_segment(struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset,
+ uint16_t pyld_unit_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+#endif
diff --git a/lib/librte_gso/gso_tcp.c b/lib/librte_gso/gso_tcp.c
new file mode 100644
index 0000000..9d5fc30
--- /dev/null
+++ b/lib/librte_gso/gso_tcp.c
@@ -0,0 +1,82 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+
+#include "gso_common.h"
+#include "gso_tcp.h"
+
+int
+gso_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ struct ether_hdr *eth_hdr;
+ struct ipv4_hdr *ipv4_hdr;
+ uint16_t tcp_dl;
+ uint16_t pyld_unit_size;
+ uint16_t hdr_offset;
+ int ret = 1;
+
+ eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+ ipv4_hdr = (struct ipv4_hdr *)((char *)eth_hdr + pkt->l2_len);
+
+ /* don't process fragmented packet */
+ if ((ipv4_hdr->fragment_offset &
+ rte_cpu_to_be_16(IPV4_HDR_DF_MASK)) == 0)
+ return ret;
+
+ tcp_dl = rte_be_to_cpu_16(ipv4_hdr->total_length) -
+ pkt->l3_len - pkt->l4_len;
+ /* don't process packet without data */
+ if (tcp_dl == 0)
+ return ret;
+
+ hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
+
+ /* segment the payload */
+ ret = gso_do_segment(pkt, hdr_offset, pyld_unit_size, direct_pool,
+ indirect_pool, pkts_out, nb_pkts_out);
+
+ if (ret > 1)
+ gso_update_pkt_headers(pkt, ret, pkts_out);
+
+ return ret;
+}
diff --git a/lib/librte_gso/gso_tcp.h b/lib/librte_gso/gso_tcp.h
new file mode 100644
index 0000000..f291ccb
--- /dev/null
+++ b/lib/librte_gso/gso_tcp.h
@@ -0,0 +1,73 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_TCP_H_
+#define _GSO_TCP_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+/**
+ * Segment an IPv4/TCP packet. This function assumes the input packet has
+ * correct checksums, and doesn't update checksums for GSO segments.
+ * Furthermore, it doesn't process IP fragmented packets.
+ *
+ * @param pkt
+ * The packet mbuf to segment.
+ * @param gso_size
+ * The max length of a GSO segment, measured in bytes.
+ * @param direct_pool
+ * MBUF pool used for allocating direct buffers for output segments.
+ * @param indirect_pool
+ * MBUF pool used for allocating indirect buffers for output segments.
+ * @param pkts_out
+ * Pointer array, which is used to store mbuf addresses of GSO segments.
+ * Caller should guarantee that 'pkts_out' is sufficiently large to store
+ * all GSO segments.
+ * @param nb_pkts_out
+ * The max number of items that 'pkts_out' can keep.
+ *
+ * @return
+ * - The number of GSO segments on success.
+ * - Return 1 if no GSO is performed.
+ * - Return -ENOMEM if available memory in mempools is insufficient.
+ * - Return -EINVAL for invalid parameters.
+ */
+int gso_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+
+#endif
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index b81afce..fac95f2 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -31,17 +31,57 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include <rte_log.h>
+
#include "rte_gso.h"
+#include "gso_common.h"
+#include "gso_tcp.h"
int
rte_gso_segment(struct rte_mbuf *pkt,
struct rte_gso_ctx gso_ctx,
struct rte_mbuf **pkts_out,
- uint16_t nb_pkts_out __rte_unused)
+ uint16_t nb_pkts_out)
{
+ struct rte_mempool *direct_pool, *indirect_pool;
+ struct rte_mbuf *pkt_seg;
+ uint16_t nb_segments, gso_size;
+
if (pkt == NULL || pkts_out == NULL || gso_ctx.direct_pool ==
NULL || gso_ctx.indirect_pool == NULL)
return -EINVAL;
Probably we don't need to check gso_ctx values for each incoming packet.
If you feel it is necessary, create a new function rte_gso_ctx_check() that
could be called just once per ctx.
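Something along these lines, for illustration (hypothetical helper, not
in the patch):

#include <errno.h>

/* one-shot validation, run when the ctx is configured instead of
 * on every rte_gso_segment() call */
static inline int
rte_gso_ctx_check(const struct rte_gso_ctx *ctx)
{
	if (ctx == NULL || ctx->direct_pool == NULL ||
			ctx->indirect_pool == NULL ||
			ctx->gso_size == 0)
		return -EINVAL;
	return 0;
}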
- return 1;
+ if ((gso_ctx.gso_types & RTE_GSO_TCP_IPV4) == 0 ||
+ gso_ctx.gso_size >= pkt->pkt_len ||
+ gso_ctx.gso_size == 0)
The first and third conditions seem redundant.
+ return 1;
I think you forgot here:
pkts_out[0] = pkt;
+
+ pkt_seg = pkt;
+ gso_size = gso_ctx.gso_size;
+ direct_pool = gso_ctx.direct_pool;
+ indirect_pool = gso_ctx.indirect_pool;
+
+ /* Parse packet headers to determine how to segment 'pkt' */
+ gso_parse_packet(pkt);
I don't think we need to parse the packet here.
Instead, assume that the user has already filled the packet_type and l2/l3/l4_len fields correctly.
+
+ switch (pkt->packet_type) {
+ case ETHER_VLAN_IPv4_TCP_PKT:
+ case ETHER_IPv4_TCP_PKT:
+ nb_segments = gso_tcp4_segment(pkt, gso_size,
+ direct_pool, indirect_pool,
+ pkts_out, nb_pkts_out);
+ break;
+ default:
+ RTE_LOG(WARNING, GSO, "Unsupported packet type\n");
+ nb_segments = 1;
+ }
+
+ if (nb_segments > 1) {
+ while (pkt_seg) {
+ rte_mbuf_refcnt_update(pkt_seg, -1);
+ pkt_seg = pkt_seg->next;
+ }
+ }
+
+ return nb_segments;
}
diff --git a/lib/librte_gso/rte_gso.h b/lib/librte_gso/rte_gso.h
index 5a8389a..77853fa 100644
--- a/lib/librte_gso/rte_gso.h
+++ b/lib/librte_gso/rte_gso.h
@@ -46,6 +46,9 @@ extern "C" {
#include <stdint.h>
#include <rte_mbuf.h>
+#define RTE_GSO_TCP_IPV4 (1ULL << 0)
+/**< GSO flag for TCP/IPv4 packets (containing optional VLAN tag) */
+
/**
* GSO context structure.
*/
--
2.7.4
Jiayu Hu
2017-08-30 02:55:50 UTC
Permalink
Hi Konstantin,

Thanks for your important suggestions. My feedback is inline.
Post by Ananyev, Konstantin
-----Original Message-----
From: Hu, Jiayu
Sent: Thursday, August 24, 2017 3:16 PM
Subject: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
This patch adds GSO support for TCP/IPv4 packets. Supported packets
may include a single VLAN tag. TCP/IPv4 GSO assumes that all input
packets have correct checksums, and doesn't update checksums for output
packets (the responsibility for this lies with the application).
Additionally, TCP/IPv4 GSO doesn't process IP fragmented packets.
TCP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one indirect
MBUF, to organize an output packet. Note that we refer to these two
chained MBUFs as a two-segment MBUF. The direct MBUF stores the packet
header, while the indirect mbuf simply points to a location within the
original packet's payload. Consequently, use of the GSO library requires
multi-segment MBUF support in the TX functions of the NIC driver.
If a packet is GSOed, TCP/IPv4 GSO reduces its MBUF refcnt by 1. As a
result, when all of its GSOed segments are freed, the packet is freed
automatically.
---
lib/librte_eal/common/include/rte_log.h | 1 +
lib/librte_gso/Makefile | 2 +
lib/librte_gso/gso_common.c | 270 ++++++++++++++++++++++++++++++++
lib/librte_gso/gso_common.h | 120 ++++++++++++++
lib/librte_gso/gso_tcp.c | 82 ++++++++++
lib/librte_gso/gso_tcp.h | 73 +++++++++
lib/librte_gso/rte_gso.c | 44 +++++-
lib/librte_gso/rte_gso.h | 3 +
8 files changed, 593 insertions(+), 2 deletions(-)
create mode 100644 lib/librte_gso/gso_common.c
create mode 100644 lib/librte_gso/gso_common.h
create mode 100644 lib/librte_gso/gso_tcp.c
create mode 100644 lib/librte_gso/gso_tcp.h
diff --git a/lib/librte_eal/common/include/rte_log.h b/lib/librte_eal/common/include/rte_log.h
index ec8dba7..2fa1199 100644
--- a/lib/librte_eal/common/include/rte_log.h
+++ b/lib/librte_eal/common/include/rte_log.h
@@ -87,6 +87,7 @@ extern struct rte_logs rte_logs;
#define RTE_LOGTYPE_CRYPTODEV 17 /**< Log related to cryptodev. */
#define RTE_LOGTYPE_EFD 18 /**< Log related to EFD. */
#define RTE_LOGTYPE_EVENTDEV 19 /**< Log related to eventdev. */
+#define RTE_LOGTYPE_GSO 20 /**< Log related to GSO. */
/* these log types can be used in an application */
#define RTE_LOGTYPE_USER1 24 /**< User-defined log type 1. */
diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile
index aeaacbc..0f8e38f 100644
--- a/lib/librte_gso/Makefile
+++ b/lib/librte_gso/Makefile
@@ -42,6 +42,8 @@ LIBABIVER := 1
#source files
SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_common.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tcp.c
# install this header file
SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h
diff --git a/lib/librte_gso/gso_common.c b/lib/librte_gso/gso_common.c
new file mode 100644
index 0000000..2b54fbd
--- /dev/null
+++ b/lib/librte_gso/gso_common.c
@@ -0,0 +1,270 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdbool.h>
+#include <string.h>
+
+#include <rte_malloc.h>
+
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+
+#include "gso_common.h"
+
+static inline void
+hdr_segment_init(struct rte_mbuf *hdr_segment, struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset)
+{
+ /* copy mbuf metadata */
+ hdr_segment->nb_segs = 1;
+ hdr_segment->port = pkt->port;
+ hdr_segment->ol_flags = pkt->ol_flags;
+ hdr_segment->packet_type = pkt->packet_type;
+ hdr_segment->pkt_len = pkt_hdr_offset;
+ hdr_segment->data_len = pkt_hdr_offset;
+ hdr_segment->tx_offload = pkt->tx_offload;
+ /* copy packet header */
+ rte_memcpy(rte_pktmbuf_mtod(hdr_segment, char *),
+ rte_pktmbuf_mtod(pkt, char *),
+ pkt_hdr_offset);
+}
+
+static inline void
+free_gso_segment(struct rte_mbuf **pkts, uint16_t nb_pkts)
+{
+ uint16_t i;
+
+ for (i = 0; i < nb_pkts; i++) {
+ rte_pktmbuf_detach(pkts[i]->next);
I don't think you need to call detach() here explicitly.
Just rte_pktmbuf_free(pkts[i]) should do, I think.
Yes, rte_pktmbuf_free() is enough. I will modify it. Thanks.
Post by Ananyev, Konstantin
+ rte_pktmbuf_free(pkts[i]);
+ pkts[i] = NULL;
+ }
+}
+
+int
+gso_do_segment(struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset,
+ uint16_t pyld_unit_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ struct rte_mbuf *pkt_in;
+ struct rte_mbuf *hdr_segment, *pyld_segment;
+ uint32_t pkt_in_pyld_off;
+ uint16_t pkt_in_segment_len, pkt_out_segment_len;
+ uint16_t nb_segs;
+ bool pkt_in_segment_processed;
+
+ pkt_in_pyld_off = pkt->data_off + pkt_hdr_offset;
+ pkt_in = pkt;
+ nb_segs = 0;
+
+ while (pkt_in) {
+ pkt_in_segment_processed = false;
+ pkt_in_segment_len = pkt_in->data_off + pkt_in->data_len;
+
+ while (!pkt_in_segment_processed) {
+ if (unlikely(nb_segs >= nb_pkts_out)) {
+ free_gso_segment(pkts_out, nb_segs);
+ return -EINVAL;
+ }
+
+ /* allocate direct mbuf */
+ hdr_segment = rte_pktmbuf_alloc(direct_pool);
+ if (unlikely(hdr_segment == NULL)) {
+ free_gso_segment(pkts_out, nb_segs);
+ return -ENOMEM;
+ }
+
+ /* allocate indirect mbuf */
+ pyld_segment = rte_pktmbuf_alloc(indirect_pool);
+ if (unlikely(pyld_segment == NULL)) {
+ rte_pktmbuf_free(hdr_segment);
+ free_gso_segment(pkts_out, nb_segs);
+ return -ENOMEM;
+ }
So, if I understand correctly, each new packet would always contain just one data segment?
Why couldn't several data segments be chained together (if the sum of their data_len <= mss)?
In the same way as done here:
http://dpdk.org/browse/dpdk/tree/lib/librte_ip_frag/rte_ipv4_fragmentation.c#n93
https://gerrit.fd.io/r/gitweb?p=tldk.git;a=blob;f=lib/libtle_l4p/tcp_tx_seg.h;h=a8d2425597a7ad6f598aa4bb7fcd7f1da74305f0;hb=HEAD#l23
?
Oh, yes. I can chain these data segments when their total length is less than the GSO segsz.
I will change it in the next patch. Thanks very much.
Post by Ananyev, Konstantin
+
+ /* copy packet header */
+ hdr_segment_init(hdr_segment, pkt, pkt_hdr_offset);
+
+ /* attach payload mbuf to current packet segment */
+ rte_pktmbuf_attach(pyld_segment, pkt_in);
+
+ hdr_segment->next = pyld_segment;
+ pkts_out[nb_segs++] = hdr_segment;
+
+ /* calculate payload length */
+ pkt_out_segment_len = pyld_unit_size;
+ if (pkt_in_pyld_off + pkt_out_segment_len >
+ pkt_in_segment_len) {
+ pkt_out_segment_len = pkt_in_segment_len -
+ pkt_in_pyld_off;
+ }
+
+ /* update payload segment */
+ pyld_segment->data_off = pkt_in_pyld_off;
+ pyld_segment->data_len = pkt_out_segment_len;
+
+ /* update header segment */
+ hdr_segment->pkt_len += pyld_segment->data_len;
+ hdr_segment->nb_segs++;
+
+ /* update pkt_in_pyld_off */
+ pkt_in_pyld_off += pkt_out_segment_len;
+ if (pkt_in_pyld_off == pkt_in_segment_len)
+ pkt_in_segment_processed = true;
+ }
+
+ /* 'pkt_in' may contain numerous segments */
+ pkt_in = pkt_in->next;
+ if (pkt_in != NULL)
+ pkt_in_pyld_off = pkt_in->data_off;
+ }
+ return nb_segs;
+}
+
+static inline void
+parse_ipv4(struct ipv4_hdr *ipv4_hdr, struct rte_mbuf *pkt)
+{
+ struct tcp_hdr *tcp_hdr;
+
+ switch (ipv4_hdr->next_proto_id) {
+ case IPPROTO_TCP:
+ pkt->packet_type |= RTE_PTYPE_L4_TCP;
+ pkt->l3_len = IPv4_HDR_LEN(ipv4_hdr);
+ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
+ pkt->l4_len = TCP_HDR_LEN(tcp_hdr);
+ break;
+ }
+}
+
+static inline void
+parse_ethernet(struct ether_hdr *eth_hdr, struct rte_mbuf *pkt)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ struct vlan_hdr *vlan_hdr;
+ uint16_t ethertype;
+
+ ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
+ if (ethertype == ETHER_TYPE_VLAN) {
+ vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
+ pkt->l2_len = sizeof(struct vlan_hdr);
+ pkt->packet_type |= RTE_PTYPE_L2_ETHER_VLAN;
+ ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
+ }
+
+ switch (ethertype) {
+ case ETHER_TYPE_IPv4:
+ if (IS_VLAN_PKT(pkt)) {
+ pkt->packet_type |= RTE_PTYPE_L3_IPV4;
+ } else {
+ pkt->packet_type |= RTE_PTYPE_L2_ETHER;
+ pkt->packet_type |= RTE_PTYPE_L3_IPV4;
+ }
+ pkt->l2_len += sizeof(struct ether_hdr);
+ ipv4_hdr = (struct ipv4_hdr *) ((char *)eth_hdr +
+ pkt->l2_len);
+ parse_ipv4(ipv4_hdr, pkt);
+ break;
+ }
+}
+
+void
+gso_parse_packet(struct rte_mbuf *pkt)
There is a function rte_net_get_ptype() that is supposed to provide similar functionality.
So we probably don't need to create a new SW parse function here; it would be better
to reuse (and update if needed) the existing one.
Also, the user might already have the l2/l3/l4_len fields and packet_type set up.
So it's better to keep SW packet parsing out of the scope of this library.
Hmm, I know we have discussed this design choice in the GRO library, and I also think it's
better to reuse these values.

But from the perspective of OVS, it may add extra overhead, since OVS doesn't parse every
packet.
Post by Ananyev, Konstantin
+{
+ struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+
+ pkt->packet_type = pkt->tx_offload = 0;
+ parse_ethernet(eth_hdr, pkt);
+}
+
+static inline void
+update_ipv4_header(char *base, uint16_t offset, uint16_t length, uint16_t id)
+{
+ struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)(base + offset);
+
+ ipv4_hdr->total_length = rte_cpu_to_be_16(length - offset);
+ ipv4_hdr->packet_id = rte_cpu_to_be_16(id);
+}
+
+static inline void
+update_tcp_header(char *base, uint16_t offset, uint32_t sent_seq,
+ uint8_t non_tail)
+{
+ struct tcp_hdr *tcp_hdr = (struct tcp_hdr *)(base + offset);
+
+ tcp_hdr->sent_seq = rte_cpu_to_be_32(sent_seq);
+ /* clean FIN and PSH for non-tail segments */
+ if (non_tail)
+ tcp_hdr->tcp_flags &= (~(TCP_HDR_PSH_MASK | TCP_HDR_FIN_MASK));
+}
+
+void
+gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
+ struct rte_mbuf **out_segments)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ struct tcp_hdr *tcp_hdr;
+ struct rte_mbuf *seg;
+ uint32_t sent_seq;
+ uint16_t offset, i;
+ uint16_t tail_seg_idx = nb_segments - 1, id;
+
+ switch (pkt->packet_type) {
It might be worth putting the code below in a separate function:
update_inner_tcp_hdr(..) or so.
Then you can reuse it for tunneled cases too.
Yes, I will modify it in the next patch. Thanks.
Post by Ananyev, Konstantin
+ case ETHER_VLAN_IPv4_TCP_PKT:
+ case ETHER_IPv4_TCP_PKT:
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ pkt->l2_len);
+ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
+ id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
+ sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
+
+ for (i = 0; i < nb_segments; i++) {
+ seg = out_segments[i];
+
+ offset = seg->l2_len;
+ update_ipv4_header(rte_pktmbuf_mtod(seg, char *),
+ offset, seg->pkt_len, id);
+ id++;
Who would be responsible for making sure that we don't have consecutive packets with the same IPv4 id?
Would it be the upper layer that forms the packet, or the GSO library, or ...?
Oh yes, I ignored this important issue. I don't think applications can guarantee it.
I will check the design of Linux and try to figure out a way. Thanks for the reminder.
Post by Ananyev, Konstantin
+
+ offset += seg->l3_len;
+ update_tcp_header(rte_pktmbuf_mtod(seg, char *),
+ offset, sent_seq, i < tail_seg_idx);
+ sent_seq += seg->next->data_len;
+ }
+ break;
+ }
+}
diff --git a/lib/librte_gso/gso_common.h b/lib/librte_gso/gso_common.h
new file mode 100644
index 0000000..d750041
--- /dev/null
+++ b/lib/librte_gso/gso_common.h
@@ -0,0 +1,120 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_COMMON_H_
+#define _GSO_COMMON_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+#define IPV4_HDR_DF_SHIFT 14
+#define IPV4_HDR_DF_MASK (1 << IPV4_HDR_DF_SHIFT)
+#define IPv4_HDR_LEN(iph) ((iph->version_ihl & 0x0f) * 4)
+
+#define TCP_HDR_PSH_MASK ((uint8_t)0x08)
+#define TCP_HDR_FIN_MASK ((uint8_t)0x01)
+#define TCP_HDR_LEN(tcph) ((tcph->data_off & 0xf0) >> 2)
+
+#define ETHER_IPv4_PKT (RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4)
+/* Supported packet types */
+/* TCP/IPv4 packet. */
+#define ETHER_IPv4_TCP_PKT (ETHER_IPv4_PKT | RTE_PTYPE_L4_TCP)
+
+/* TCP/IPv4 packet with VLAN tag. */
+#define ETHER_VLAN_IPv4_TCP_PKT (RTE_PTYPE_L2_ETHER_VLAN | \
+ RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP)
+
+#define IS_VLAN_PKT(pkt) ((pkt->packet_type & RTE_PTYPE_L2_ETHER_VLAN) == \
+ RTE_PTYPE_L2_ETHER_VLAN)
+
+/**
+ * Internal function which parses a packet, setting outer_l2/l3_len and
+ * l2/l3/l4_len and packet_type.
+ *
+ * @param pkt
+ * Packet to parse.
+ */
+void gso_parse_packet(struct rte_mbuf *pkt);
+
+/**
+ * Internal function which updates relevant packet headers, following
+ * segmentation. This is required to update, for example, the IPv4
+ * 'total_length' field, to reflect the reduced length of the now-
+ * segmented packet.
+ *
+ * @param pkt
+ * The original packet.
+ * @param nb_segments
+ * The number of GSO segments into which pkt was split.
+ * @param out_segments
+ * Pointer array used for storing mbuf addresses for GSO segments.
+ */
+void gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
+ struct rte_mbuf **out_segments);
+
+/**
+ * Internal function which divides the input packet into small segments.
+ * Each of the newly-created segments is organized as a two-segment mbuf,
+ * where the first segment is a standard mbuf, which stores a copy of
+ * packet header, and the second is an indirect mbuf which points to a
+ * section of data in the input packet.
+ *
+ * @param pkt
+ * Packet to segment.
+ * @param pkt_hdr_offset
+ * Packet header offset, measured in bytes.
+ * @param pyld_unit_size
+ * The max payload length of a GSO segment.
+ * @param direct_pool
+ * MBUF pool used for allocating direct buffers for output segments.
+ * @param indirect_pool
+ * MBUF pool used for allocating indirect buffers for output segments.
+ * @param pkts_out
+ * Pointer array used to keep the mbuf addresses of output segments.
+ * @param nb_pkts_out
+ * The max number of items that pkts_out can keep.
+ *
+ * @return
+ * - The number of segments created in the event of success.
+ * - If no GSO is performed, return 1.
+ * - If available memory in mempools is insufficient, return -ENOMEM.
+ * - -EINVAL for invalid parameters
+ */
+int gso_do_segment(struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset,
+ uint16_t pyld_unit_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+#endif
diff --git a/lib/librte_gso/gso_tcp.c b/lib/librte_gso/gso_tcp.c
new file mode 100644
index 0000000..9d5fc30
--- /dev/null
+++ b/lib/librte_gso/gso_tcp.c
@@ -0,0 +1,82 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+
+#include "gso_common.h"
+#include "gso_tcp.h"
+
+int
+gso_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ struct ether_hdr *eth_hdr;
+ struct ipv4_hdr *ipv4_hdr;
+ uint16_t tcp_dl;
+ uint16_t pyld_unit_size;
+ uint16_t hdr_offset;
+ int ret = 1;
+
+ eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+ ipv4_hdr = (struct ipv4_hdr *)((char *)eth_hdr + pkt->l2_len);
+
+ /* don't process fragmented packet */
+ if ((ipv4_hdr->fragment_offset &
+ rte_cpu_to_be_16(IPV4_HDR_DF_MASK)) == 0)
+ return ret;
+
+ tcp_dl = rte_be_to_cpu_16(ipv4_hdr->total_length) -
+ pkt->l3_len - pkt->l4_len;
+ /* don't process packet without data */
+ if (tcp_dl == 0)
+ return ret;
+
+ hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
+
+ /* segment the payload */
+ ret = gso_do_segment(pkt, hdr_offset, pyld_unit_size, direct_pool,
+ indirect_pool, pkts_out, nb_pkts_out);
+
+ if (ret > 1)
+ gso_update_pkt_headers(pkt, ret, pkts_out);
+
+ return ret;
+}
diff --git a/lib/librte_gso/gso_tcp.h b/lib/librte_gso/gso_tcp.h
new file mode 100644
index 0000000..f291ccb
--- /dev/null
+++ b/lib/librte_gso/gso_tcp.h
@@ -0,0 +1,73 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_TCP_H_
+#define _GSO_TCP_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+/**
+ * Segment an IPv4/TCP packet. This function assumes the input packet has
+ * correct checksums, and doesn't update checksums for GSO segments.
+ * Furthermore, it doesn't process IP fragmented packets.
+ *
+ * @param pkt
+ * The packet mbuf to segment.
+ * @param gso_size
+ * The max length of a GSO segment, measured in bytes.
+ * @param direct_pool
+ * MBUF pool used for allocating direct buffers for output segments.
+ * @param indirect_pool
+ * MBUF pool used for allocating indirect buffers for output segments.
+ * @param pkts_out
+ * Pointer array, which is used to store mbuf addresses of GSO segments.
+ * Caller should guarantee that 'pkts_out' is sufficiently large to store
+ * all GSO segments.
+ * @param nb_pkts_out
+ * The max number of items that 'pkts_out' can keep.
+ *
+ * @return
+ * - The number of GSO segments on success.
+ * - Return 1 if no GSO is performed.
+ * - Return -ENOMEM if available memory in mempools is insufficient.
+ * - Return -EINVAL for invalid parameters.
+ */
+int gso_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+
+#endif
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index b81afce..fac95f2 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -31,17 +31,57 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include <rte_log.h>
+
#include "rte_gso.h"
+#include "gso_common.h"
+#include "gso_tcp.h"
int
rte_gso_segment(struct rte_mbuf *pkt,
struct rte_gso_ctx gso_ctx,
struct rte_mbuf **pkts_out,
- uint16_t nb_pkts_out __rte_unused)
+ uint16_t nb_pkts_out)
{
+ struct rte_mempool *direct_pool, *indirect_pool;
+ struct rte_mbuf *pkt_seg;
+ uint16_t nb_segments, gso_size;
+
if (pkt == NULL || pkts_out == NULL || gso_ctx.direct_pool ==
NULL || gso_ctx.indirect_pool == NULL)
return -EINVAL;
Probably we don't need to check gso_ctx values for each incoming packet.
If you feel it is necessary, create a new function rte_gso_ctx_check() that
could be called just once per ctx.
Agree. I will change it. Thanks.
Post by Ananyev, Konstantin
- return 1;
+ if ((gso_ctx.gso_types & RTE_GSO_TCP_IPV4) == 0 ||
+ gso_ctx.gso_size >= pkt->pkt_len ||
+ gso_ctx.gso_size == 0)
The first and third conditions seem redundant.
The reason to check gso_ctx.gso_types here is that we don't perform
GSO if applications don't set RTE_GSO_TCP_IPV4 in gso_ctx.gso_types,
even if the input packet is TCP/IPv4. And if gso_ctx.gso_size is 0,
we don't need to execute the following code. So do we still need to
remove these two conditions?
Post by Ananyev, Konstantin
+ return 1;
pkts_out[0] = pkt;
But why should we keep the input packet in the output array? Currently, if
GSO is not performed, no packets will be kept in pkts_out[]. Applications
can detect this from the return value of 1 from rte_gso_segment().
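For illustration, a caller under the current semantics would do
something like this (sketch; MAX_GSO_SEGS is an assumed
application-side define):

struct rte_mbuf *segs[MAX_GSO_SEGS];
int nb = rte_gso_segment(pkt, gso_ctx, segs, MAX_GSO_SEGS);

if (nb > 1) {
	/* segmented: transmit segs[0..nb-1]; 'pkt' is freed via refcnt */
} else if (nb == 1) {
	/* no GSO performed: the original 'pkt' is still the one to send */
} else {
	/* nb < 0: -EINVAL or -ENOMEM */
}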
Post by Ananyev, Konstantin
+
+ pkt_seg = pkt;
+ gso_size = gso_ctx.gso_size;
+ direct_pool = gso_ctx.direct_pool;
+ indirect_pool = gso_ctx.indirect_pool;
+
+ /* Parse packet headers to determine how to segment 'pkt' */
+ gso_parse_packet(pkt);
I don't think we need to parse the packet here.
Instead, assume that the user has already filled the packet_type and l2/l3/l4_len fields correctly.
Hmm, I see it. Thanks.

Thanks,
Jiayu
Kavanagh, Mark B
2017-08-30 09:25:38 UTC
Permalink
From: Hu, Jiayu
Sent: Wednesday, August 30, 2017 3:56 AM
Subject: Re: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
Hi Konstantin,
Thanks for your important suggestions. My feedback is inline.
Post by Ananyev, Konstantin
-----Original Message-----
From: Hu, Jiayu
Sent: Thursday, August 24, 2017 3:16 PM
Subject: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
This patch adds GSO support for TCP/IPv4 packets. Supported packets
may include a single VLAN tag. TCP/IPv4 GSO assumes that all input
packets have correct checksums, and doesn't update checksums for output
packets (the responsibility for this lies with the application).
Additionally, TCP/IPv4 GSO doesn't process IP fragmented packets.
TCP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one indirect
MBUF, to organize an output packet. Note that we refer to these two
chained MBUFs as a two-segment MBUF. The direct MBUF stores the packet
header, while the indirect mbuf simply points to a location within the
original packet's payload. Consequently, use of the GSO library requires
multi-segment MBUF support in the TX functions of the NIC driver.
If a packet is GSOed, TCP/IPv4 GSO reduces its MBUF refcnt by 1. As a
result, when all of its GSOed segments are freed, the packet is freed
automatically.
---
lib/librte_eal/common/include/rte_log.h | 1 +
lib/librte_gso/Makefile | 2 +
lib/librte_gso/gso_common.c | 270 ++++++++++++++++++++
lib/librte_gso/gso_common.h | 120 ++++++++++++++
lib/librte_gso/gso_tcp.c | 82 ++++++++++
lib/librte_gso/gso_tcp.h | 73 +++++++++
lib/librte_gso/rte_gso.c | 44 +++++-
lib/librte_gso/rte_gso.h | 3 +
8 files changed, 593 insertions(+), 2 deletions(-)
create mode 100644 lib/librte_gso/gso_common.c
create mode 100644 lib/librte_gso/gso_common.h
create mode 100644 lib/librte_gso/gso_tcp.c
create mode 100644 lib/librte_gso/gso_tcp.h
diff --git a/lib/librte_eal/common/include/rte_log.h b/lib/librte_eal/common/include/rte_log.h
index ec8dba7..2fa1199 100644
--- a/lib/librte_eal/common/include/rte_log.h
+++ b/lib/librte_eal/common/include/rte_log.h
@@ -87,6 +87,7 @@ extern struct rte_logs rte_logs;
#define RTE_LOGTYPE_CRYPTODEV 17 /**< Log related to cryptodev. */
#define RTE_LOGTYPE_EFD 18 /**< Log related to EFD. */
#define RTE_LOGTYPE_EVENTDEV 19 /**< Log related to eventdev. */
+#define RTE_LOGTYPE_GSO 20 /**< Log related to GSO. */
/* these log types can be used in an application */
#define RTE_LOGTYPE_USER1 24 /**< User-defined log type 1. */
diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile
index aeaacbc..0f8e38f 100644
--- a/lib/librte_gso/Makefile
+++ b/lib/librte_gso/Makefile
@@ -42,6 +42,8 @@ LIBABIVER := 1
#source files
SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_common.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tcp.c
# install this header file
SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h
diff --git a/lib/librte_gso/gso_common.c b/lib/librte_gso/gso_common.c
new file mode 100644
index 0000000..2b54fbd
--- /dev/null
+++ b/lib/librte_gso/gso_common.c
@@ -0,0 +1,270 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdbool.h>
+#include <string.h>
+
+#include <rte_malloc.h>
+
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+
+#include "gso_common.h"
+
+static inline void
+hdr_segment_init(struct rte_mbuf *hdr_segment, struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset)
+{
+ /* copy mbuf metadata */
+ hdr_segment->nb_segs = 1;
+ hdr_segment->port = pkt->port;
+ hdr_segment->ol_flags = pkt->ol_flags;
+ hdr_segment->packet_type = pkt->packet_type;
+ hdr_segment->pkt_len = pkt_hdr_offset;
+ hdr_segment->data_len = pkt_hdr_offset;
+ hdr_segment->tx_offload = pkt->tx_offload;
+ /* copy packet header */
+ rte_memcpy(rte_pktmbuf_mtod(hdr_segment, char *),
+ rte_pktmbuf_mtod(pkt, char *),
+ pkt_hdr_offset);
+}
+
+static inline void
+free_gso_segment(struct rte_mbuf **pkts, uint16_t nb_pkts)
+{
+ uint16_t i;
+
+ for (i = 0; i < nb_pkts; i++) {
+ rte_pktmbuf_detach(pkts[i]->next);
I don't think you need to call detach() here explicitly.
Just rte_pktmbuf_free(pkts[i]) should do, I think.
Yes, rte_pktmbuf_free() is enough. I will modify it. Thanks.
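For reference, rte_pktmbuf_free() walks the whole segment chain and detaches indirect mbufs as it goes, so the helper can plausibly shrink to this sketch:

static inline void
free_gso_segment(struct rte_mbuf **pkts, uint16_t nb_pkts)
{
	uint16_t i;

	for (i = 0; i < nb_pkts; i++) {
		/* frees both the direct header mbuf and its
		 * attached indirect payload mbuf */
		rte_pktmbuf_free(pkts[i]);
		pkts[i] = NULL;
	}
}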
Post by Ananyev, Konstantin
+ rte_pktmbuf_free(pkts[i]);
+ pkts[i] = NULL;
+ }
+}
+
+int
+gso_do_segment(struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset,
+ uint16_t pyld_unit_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ struct rte_mbuf *pkt_in;
+ struct rte_mbuf *hdr_segment, *pyld_segment;
+ uint32_t pkt_in_pyld_off;
+ uint16_t pkt_in_segment_len, pkt_out_segment_len;
+ uint16_t nb_segs;
+ bool pkt_in_segment_processed;
+
+ pkt_in_pyld_off = pkt->data_off + pkt_hdr_offset;
+ pkt_in = pkt;
+ nb_segs = 0;
+
+ while (pkt_in) {
+ pkt_in_segment_processed = false;
+ pkt_in_segment_len = pkt_in->data_off + pkt_in->data_len;
+
+ while (!pkt_in_segment_processed) {
+ if (unlikely(nb_segs >= nb_pkts_out)) {
+ free_gso_segment(pkts_out, nb_segs);
+ return -EINVAL;
+ }
+
+ /* allocate direct mbuf */
+ hdr_segment = rte_pktmbuf_alloc(direct_pool);
+ if (unlikely(hdr_segment == NULL)) {
+ free_gso_segment(pkts_out, nb_segs);
+ return -ENOMEM;
+ }
+
+ /* allocate indirect mbuf */
+ pyld_segment = rte_pktmbuf_alloc(indirect_pool);
+ if (unlikely(pyld_segment == NULL)) {
+ rte_pktmbuf_free(hdr_segment);
+ free_gso_segment(pkts_out, nb_segs);
+ return -ENOMEM;
+ }
So, if I understand correctly, each new packet would always contain just one
data segment?
Why couldn't several data segments be chained together (if the sum of their
data_len <= mss)?
http://dpdk.org/browse/dpdk/tree/lib/librte_ip_frag/rte_ipv4_fragmentation.c#n93
https://gerrit.fd.io/r/gitweb?p=tldk.git;a=blob;f=lib/libtle_l4p/tcp_tx_seg.h;h=a8d2425597a7ad6f598aa4bb7fcd7f1da74305f0;hb=HEAD#l23
Oh, yes. I can chain these data segments when their total length is less than the GSO segsz.
I will change it in the next patch. Thanks very much.
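A sketch of the chaining idea being agreed here, in the spirit of the rte_ipv4_fragmentation.c code linked above (in_off, pyld_left and the pool variables are illustrative):

/* chain indirect mbufs under one header mbuf until the payload
 * budget (mss) of this output packet is used up */
pyld_left = mss;
last = hdr;
while (pyld_left > 0 && pkt_in != NULL) {
	pyld = rte_pktmbuf_alloc(indirect_pool);
	rte_pktmbuf_attach(pyld, pkt_in);
	pyld->data_off = in_off;
	pyld->data_len = RTE_MIN(pyld_left, (uint16_t)
			(pkt_in->data_len - (in_off - pkt_in->data_off)));
	last->next = pyld;
	last = pyld;
	hdr->nb_segs++;
	hdr->pkt_len += pyld->data_len;
	pyld_left -= pyld->data_len;
	in_off += pyld->data_len;
	/* input segment exhausted: move to the next one */
	if (in_off == pkt_in->data_off + pkt_in->data_len) {
		pkt_in = pkt_in->next;
		if (pkt_in != NULL)
			in_off = pkt_in->data_off;
	}
}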
Post by Ananyev, Konstantin
+
+ /* copy packet header */
+ hdr_segment_init(hdr_segment, pkt, pkt_hdr_offset);
+
+ /* attach payload mbuf to current packet segment */
+ rte_pktmbuf_attach(pyld_segment, pkt_in);
+
+ hdr_segment->next = pyld_segment;
+ pkts_out[nb_segs++] = hdr_segment;
+
+ /* calculate payload length */
+ pkt_out_segment_len = pyld_unit_size;
+ if (pkt_in_pyld_off + pkt_out_segment_len >
+ pkt_in_segment_len) {
+ pkt_out_segment_len = pkt_in_segment_len -
+ pkt_in_pyld_off;
+ }
+
+ /* update payload segment */
+ pyld_segment->data_off = pkt_in_pyld_off;
+ pyld_segment->data_len = pkt_out_segment_len;
+
+ /* update header segment */
+ hdr_segment->pkt_len += pyld_segment->data_len;
+ hdr_segment->nb_segs++;
+
+ /* update pkt_in_pyld_off */
+ pkt_in_pyld_off += pkt_out_segment_len;
+ if (pkt_in_pyld_off == pkt_in_segment_len)
+ pkt_in_segment_processed = true;
+ }
+
+ /* 'pkt_in' may contain numerous segments */
+ pkt_in = pkt_in->next;
+ if (pkt_in != NULL)
+ pkt_in_pyld_off = pkt_in->data_off;
+ }
+ return nb_segs;
+}
+
+static inline void
+parse_ipv4(struct ipv4_hdr *ipv4_hdr, struct rte_mbuf *pkt)
+{
+ struct tcp_hdr *tcp_hdr;
+
+ switch (ipv4_hdr->next_proto_id) {
+ case IPPROTO_TCP:
+ pkt->packet_type |= RTE_PTYPE_L4_TCP;
+ pkt->l3_len = IPv4_HDR_LEN(ipv4_hdr);
+ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
+ pkt->l4_len = TCP_HDR_LEN(tcp_hdr);
+ break;
+ }
+}
+
+static inline void
+parse_ethernet(struct ether_hdr *eth_hdr, struct rte_mbuf *pkt)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ struct vlan_hdr *vlan_hdr;
+ uint16_t ethertype;
+
+ ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
+ if (ethertype == ETHER_TYPE_VLAN) {
+ vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
+ pkt->l2_len = sizeof(struct vlan_hdr);
+ pkt->packet_type |= RTE_PTYPE_L2_ETHER_VLAN;
+ ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
+ }
+
+ switch (ethertype) {
+ case ETHER_TYPE_IPv4:
+ if (IS_VLAN_PKT(pkt)) {
+ pkt->packet_type |= RTE_PTYPE_L3_IPV4;
+ } else {
+ pkt->packet_type |= RTE_PTYPE_L2_ETHER;
+ pkt->packet_type |= RTE_PTYPE_L3_IPV4;
+ }
+ pkt->l2_len += sizeof(struct ether_hdr);
+ ipv4_hdr = (struct ipv4_hdr *) ((char *)eth_hdr +
+ pkt->l2_len);
+ parse_ipv4(ipv4_hdr, pkt);
+ break;
+ }
+}
+
+void
+gso_parse_packet(struct rte_mbuf *pkt)
There is a function rte_net_get_ptype() that is supposed to provide similar
functionality.
So we probably don't need to create a new SW parse function here; instead it
would be better to reuse (and update if needed) an existing one.
Again, the user might already have the l2/l3/l4.../_len and packet_type fields set up.
So it's better to keep SW packet parsing out of scope of that library.
Hmm, I know we have discussed this design choice in the GRO library, and I also think it's
better to reuse these values.
But from the perspective of OVS, it may add extra overhead, since OVS doesn't parse every packet.
Hi Jiayu, Konstantin

For GSO, the application needs to know:
- the packet type (as it only currently supports TCP/IPv4, VxLAN, GRE packets)
- the l2/3/4_lens, etc. (in order to replicate the original packet's headers across outgoing segments)

For this, we can use the rte_net_get_ptype function, as per Konstantin's suggestion, as it provides both - thanks Konstantin!

WRT the extra overhead in OvS: TSO is the de facto standard, and GSO is provided purely as a fallback option. As such, and since the additional packet parsing is a necessity in order to facilitate GSO, the additional overhead is IMO acceptable.

Thanks,
Mark
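rte_net_get_ptype() (librte_net) returns the packet type and fills a struct rte_net_hdr_lens with the individual header lengths, so the caller-side setup could look roughly like this sketch:

#include <rte_net.h>

struct rte_net_hdr_lens hdr_lens;

pkt->packet_type = rte_net_get_ptype(pkt, &hdr_lens, RTE_PTYPE_ALL_MASK);
pkt->l2_len = hdr_lens.l2_len;
pkt->l3_len = hdr_lens.l3_len;
pkt->l4_len = hdr_lens.l4_len;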
Post by Ananyev, Konstantin
+{
+ struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+
+ pkt->packet_type = pkt->tx_offload = 0;
+ parse_ethernet(eth_hdr, pkt);
+}
+
+static inline void
+update_ipv4_header(char *base, uint16_t offset, uint16_t length, uint16_t id)
+{
+ struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)(base + offset);
+
+ ipv4_hdr->total_length = rte_cpu_to_be_16(length - offset);
+ ipv4_hdr->packet_id = rte_cpu_to_be_16(id);
+}
+
+static inline void
+update_tcp_header(char *base, uint16_t offset, uint32_t sent_seq,
+ uint8_t non_tail)
+{
+ struct tcp_hdr *tcp_hdr = (struct tcp_hdr *)(base + offset);
+
+ tcp_hdr->sent_seq = rte_cpu_to_be_32(sent_seq);
+ /* clean FIN and PSH for non-tail segments */
+ if (non_tail)
+ tcp_hdr->tcp_flags &= (~(TCP_HDR_PSH_MASK | TCP_HDR_FIN_MASK));
+}
+
+void
+gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
+ struct rte_mbuf **out_segments)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ struct tcp_hdr *tcp_hdr;
+ struct rte_mbuf *seg;
+ uint32_t sent_seq;
+ uint16_t offset, i;
+ uint16_t tail_seg_idx = nb_segments - 1, id;
+
+ switch (pkt->packet_type) {
update_inner_tcp_hdr(..) or so.
Then you can reuse it for tunneled cases too.
Yes, I will modify it in the next patch. Thanks.
Post by Ananyev, Konstantin
+ case ETHER_VLAN_IPv4_TCP_PKT:
+ case ETHER_IPv4_TCP_PKT:
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ pkt->l2_len);
+ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
+ id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
+ sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
+
+ for (i = 0; i < nb_segments; i++) {
+ seg = out_segments[i];
+
+ offset = seg->l2_len;
+ update_ipv4_header(rte_pktmbuf_mtod(seg, char *),
+ offset, seg->pkt_len, id);
+ id++;
Who would be responsible to make sure that we wouldn't have consecutive
packets with the same IPv4 id?
Would it be the upper layer that forms the packet, or the gso library, or ...?
Oh yes, I ignored this important issue. I don't think applications can guarantee it.
I will check the design of Linux and try to figure out a way. Thanks for the reminder.
Post by Ananyev, Konstantin
+
+ offset += seg->l3_len;
+ update_tcp_header(rte_pktmbuf_mtod(seg, char *),
+ offset, sent_seq, i < tail_seg_idx);
+ sent_seq += seg->next->data_len;
+ }
+ break;
+ }
+}
diff --git a/lib/librte_gso/gso_common.h b/lib/librte_gso/gso_common.h
new file mode 100644
index 0000000..d750041
--- /dev/null
+++ b/lib/librte_gso/gso_common.h
@@ -0,0 +1,120 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_COMMON_H_
+#define _GSO_COMMON_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+#define IPV4_HDR_DF_SHIFT 14
+#define IPV4_HDR_DF_MASK (1 << IPV4_HDR_DF_SHIFT)
+#define IPv4_HDR_LEN(iph) ((iph->version_ihl & 0x0f) * 4)
+
+#define TCP_HDR_PSH_MASK ((uint8_t)0x08)
+#define TCP_HDR_FIN_MASK ((uint8_t)0x01)
+#define TCP_HDR_LEN(tcph) ((tcph->data_off & 0xf0) >> 2)
+
+#define ETHER_IPv4_PKT (RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4)
+/* Supported packet types */
+/* TCP/IPv4 packet. */
+#define ETHER_IPv4_TCP_PKT (ETHER_IPv4_PKT | RTE_PTYPE_L4_TCP)
+
+/* TCP/IPv4 packet with VLAN tag. */
+#define ETHER_VLAN_IPv4_TCP_PKT (RTE_PTYPE_L2_ETHER_VLAN | \
+ RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP)
+
+#define IS_VLAN_PKT(pkt) ((pkt->packet_type & RTE_PTYPE_L2_ETHER_VLAN) == \
+ RTE_PTYPE_L2_ETHER_VLAN)
+
+/**
+ * Internal function which parses a packet, setting outer_l2/l3_len and
+ * l2/l3/l4_len and packet_type.
+ *
+ * Packet to parse.
+ */
+void gso_parse_packet(struct rte_mbuf *pkt);
+
+/**
+ * Internal function which updates relevant packet headers, following
+ * segmentation. This is required to update, for example, the IPv4
+ * 'total_length' field, to reflect the reduced length of the now-
+ * segmented packet.
+ *
+ * The original packet.
+ * The number of GSO segments into which pkt was split.
+ * Pointer array used for storing mbuf addresses for GSO segments.
+ */
+void gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
+ struct rte_mbuf **out_segments);
+
+/**
+ * Internal function which divides the input packet into small segments.
+ * Each of the newly-created segments is organized as a two-segment mbuf,
+ * where the first segment is a standard mbuf, which stores a copy of
+ * packet header, and the second is an indirect mbuf which points to a
+ * section of data in the input packet.
+ *
+ * Packet to segment.
+ * Packet header offset, measured in byte.
+ * The max payload length of a GSO segment.
+ * MBUF pool used for allocating direct buffers for output segments.
+ * MBUF pool used for allocating indirect buffers for output segments.
+ * Pointer array used to keep the mbuf addresses of output segments.
+ * The max number of items that pkts_out can keep.
+ *
+ * - The number of segments created in the event of success.
+ * - If no GSO is performed, return 1.
+ * - If available memory in mempools is insufficient, return -ENOMEM.
+ * - -EINVAL for invalid parameters
+ */
+int gso_do_segment(struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset,
+ uint16_t pyld_unit_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+#endif
diff --git a/lib/librte_gso/gso_tcp.c b/lib/librte_gso/gso_tcp.c
new file mode 100644
index 0000000..9d5fc30
--- /dev/null
+++ b/lib/librte_gso/gso_tcp.c
@@ -0,0 +1,82 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+
+#include "gso_common.h"
+#include "gso_tcp.h"
+
+int
+gso_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ struct ether_hdr *eth_hdr;
+ struct ipv4_hdr *ipv4_hdr;
+ uint16_t tcp_dl;
+ uint16_t pyld_unit_size;
+ uint16_t hdr_offset;
+ int ret = 1;
+
+ eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+ ipv4_hdr = (struct ipv4_hdr *)((char *)eth_hdr + pkt->l2_len);
+
+ /* don't process fragmented packet */
+ if ((ipv4_hdr->fragment_offset &
+ rte_cpu_to_be_16(IPV4_HDR_DF_MASK)) == 0)
+ return ret;
+
+ tcp_dl = rte_be_to_cpu_16(ipv4_hdr->total_length) -
+ pkt->l3_len - pkt->l4_len;
+ /* don't process packet without data */
+ if (tcp_dl == 0)
+ return ret;
+
+ hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
+
+ /* segment the payload */
+ ret = gso_do_segment(pkt, hdr_offset, pyld_unit_size, direct_pool,
+ indirect_pool, pkts_out, nb_pkts_out);
+
+ if (ret > 1)
+ gso_update_pkt_headers(pkt, ret, pkts_out);
+
+ return ret;
+}
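As a worked example of the pyld_unit_size computation above: with gso_size = 1518 (a standard Ethernet frame) and an untagged TCP/IPv4 packet, hdr_offset = 14 (Ethernet) + 20 (IPv4) + 20 (TCP) = 54, so pyld_unit_size = 1518 - 54 - 4 (CRC) = 1460 bytes of TCP payload per output segment, the conventional MSS.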
diff --git a/lib/librte_gso/gso_tcp.h b/lib/librte_gso/gso_tcp.h
new file mode 100644
index 0000000..f291ccb
--- /dev/null
+++ b/lib/librte_gso/gso_tcp.h
@@ -0,0 +1,73 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_TCP_H_
+#define _GSO_TCP_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+/**
+ * Segment an IPv4/TCP packet. This function assumes the input packet has
+ * correct checksums and doesn't update checksums for GSO segment.
+ * Furthermore, it doesn't process IP fragment packets.
+ *
+ * The packet mbuf to segment.
+ * The max length of a GSO segment, measured in bytes.
+ * MBUF pool used for allocating direct buffers for output segments.
+ * MBUF pool used for allocating indirect buffers for output segments.
+ * Pointer array, which is used to store mbuf addresses of GSO segments.
+ * Caller should guarantee that 'pkts_out' is sufficiently large to store
+ * all GSO segments.
+ * The max number of items that 'pkts_out' can keep.
+ *
+ * - The number of GSO segments on success.
+ * - Return 1 if no GSO is performed.
+ * - Return -ENOMEM if available memory in mempools is insufficient.
+ * - Return -EINVAL for invalid parameters.
+ */
+int gso_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+
+#endif
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index b81afce..fac95f2 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -31,17 +31,57 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include <rte_log.h>
+
#include "rte_gso.h"
+#include "gso_common.h"
+#include "gso_tcp.h"
int
rte_gso_segment(struct rte_mbuf *pkt,
struct rte_gso_ctx gso_ctx,
struct rte_mbuf **pkts_out,
- uint16_t nb_pkts_out __rte_unused)
+ uint16_t nb_pkts_out)
{
+ struct rte_mempool *direct_pool, *indirect_pool;
+ struct rte_mbuf *pkt_seg;
+ uint16_t nb_segments, gso_size;
+
if (pkt == NULL || pkts_out == NULL || gso_ctx.direct_pool ==
NULL || gso_ctx.indirect_pool == NULL)
return -EINVAL;
Probably we don't need to check gso_ctx values for each incoming packet.
If you feel it is necessary, create a new function rte_gso_ctx_check() that
could be performed just once per ctx.
Agree. I will change it. Thanks.
Post by Ananyev, Konstantin
- return 1;
+ if ((gso_ctx.gso_types & RTE_GSO_TCP_IPV4) == 0 ||
+ gso_ctx.gso_size >= pkt->pkt_len ||
+ gso_ctx.gso_size == 0)
The first and third conditions seem redundant.
The reason to check gso_ctx.gso_types here is that we don't perform
GSO if applications don't set RTE_GSO_TCP_IPV4 in gso_ctx.gso_types,
even if the input packet is TCP/IPv4. And if gso_ctx.gso_size is 0,
we don't need to execute the following code. So should we still
remove these two conditions?
Post by Ananyev, Konstantin
+ return 1;
pkts_out[0] = pkt;
But why should we keep the input packet in the output array? Currently, if
GSO is not performed, no packets will be kept in pkts_out[]. Applications
can detect this from the return value of 1 from rte_gso_segment().
Post by Ananyev, Konstantin
+
+ pkt_seg = pkt;
+ gso_size = gso_ctx.gso_size;
+ direct_pool = gso_ctx.direct_pool;
+ indirect_pool = gso_ctx.indirect_pool;
+
+ /* Parse packet headers to determine how to segment 'pkt' */
+ gso_parse_packet(pkt);
I don't think we need to parse the packet here.
Instead, assume that the user has already filled the packet_type and l2/l3/..._len
fields correctly.
Hmm, I see it. Thanks.
Thanks,
Jiayu
Ananyev, Konstantin
2017-08-30 09:39:09 UTC
Permalink
Hi Mark,
Post by Kavanagh, Mark B
Post by Ananyev, Konstantin
Post by Ananyev, Konstantin
Post by Jiayu Hu
+
+void
+gso_parse_packet(struct rte_mbuf *pkt)
There is a function rte_net_get_ptype() that is supposed to provide similar
functionality.
So we probably don't need to create a new SW parse function here; instead it
would be better to reuse (and update if needed) an existing one.
Again, the user might already have the l2/l3/l4.../_len and packet_type fields set up.
So it's better to keep SW packet parsing out of scope of that library.
Hmm, I know we have discussed this design choice in the GRO library, and I
also think it's
better to reuse these values.
But from the perspective of OVS, it may add extra overhead, since OVS doesn't parse every packet.
Hi Jiayu, Konstantin
- the packet type (as it only currently supports TCP/IPv4, VxLAN, GRE packets)
- the l2/3/4_lens, etc. (in order to replicate the original packet's headers across outgoing segments)
For this, we can use the rte_net_get_ptype function, as per Konstantin's suggestion, as it provides both - thanks Konstantin!
WRT the extra overhead in OvS: TSO is the de facto standard, and GSO is provided purely as a fallback option. As such, and since the
additional packet parsing is a necessity in order to facilitate GSO, the additional overhead is IMO acceptable.
As I remember, for TSO in DPDK the user still has to provide l2/l3/l4_len and mss information to the PMD.
So unless the user knows these values straight away (i.e. the user creates the packet himself), some packet processing will be unavailable anyway.
Konstantin
Post by Kavanagh, Mark B
Thanks,
Mark
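For comparison, the per-packet state an application hands to a TSO-capable PMD looks roughly like this (a sketch using the mbuf TX offload flags of this DPDK generation; the 1460 MSS is illustrative):

pkt->l2_len = sizeof(struct ether_hdr);
pkt->l3_len = sizeof(struct ipv4_hdr);
pkt->l4_len = sizeof(struct tcp_hdr);
pkt->tso_segsz = 1460;	/* MSS */
pkt->ol_flags |= PKT_TX_TCP_SEG | PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
		 PKT_TX_TCP_CKSUM;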
Ananyev, Konstantin
2017-08-30 09:59:15 UTC
Permalink
-----Original Message-----
Sent: Wednesday, August 30, 2017 10:39 AM
Subject: Re: [dpdk-dev] [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
Hi Mark,
Post by Kavanagh, Mark B
Post by Ananyev, Konstantin
Post by Ananyev, Konstantin
Post by Jiayu Hu
+
+void
+gso_parse_packet(struct rte_mbuf *pkt)
There is a function rte_net_get_ptype() that is supposed to provide similar
functionality.
So we probably don't need to create a new SW parse function here; instead it
would be better to reuse (and update if needed) an existing one.
Again, the user might already have the l2/l3/l4.../_len and packet_type fields set up.
So it's better to keep SW packet parsing out of scope of that library.
Hmm, I know we have discussed this design choice in the GRO library, and I
also think it's
better to reuse these values.
But from the perspective of OVS, it may add extra overhead, since OVS doesn't parse every packet.
Hi Jiayu, Konstantin
- the packet type (as it only currently supports TCP/IPv4, VxLAN, GRE packets)
- the l2/3/4_lens, etc. (in order to replicate the original packet's headers across outgoing segments)
For this, we can use the rte_net_get_ptype function, as per Konstantin's suggestion, as it provides both - thanks Konstantin!
WRT the extra overhead in OvS: TSO is the defacto standard, and GSO is provided purely as a fallback option. As such, and since the
additional packet parsing is a necessity in order to facilitate GSO, the additional overhead is IMO acceptable.
As I remember, for TSO in DPDK the user still has to provide l2/l3/l4_len and mss information to the PMD.
So unless the user knows these values straight away (i.e. the user creates the packet himself), some packet processing will be unavailable anyway.
Konstantin
s/unavailable/unavoidable/
sorry for bad typing.
Konstantin
Post by Kavanagh, Mark B
Thanks,
Mark
Kavanagh, Mark B
2017-08-30 13:27:51 UTC
Permalink
From: Ananyev, Konstantin
Sent: Wednesday, August 30, 2017 10:59 AM
Subject: RE: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
-----Original Message-----
Sent: Wednesday, August 30, 2017 10:39 AM
Subject: Re: [dpdk-dev] [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
Hi Mark,
Post by Kavanagh, Mark B
Post by Ananyev, Konstantin
Post by Ananyev, Konstantin
Post by Jiayu Hu
+
+void
+gso_parse_packet(struct rte_mbuf *pkt)
There is a function rte_net_get_ptype() that is supposed to provide similar
functionality.
So we probably don't need to create a new SW parse function here; instead it
would be better to reuse (and update if needed) an existing one.
Again, the user might already have the l2/l3/l4.../_len and packet_type fields set up.
So it's better to keep SW packet parsing out of scope of that library.
Hmm, I know we have discussed this design choice in the GRO library, and I
also think it's better to reuse these values.
But from the perspective of OVS, it may add extra overhead, since OVS doesn't parse every packet.
Post by Kavanagh, Mark B
Hi Jiayu, Konstantin
- the packet type (as it only currently supports TCP/IPv4, VxLAN, GRE packets)
- the l2/3/4_lens, etc. (in order to replicate the original packet's headers across outgoing segments)
For this, we can use the rte_net_get_ptype function, as per Konstantin's suggestion, as it provides both - thanks Konstantin!
WRT the extra overhead in OvS: TSO is the de facto standard, and GSO is provided purely as a fallback option. As such, and since the
additional packet parsing is a necessity in order to facilitate GSO, the additional overhead is IMO acceptable.
As I remember, for TSO in DPDK the user still has to provide l2/l3/l4_len and
mss information to the PMD.
Yes, that's correct.
So unless the user knows these values straight away (i.e. the user creates the packet himself),
some packet processing will be unavailable anyway.
That's correct also. Currently, packets that originate in a VM, and which have been marked for TSO, have the l2_len, etc. fields populated by the 'parse_ethernet' function, called as part of the call stack of the rte_vhost_dequeue_burst function, so that particular overhead is already implicit in the TSO case.
Konstantin
s/unavailable/unavoidable/
sorry for bad typing.
Konstantin
Post by Kavanagh, Mark B
Thanks,
Mark
Jiayu Hu
2017-08-30 09:03:13 UTC
Permalink
Hi Konstantin,
Post by Ananyev, Konstantin
-----Original Message-----
From: Hu, Jiayu
Sent: Thursday, August 24, 2017 3:16 PM
Subject: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
This patch adds GSO support for TCP/IPv4 packets. Supported packets
may include a single VLAN tag. TCP/IPv4 GSO assumes that all input
packets have correct checksums, and doesn't update checksums for output
packets (the responsibility for this lies with the application).
Additionally, TCP/IPv4 GSO doesn't process IP fragmented packets.
TCP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one indirect
MBUF, to organize an output packet. Note that we refer to these two
chained MBUFs as a two-segment MBUF. The direct MBUF stores the packet
header, while the indirect mbuf simply points to a location within the
original packet's payload. Consequently, use of the GSO library requires
multi-segment MBUF support in the TX functions of the NIC driver.
If a packet is GSOed, TCP/IPv4 GSO reduces its MBUF refcnt by 1. As a
result, when all of its GSOed segments are freed, the packet is freed
automatically.
---
int
rte_gso_segment(struct rte_mbuf *pkt,
struct rte_gso_ctx gso_ctx,
struct rte_mbuf **pkts_out,
- uint16_t nb_pkts_out __rte_unused)
+ uint16_t nb_pkts_out)
{
+ struct rte_mempool *direct_pool, *indirect_pool;
+ struct rte_mbuf *pkt_seg;
+ uint16_t nb_segments, gso_size;
+
if (pkt == NULL || pkts_out == NULL || gso_ctx.direct_pool ==
NULL || gso_ctx.indirect_pool == NULL)
return -EINVAL;
Probably we don't need to check gso_ctx values for each incoming packet.
If you feel it is necessary, create a new function rte_gso_ctx_check() that
could be performed just once per ctx.
- return 1;
+ if ((gso_ctx.gso_types & RTE_GSO_TCP_IPV4) == 0 ||
+ gso_ctx.gso_size >= pkt->pkt_len ||
+ gso_ctx.gso_size == 0)
The first and third conditions seem redundant.
Yes, we don't need the first and the third check here. Please ignore the redundant
reply in the previous mail.

Thanks,
Jiayu
Jiayu Hu
2017-09-04 03:31:50 UTC
Permalink
Hi Konstantin,

About the IP identifier, I checked the Linux code and have some feedback inline.
Post by Ananyev, Konstantin
-----Original Message-----
From: Hu, Jiayu
Sent: Thursday, August 24, 2017 3:16 PM
Subject: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
This patch adds GSO support for TCP/IPv4 packets. Supported packets
may include a single VLAN tag. TCP/IPv4 GSO assumes that all input
packets have correct checksums, and doesn't update checksums for output
packets (the responsibility for this lies with the application).
Additionally, TCP/IPv4 GSO doesn't process IP fragmented packets.
TCP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one indirect
MBUF, to organize an output packet. Note that we refer to these two
chained MBUFs as a two-segment MBUF. The direct MBUF stores the packet
header, while the indirect mbuf simply points to a location within the
original packet's payload. Consequently, use of the GSO library requires
multi-segment MBUF support in the TX functions of the NIC driver.
If a packet is GSOed, TCP/IPv4 GSO reduces its MBUF refcnt by 1. As a
result, when all of its GSOed segments are freed, the packet is freed
automatically.
---
+void
+gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
+ struct rte_mbuf **out_segments)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ struct tcp_hdr *tcp_hdr;
+ struct rte_mbuf *seg;
+ uint32_t sent_seq;
+ uint16_t offset, i;
+ uint16_t tail_seg_idx = nb_segments - 1, id;
+
+ switch (pkt->packet_type) {
update_inner_tcp_hdr(..) or so.
Then you can reuse it for tunneled cases too.
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ pkt->l2_len);
+ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
+ id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
+ sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
+
+ for (i = 0; i < nb_segments; i++) {
+ seg = out_segments[i];
+
+ offset = seg->l2_len;
+ update_ipv4_header(rte_pktmbuf_mtod(seg, char *),
+ offset, seg->pkt_len, id);
+ id++;
Who would be responsible to make sure that we wouldn't have consecutive packets with the same IPv4 id?
Would it be the upper layer that forms the packet, or the gso library, or ...?
Linux supports two kinds of IP identifier: fixed identifiers and incremental identifiers, and
which one to use depends on the upper protocol modules. Specifically, if the protocol module
wants fixed identifiers, it will set SKB_GSO_TCP_FIXEDID in skb->gso_type, and then
inet_gso_segment() will keep the identifiers the same. Otherwise, all segments will have
incremental identifiers. The reason for this design is that some protocols may choose fixed
IP identifiers, like TCP (per RFC 791). This design also shows that Linux ignores the issue
of repeated IP identifiers.

From the perspective of DPDK, we need to solve two problems. One is whether to ignore the issue of
repeated IP identifiers. The other is whether the GSO library should provide an interface to enable upper
applications to choose between fixed and incremental identifiers, or should simply use
incremental IP identifiers.

Do you have any suggestions?

Thanks,
Jiayu
Post by Ananyev, Konstantin
+
+ offset += seg->l3_len;
+ update_tcp_header(rte_pktmbuf_mtod(seg, char *),
+ offset, sent_seq, i < tail_seg_idx);
+ sent_seq += seg->next->data_len;
+ }
+ break;
+ }
+}
--
2.7.4
Ananyev, Konstantin
2017-09-04 09:54:41 UTC
Permalink
Hi Jiayu,
-----Original Message-----
From: Hu, Jiayu
Sent: Monday, September 4, 2017 4:32 AM
Subject: Re: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
Hi Konstantin,
About the IP identifier, I checked the Linux code and have some feedback inline.
Post by Ananyev, Konstantin
-----Original Message-----
From: Hu, Jiayu
Sent: Thursday, August 24, 2017 3:16 PM
Subject: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
This patch adds GSO support for TCP/IPv4 packets. Supported packets
may include a single VLAN tag. TCP/IPv4 GSO assumes that all input
packets have correct checksums, and doesn't update checksums for output
packets (the responsibility for this lies with the application).
Additionally, TCP/IPv4 GSO doesn't process IP fragmented packets.
TCP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one indirect
MBUF, to organize an output packet. Note that we refer to these two
chained MBUFs as a two-segment MBUF. The direct MBUF stores the packet
header, while the indirect mbuf simply points to a location within the
original packet's payload. Consequently, use of the GSO library requires
multi-segment MBUF support in the TX functions of the NIC driver.
If a packet is GSOed, TCP/IPv4 GSO reduces its MBUF refcnt by 1. As a
result, when all of its GSOed segments are freed, the packet is freed
automatically.
---
+void
+gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
+ struct rte_mbuf **out_segments)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ struct tcp_hdr *tcp_hdr;
+ struct rte_mbuf *seg;
+ uint32_t sent_seq;
+ uint16_t offset, i;
+ uint16_t tail_seg_idx = nb_segments - 1, id;
+
+ switch (pkt->packet_type) {
update_inner_tcp_hdr(..) or so.
Then you can reuse it for tunneled cases too.
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ pkt->l2_len);
+ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
+ id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
+ sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
+
+ for (i = 0; i < nb_segments; i++) {
+ seg = out_segments[i];
+
+ offset = seg->l2_len;
+ update_ipv4_header(rte_pktmbuf_mtod(seg, char *),
+ offset, seg->pkt_len, id);
+ id++;
Who would be responsible to make sure that we wouldn't have consecutive packets with the same IPv4 id?
Would it be the upper layer that forms the packet, or the gso library, or ...?
Linux supports two kinds of IP identifier: fixed identifier and incremental identifier, and
which one to use depends on upper protocol modules. Specifically, if the protocol module
wants fixed identifiers, it will set SKB_GSO_TCP_FIXEDID to skb->gso_type, and then
inet_gso_segment() will keep identifiers the same. Otherwise, all segments will have
incremental identifiers. The reason for this design is that some protocols may choose fixed
IP identifiers, like TCP (from RFC791). This design also shows that linux ignores the issue
of repeated IP identifiers.
From the perspective of DPDK, we need to solve two problems. One is whether to ignore the issue of
repeated IP identifiers. The other is whether the GSO library should provide an interface to enable upper
applications to choose between fixed and incremental identifiers, or should simply use
incremental IP identifiers.
Do you have any suggestions?
Do the same as Linux?
I.e. add some flag RTE_GSO_IPID_FIXED (or so) into gso_ctx?
Konstantin
Thanks,
Jiayu
Post by Ananyev, Konstantin
+
+ offset += seg->l3_len;
+ update_tcp_header(rte_pktmbuf_mtod(seg, char *),
+ offset, sent_seq, i < tail_seg_idx);
+ sent_seq += seg->next->data_len;
+ }
+ break;
+ }
+}
--
2.7.4
Hu, Jiayu
2017-09-05 01:09:01 UTC
Permalink
Hi Konstantin,
-----Original Message-----
From: Ananyev, Konstantin
Sent: Monday, September 4, 2017 5:55 PM
Subject: RE: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
Hi Jiayu,
-----Original Message-----
From: Hu, Jiayu
Sent: Monday, September 4, 2017 4:32 AM
Subject: Re: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
Hi Konstantin,
About the IP identifier, I check the linux codes and have some feedbacks
inline.
Post by Ananyev, Konstantin
-----Original Message-----
From: Hu, Jiayu
Sent: Thursday, August 24, 2017 3:16 PM
Subject: [PATCH 2/5] gso/lib: add TCP/IPv4 GSO support
This patch adds GSO support for TCP/IPv4 packets. Supported packets
may include a single VLAN tag. TCP/IPv4 GSO assumes that all input
packets have correct checksums, and doesn't update checksums for output
packets (the responsibility for this lies with the application).
Additionally, TCP/IPv4 GSO doesn't process IP fragmented packets.
TCP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one indirect
MBUF, to organize an output packet. Note that we refer to these two
chained MBUFs as a two-segment MBUF. The direct MBUF stores the packet
header, while the indirect mbuf simply points to a location within the
original packet's payload. Consequently, use of the GSO library requires
multi-segment MBUF support in the TX functions of the NIC driver.
If a packet is GSOed, TCP/IPv4 GSO reduces its MBUF refcnt by 1. As a
result, when all of its GSOed segments are freed, the packet is freed
automatically.
---
+void
+gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
+ struct rte_mbuf **out_segments)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ struct tcp_hdr *tcp_hdr;
+ struct rte_mbuf *seg;
+ uint32_t sent_seq;
+ uint16_t offset, i;
+ uint16_t tail_seg_idx = nb_segments - 1, id;
+
+ switch (pkt->packet_type) {
update_inner_tcp_hdr(..) or so.
Then you can reuse it for tunneled cases too.
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ pkt->l2_len);
+ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
+ id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
+ sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
+
+ for (i = 0; i < nb_segments; i++) {
+ seg = out_segments[i];
+
+ offset = seg->l2_len;
+ update_ipv4_header(rte_pktmbuf_mtod(seg, char *),
+ offset, seg->pkt_len, id);
+ id++;
Who would be responsible to make sure that we wouldn't have
consecutive packets with the same IPv4 id?
Would it be the upper layer that forms the packet, or the gso library, or ...?
Linux supports two kinds of IP identifier: fixed identifiers and incremental identifiers, and
which one to use depends on the upper protocol modules. Specifically, if the protocol module
wants fixed identifiers, it will set SKB_GSO_TCP_FIXEDID in skb->gso_type, and then
inet_gso_segment() will keep the identifiers the same. Otherwise, all segments will have
incremental identifiers. The reason for this design is that some protocols may choose fixed
IP identifiers, like TCP (per RFC 791). This design also shows that Linux ignores the issue
of repeated IP identifiers.
From the perspective of DPDK, we need to solve two problems. One is whether to ignore the issue of
repeated IP identifiers. The other is whether the GSO library should provide an interface to enable upper
applications to choose between fixed and incremental identifiers, or should simply use
incremental IP identifiers.
Do you have any suggestions?
Do the same as Linux?
I.e. add some flag RTE_GSO_IPID_FIXED (or so) into gso_ctx?
OK, I see. We can do that.

In the GRO library, we unconditionally check that the IP identifiers are incremental. If we
enable fixed IP identifiers in GSO, it seems we also need to change the GRO library.
I mean, ignore the IP identifier when merging packets, and don't update the IP identifier
for the merged packet. What do you think of it?

Thanks,
Jiayu
Konstantin
Thanks,
Jiayu
Post by Ananyev, Konstantin
+
+ offset += seg->l3_len;
+ update_tcp_header(rte_pktmbuf_mtod(seg, char *),
+ offset, sent_seq, i < tail_seg_idx);
+ sent_seq += seg->next->data_len;
+ }
+ break;
+ }
+}
--
2.7.4
Ananyev, Konstantin
2017-09-11 13:04:07 UTC
Permalink
Hi Jiayu,
Post by Hu, Jiayu
Post by Jiayu Hu
Post by Jiayu Hu
Linux supports two kinds of IP identifier: fixed identifiers and incremental identifiers, and
which one to use depends on the upper protocol modules. Specifically, if the protocol module
wants fixed identifiers, it will set SKB_GSO_TCP_FIXEDID in skb->gso_type, and then
inet_gso_segment() will keep the identifiers the same. Otherwise, all segments will have
incremental identifiers. The reason for this design is that some protocols may choose fixed
IP identifiers, like TCP (per RFC 791). This design also shows that Linux ignores the issue
of repeated IP identifiers.
From the perspective of DPDK, we need to solve two problems. One is whether to ignore the issue of
repeated IP identifiers. The other is whether the GSO library should provide an interface to enable upper
applications to choose between fixed and incremental identifiers, or should simply use
incremental IP identifiers.
Do you have any suggestions?
Do the same as Linux?
I.e. add some flag RTE_GSO_IPID_FIXED (or so) into gso_ctx?
OK, I see. We can do that.
In the GRO library, we unconditionally check that the IP identifiers are incremental. If we
enable fixed IP identifiers in GSO, it seems we also need to change the GRO library.
I mean, ignore the IP identifier when merging packets, and don't update the IP identifier
for the merged packet. What do you think of it?
I suppose we can, if there is a use-case for it.
Konstantin
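A sketch of how such a flag could surface in this patch (RTE_GSO_IPID_FIXED and the gso_ctx 'flags' field are only the idea floated above; neither exists in the library yet):

#define RTE_GSO_IPID_FIXED (1ULL << 0)	/* hypothetical gso_ctx flag */

/* in gso_update_pkt_headers(): bump the IPv4 id between output
 * segments only when fixed identifiers were not requested */
update_ipv4_header(rte_pktmbuf_mtod(seg, char *), offset,
		seg->pkt_len, id);
if ((gso_ctx.flags & RTE_GSO_IPID_FIXED) == 0)
	id++;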
Jiayu Hu
2017-08-24 14:15:42 UTC
Permalink
From: Mark Kavanagh <***@intel.com>

This patch adds GSO support for VxLAN-encapsulated packets. Supported
VxLAN packets must have an outer IPv4 header (prepended by an optional
VLAN tag), and contain an inner TCP/IPv4 packet (with an optional inner
VLAN tag).

VxLAN GSO assumes that all input packets have correct checksums and
doesn't update checksums for output packets. Additionally, it doesn't
process IP fragmented packets.

As with TCP/IPv4 GSO, VxLAN GSO uses a two-segment MBUF to organize each
output packet, which mandates support for multi-segment mbufs in the TX
functions of the NIC driver. Also, if a packet is GSOed, VxLAN GSO
reduces its MBUF refcnt by 1. As a result, when all of its GSOed
segments are freed, the packet is freed automatically.
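To make the header budget concrete: for an untagged VxLAN packet the headers replicated into each output segment are outer Ethernet (14) + outer IPv4 (20) + UDP (8) + VxLAN (8) + inner Ethernet (14) + inner IPv4 (20) + TCP (20) = 104 bytes, so with gso_size = 1518 each segment carries at most 1518 - 104 - 4 (CRC) = 1410 bytes of inner TCP payload.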

Signed-off-by: Mark Kavanagh <***@intel.com>
Signed-off-by: Jiayu Hu <***@intel.com>
---
lib/librte_gso/Makefile | 1 +
lib/librte_gso/gso_common.c | 109 ++++++++++++++++++++++++++++++++++++++++++--
lib/librte_gso/gso_common.h | 41 ++++++++++++++++-
lib/librte_gso/gso_tunnel.c | 62 +++++++++++++++++++++++++
lib/librte_gso/gso_tunnel.h | 46 +++++++++++++++++++
lib/librte_gso/rte_gso.c | 12 ++++-
lib/librte_gso/rte_gso.h | 4 ++
7 files changed, 268 insertions(+), 7 deletions(-)
create mode 100644 lib/librte_gso/gso_tunnel.c
create mode 100644 lib/librte_gso/gso_tunnel.h

diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile
index 0f8e38f..a4d1a81 100644
--- a/lib/librte_gso/Makefile
+++ b/lib/librte_gso/Makefile
@@ -44,6 +44,7 @@ LIBABIVER := 1
SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c
SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_common.c
SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tcp.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tunnel.c

# install this header file
SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h
diff --git a/lib/librte_gso/gso_common.c b/lib/librte_gso/gso_common.c
index 2b54fbd..65cec44 100644
--- a/lib/librte_gso/gso_common.c
+++ b/lib/librte_gso/gso_common.c
@@ -39,6 +39,7 @@
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_tcp.h>
+#include <rte_udp.h>

#include "gso_common.h"

@@ -156,18 +157,60 @@ gso_do_segment(struct rte_mbuf *pkt,
return nb_segs;
}

+static inline void parse_ethernet(struct ether_hdr *eth_hdr,
+ struct rte_mbuf *pkt);
+
+static inline void
+parse_vxlan(struct udp_hdr *udp_hdr, struct rte_mbuf *pkt)
+{
+ struct ether_hdr *eth_hdr;
+
+ eth_hdr = (struct ether_hdr *)((char *)udp_hdr +
+ sizeof(struct udp_hdr) +
+ sizeof(struct vxlan_hdr));
+
+ pkt->packet_type |= RTE_PTYPE_TUNNEL_VXLAN;
+ pkt->outer_l2_len = pkt->l2_len;
+ parse_ethernet(eth_hdr, pkt);
+ pkt->l2_len += ETHER_VXLAN_HLEN; /* add udp + vxlan */
+}
+
+static inline void
+parse_udp(struct udp_hdr *udp_hdr, struct rte_mbuf *pkt)
+{
+ /* Outer UDP header of VxLAN packet */
+ if (udp_hdr->dst_port == rte_cpu_to_be_16(VXLAN_DEFAULT_PORT)) {
+ pkt->packet_type |= RTE_PTYPE_L4_UDP;
+ parse_vxlan(udp_hdr, pkt);
+ } else {
+ /* IPv4/UDP packet */
+ pkt->l4_len = sizeof(struct udp_hdr);
+ pkt->packet_type |= RTE_PTYPE_L4_UDP;
+ }
+}
+
static inline void
parse_ipv4(struct ipv4_hdr *ipv4_hdr, struct rte_mbuf *pkt)
{
struct tcp_hdr *tcp_hdr;
+ struct udp_hdr *udp_hdr;

switch (ipv4_hdr->next_proto_id) {
case IPPROTO_TCP:
- pkt->packet_type |= RTE_PTYPE_L4_TCP;
+ if (IS_VXLAN_PKT(pkt)) {
+ pkt->outer_l3_len = pkt->l3_len;
+ pkt->packet_type |= RTE_PTYPE_INNER_L4_TCP;
+ } else
+ pkt->packet_type |= RTE_PTYPE_L4_TCP;
pkt->l3_len = IPv4_HDR_LEN(ipv4_hdr);
tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
pkt->l4_len = TCP_HDR_LEN(tcp_hdr);
break;
+ case IPPROTO_UDP:
+ pkt->l3_len = IPv4_HDR_LEN(ipv4_hdr);
+ udp_hdr = (struct udp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
+ parse_udp(udp_hdr, pkt);
+ break;
}
}

@@ -182,13 +225,21 @@ parse_ethernet(struct ether_hdr *eth_hdr, struct rte_mbuf *pkt)
if (ethertype == ETHER_TYPE_VLAN) {
vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
pkt->l2_len = sizeof(struct vlan_hdr);
- pkt->packet_type |= RTE_PTYPE_L2_ETHER_VLAN;
+ if (IS_VXLAN_PKT(pkt))
+ pkt->packet_type |= RTE_PTYPE_INNER_L2_ETHER_VLAN;
+ else
+ pkt->packet_type |= RTE_PTYPE_L2_ETHER_VLAN;
ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
- }
+ } else
+ pkt->l2_len = 0;

switch (ethertype) {
case ETHER_TYPE_IPv4:
- if (IS_VLAN_PKT(pkt)) {
+ if (IS_VXLAN_PKT(pkt)) {
+ if (!IS_INNER_VLAN_PKT(pkt))
+ pkt->packet_type |= RTE_PTYPE_INNER_L2_ETHER;
+ pkt->packet_type |= RTE_PTYPE_INNER_L3_IPV4;
+ } else if (IS_VLAN_PKT(pkt)) {
pkt->packet_type |= RTE_PTYPE_L3_IPV4;
} else {
pkt->packet_type |= RTE_PTYPE_L2_ETHER;
@@ -236,14 +287,62 @@ void
gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
struct rte_mbuf **out_segments)
{
- struct ipv4_hdr *ipv4_hdr;
+ struct ipv4_hdr *ipv4_hdr, *outer_ipv4_hdr;
struct tcp_hdr *tcp_hdr;
+ struct udp_hdr *udp_hdr;
struct rte_mbuf *seg;
uint32_t sent_seq;
uint16_t offset, i;
uint16_t tail_seg_idx = nb_segments - 1, id;
+ uint16_t outer_id;

switch (pkt->packet_type) {
+ case ETHER_VLAN_IPv4_UDP_VXLAN_VLAN_IPv4_TCP_PKT:
+ case ETHER_VLAN_IPv4_UDP_VXLAN_IPv4_TCP_PKT:
+ case ETHER_IPv4_UDP_VXLAN_VLAN_IPv4_TCP_PKT:
+ case ETHER_IPv4_UDP_VXLAN_IPv4_TCP_PKT:
+ outer_ipv4_hdr =
+ (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ pkt->outer_l2_len);
+ ipv4_hdr = (struct ipv4_hdr *)((char *)(outer_ipv4_hdr +
+ pkt->outer_l3_len + pkt->l2_len));
+ tcp_hdr = (struct tcp_hdr *)(ipv4_hdr + 1);
+
+ outer_id = rte_be_to_cpu_16(outer_ipv4_hdr->packet_id);
+ id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
+ sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
+
+ for (i = 0; i < nb_segments; i++) {
+ seg = out_segments[i];
+
+ /* Update outer IPv4 header */
+ offset = seg->outer_l2_len;
+ update_ipv4_header(rte_pktmbuf_mtod(seg, char *),
+ offset, seg->pkt_len, outer_id);
+ outer_id++;
+
+ /* Update outer UDP header */
+ offset += seg->outer_l3_len;
+ udp_hdr = (struct udp_hdr *)(
+ rte_pktmbuf_mtod(seg, char *) +
+ offset);
+ udp_hdr->dgram_len = rte_cpu_to_be_16(seg->pkt_len -
+ offset);
+
+ /* Update inner IPv4 header */
+ offset += seg->l2_len;
+ update_ipv4_header(rte_pktmbuf_mtod(seg, char*),
+ offset, seg->pkt_len, id);
+ id++;
+
+ /* Update inner TCP header */
+ offset += seg->l3_len;
+ update_tcp_header(rte_pktmbuf_mtod(seg, char *),
+ offset, sent_seq, i < tail_seg_idx);
+
+ sent_seq += seg->next->data_len;
+ }
+ break;
case ETHER_VLAN_IPv4_TCP_PKT:
case ETHER_IPv4_TCP_PKT:
ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
diff --git a/lib/librte_gso/gso_common.h b/lib/librte_gso/gso_common.h
index d750041..0ad95d3 100644
--- a/lib/librte_gso/gso_common.h
+++ b/lib/librte_gso/gso_common.h
@@ -46,6 +46,8 @@
#define TCP_HDR_LEN(tcph) ((tcph->data_off & 0xf0) >> 2)

#define ETHER_IPv4_PKT (RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4)
+#define INNER_ETHER_IPv4_TCP_PKT (RTE_PTYPE_INNER_L2_ETHER |\
+ RTE_PTYPE_INNER_L3_IPV4 | RTE_PTYPE_INNER_L4_TCP)
/* Supported packet types */
/* TCP/IPv4 packet. */
#define ETHER_IPv4_TCP_PKT (ETHER_IPv4_PKT | RTE_PTYPE_L4_TCP)
@@ -54,9 +56,46 @@
#define ETHER_VLAN_IPv4_TCP_PKT (RTE_PTYPE_L2_ETHER_VLAN | \
RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP)

+/* VxLAN packet */
+#define ETHER_IPv4_UDP_VXLAN_IPv4_TCP_PKT (ETHER_IPv4_PKT | \
+ RTE_PTYPE_L4_UDP | \
+ RTE_PTYPE_TUNNEL_VXLAN | \
+ INNER_ETHER_IPv4_TCP_PKT)
+
+/* VxLAN packet with outer VLAN tag. */
+#define ETHER_VLAN_IPv4_UDP_VXLAN_IPv4_TCP_PKT (RTE_PTYPE_L2_ETHER_VLAN | \
+ RTE_PTYPE_L3_IPV4 | \
+ RTE_PTYPE_L4_UDP | \
+ RTE_PTYPE_TUNNEL_VXLAN | \
+ INNER_ETHER_IPv4_TCP_PKT)
+
+/* VxLAN packet with inner VLAN tag. */
+#define ETHER_IPv4_UDP_VXLAN_VLAN_IPv4_TCP_PKT (ETHER_IPv4_PKT | \
+ RTE_PTYPE_L4_UDP | \
+ RTE_PTYPE_TUNNEL_VXLAN | \
+ RTE_PTYPE_INNER_L2_ETHER_VLAN | \
+ RTE_PTYPE_INNER_L3_IPV4 | \
+ RTE_PTYPE_INNER_L4_TCP)
+
+/* VxLAN packet with both outer and inner VLAN tags. */
+#define ETHER_VLAN_IPv4_UDP_VXLAN_VLAN_IPv4_TCP_PKT (\
+ RTE_PTYPE_L2_ETHER_VLAN | \
+ RTE_PTYPE_L3_IPV4 | \
+ RTE_PTYPE_L4_UDP | \
+ RTE_PTYPE_TUNNEL_VXLAN | \
+ RTE_PTYPE_INNER_L2_ETHER_VLAN | \
+ RTE_PTYPE_INNER_L3_IPV4 | \
+ RTE_PTYPE_INNER_L4_TCP)
+
#define IS_VLAN_PKT(pkt) ((pkt->packet_type & RTE_PTYPE_L2_ETHER_VLAN) == \
RTE_PTYPE_L2_ETHER_VLAN)
+#define IS_INNER_VLAN_PKT(pkt) (\
+ (pkt->packet_type & RTE_PTYPE_INNER_L2_ETHER_VLAN) == \
+ RTE_PTYPE_INNER_L2_ETHER_VLAN)

+#define VXLAN_DEFAULT_PORT 4789
+#define IS_VXLAN_PKT(pkt) ((pkt->packet_type & RTE_PTYPE_TUNNEL_VXLAN) == \
+ RTE_PTYPE_TUNNEL_VXLAN)
/**
* Internal function which parses a packet, setting outer_l2/l3_len and
* l2/l3/l4_len and packet_type.
@@ -92,7 +131,7 @@ void gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
* @param pkt
* Packet to segment.
* @param pkt_hdr_offset
- * Packet header offset, measured in byte.
+ * Packet header offset, measured in bytes.
* @param pyld_unit_size
* The max payload length of a GSO segment.
* @param direct_pool
diff --git a/lib/librte_gso/gso_tunnel.c b/lib/librte_gso/gso_tunnel.c
new file mode 100644
index 0000000..6a04697
--- /dev/null
+++ b/lib/librte_gso/gso_tunnel.c
@@ -0,0 +1,62 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <rte_ether.h>
+
+#include "gso_common.h"
+#include "gso_tunnel.h"
+
+int
+gso_tunnel_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ uint16_t pyld_unit_size, hdr_offset;
+ int ret;
+
+ hdr_offset = pkt->outer_l2_len + pkt->outer_l3_len + pkt->l2_len +
+ pkt->l3_len + pkt->l4_len;
+
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
+
+ ret = gso_do_segment(pkt, hdr_offset, pyld_unit_size, direct_pool,
+ indirect_pool, pkts_out, nb_pkts_out);
+
+ if (ret > 1)
+ gso_update_pkt_headers(pkt, ret, pkts_out);
+
+ return ret;
+}
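
(As a rough worked example of the arithmetic above, assuming a VxLAN
packet with no VLAN tags and no IP options: hdr_offset = 14B outer
Ethernet + 20B outer IPv4 + 8B UDP + 8B VxLAN + 14B inner Ethernet +
20B inner IPv4 + 20B TCP = 104B, so with gso_size = 1518 each output
segment carries at most 1518 - 104 - 4 (ETHER_CRC_LEN) = 1410 payload
bytes.)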
diff --git a/lib/librte_gso/gso_tunnel.h b/lib/librte_gso/gso_tunnel.h
new file mode 100644
index 0000000..a9b2363
--- /dev/null
+++ b/lib/librte_gso/gso_tunnel.h
@@ -0,0 +1,46 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_TUNNEL_H_
+#define _GSO_TUNNEL_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+int gso_tunnel_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+#endif
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index fac95f2..f110f18 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -36,6 +36,7 @@
#include "rte_gso.h"
#include "gso_common.h"
#include "gso_tcp.h"
+#include "gso_tunnel.h"

int
rte_gso_segment(struct rte_mbuf *pkt,
@@ -51,7 +52,8 @@ rte_gso_segment(struct rte_mbuf *pkt,
NULL || gso_ctx.indirect_pool == NULL)
return -EINVAL;

- if ((gso_ctx.gso_types & RTE_GSO_TCP_IPV4) == 0 ||
+ if ((gso_ctx.gso_types & (RTE_GSO_TCP_IPV4 |
+ RTE_GSO_IPV4_VXLAN_TCP_IPV4)) == 0 ||
gso_ctx.gso_size >= pkt->pkt_len ||
gso_ctx.gso_size == 0)
return 1;
@@ -71,6 +73,14 @@ rte_gso_segment(struct rte_mbuf *pkt,
direct_pool, indirect_pool,
pkts_out, nb_pkts_out);
break;
+ case ETHER_VLAN_IPv4_UDP_VXLAN_VLAN_IPv4_TCP_PKT:
+ case ETHER_VLAN_IPv4_UDP_VXLAN_IPv4_TCP_PKT:
+ case ETHER_IPv4_UDP_VXLAN_VLAN_IPv4_TCP_PKT:
+ case ETHER_IPv4_UDP_VXLAN_IPv4_TCP_PKT:
+ nb_segments = gso_tunnel_segment(pkt, gso_size,
+ direct_pool, indirect_pool,
+ pkts_out, nb_pkts_out);
+ break;
default:
RTE_LOG(WARNING, GSO, "Unsupported packet type\n");
nb_segments = 1;
diff --git a/lib/librte_gso/rte_gso.h b/lib/librte_gso/rte_gso.h
index 77853fa..e1b2c23 100644
--- a/lib/librte_gso/rte_gso.h
+++ b/lib/librte_gso/rte_gso.h
@@ -48,6 +48,10 @@ extern "C" {

#define RTE_GSO_TCP_IPV4 (1ULL << 0)
/**< GSO flag for TCP/IPv4 packets (containing optional VLAN tag) */
+#define RTE_GSO_IPV4_VXLAN_TCP_IPV4 (1ULL << 1)
+/**< GSO flag for VxLAN packets that contain outer IPv4, and inner
+ * TCP/IPv4 headers (plus optional inner and/or outer VLAN tags).
+ */

/**
* GSO context structure.
--
2.7.4
Jiayu Hu
2017-08-24 14:15:43 UTC
From: Mark Kavanagh <***@intel.com>

This patch adds GSO support for GRE-tunneled packets. Supported GRE
packets must contain an outer IPv4 header, and inner TCP/IPv4 headers.
They may also contain a single VLAN tag. GRE GSO assumes that all input
packets have correct checksums and doesn't update checksums for output
packets. Additionally, it doesn't process IP fragmented packets.

As with VxLAN GSO, GRE GSO uses a two-segment MBUF to organize each
output packet, which requires multi-segment mbuf support in the TX
functions of the NIC driver. Also, if a packet is GSOed, GRE GSO reduces
its MBUF refcnt by 1. As a result, when all of its GSOed segments are
freed, the packet is freed automatically.
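
A minimal sketch of the resulting ownership model on the caller side
(not part of the patch; free_gso_output() and its arguments are
hypothetical names):

#include <rte_mbuf.h>

/* Each GSO output segment is a two-mbuf chain: a direct mbuf holding a
 * copy of the headers plus an indirect mbuf referencing payload in the
 * input packet. Freeing every chain drops the indirect references, so
 * the input packet is released once the last segment is freed. */
static void
free_gso_output(struct rte_mbuf **segs_out, uint16_t nb_segs)
{
	uint16_t i;

	for (i = 0; i < nb_segs; i++)
		rte_pktmbuf_free(segs_out[i]);
}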

Signed-off-by: Mark Kavanagh <***@intel.com>
Signed-off-by: Jiayu Hu <***@intel.com>
---
lib/librte_gso/gso_common.c | 66 +++++++++++++++++++++++++++++++++++++++++++--
lib/librte_gso/gso_common.h | 21 +++++++++++++++
lib/librte_gso/rte_gso.c | 5 +++-
lib/librte_gso/rte_gso.h | 4 +++
4 files changed, 93 insertions(+), 3 deletions(-)

diff --git a/lib/librte_gso/gso_common.c b/lib/librte_gso/gso_common.c
index 65cec44..b3e7f9d 100644
--- a/lib/librte_gso/gso_common.c
+++ b/lib/librte_gso/gso_common.c
@@ -37,6 +37,7 @@
#include <rte_malloc.h>

#include <rte_ether.h>
+#include <rte_gre.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
@@ -159,6 +160,8 @@ gso_do_segment(struct rte_mbuf *pkt,

static inline void parse_ethernet(struct ether_hdr *eth_hdr,
struct rte_mbuf *pkt);
+static inline void parse_ipv4(struct ipv4_hdr *ipv4_hdr,
+ struct rte_mbuf *pkt);

static inline void
parse_vxlan(struct udp_hdr *udp_hdr, struct rte_mbuf *pkt)
@@ -190,15 +193,29 @@ parse_udp(struct udp_hdr *udp_hdr, struct rte_mbuf *pkt)
}

static inline void
+parse_gre(struct gre_hdr *gre_hdr, struct rte_mbuf *pkt)
+{
+ struct ipv4_hdr *ipv4_hdr;
+
+ if (gre_hdr->proto == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) {
+ ipv4_hdr = (struct ipv4_hdr *)(gre_hdr + 1);
+ pkt->packet_type |= RTE_PTYPE_INNER_L3_IPV4;
+ parse_ipv4(ipv4_hdr, pkt);
+ }
+}
+
+static inline void
parse_ipv4(struct ipv4_hdr *ipv4_hdr, struct rte_mbuf *pkt)
{
+ struct gre_hdr *gre_hdr;
struct tcp_hdr *tcp_hdr;
struct udp_hdr *udp_hdr;

switch (ipv4_hdr->next_proto_id) {
case IPPROTO_TCP:
- if (IS_VXLAN_PKT(pkt)) {
- pkt->outer_l3_len = pkt->l3_len;
+ if (IS_TUNNEL_PKT(pkt)) {
+ if (IS_VXLAN_PKT(pkt))
+ pkt->outer_l3_len = pkt->l3_len;
pkt->packet_type |= RTE_PTYPE_INNER_L4_TCP;
} else
pkt->packet_type |= RTE_PTYPE_L4_TCP;
@@ -211,6 +228,14 @@ parse_ipv4(struct ipv4_hdr *ipv4_hdr, struct rte_mbuf *pkt)
udp_hdr = (struct udp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
parse_udp(udp_hdr, pkt);
break;
+ case IPPROTO_GRE:
+ gre_hdr = (struct gre_hdr *)((char *)ipv4_hdr + IPv4_HDR_LEN(ipv4_hdr));
+ pkt->outer_l2_len = pkt->l2_len;
+ pkt->outer_l3_len = IPv4_HDR_LEN(ipv4_hdr);
+ pkt->l2_len = sizeof(*gre_hdr);
+ pkt->packet_type |= RTE_PTYPE_TUNNEL_GRE;
+ parse_gre(gre_hdr, pkt);
+ break;
}
}

@@ -343,6 +368,43 @@ gso_update_pkt_headers(struct rte_mbuf *pkt, uint16_t nb_segments,
sent_seq += seg->next->data_len;
}
break;
+ case ETHER_VLAN_IPv4_GRE_IPv4_TCP_PKT:
+ case ETHER_IPv4_GRE_IPv4_TCP_PKT:
+ outer_ipv4_hdr =
+ (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ pkt->outer_l2_len);
+ ipv4_hdr = (struct ipv4_hdr *)((char *)outer_ipv4_hdr +
+ pkt->outer_l3_len + pkt->l2_len);
+ tcp_hdr = (struct tcp_hdr *)(ipv4_hdr + 1);
+
+ /* Retrieve values from original packet */
+ id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
+ outer_id = rte_be_to_cpu_16(outer_ipv4_hdr->packet_id);
+ sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
+
+ for (i = 0; i < nb_segments; i++) {
+ seg = out_segments[i];
+
+ /* Update outer IPv4 header */
+ offset = seg->outer_l2_len;
+ update_ipv4_header(rte_pktmbuf_mtod(seg, char *),
+ offset, seg->pkt_len, outer_id);
+ outer_id++;
+
+ /* Update inner IPv4 header */
+ offset += seg->outer_l3_len + seg->l2_len;
+ update_ipv4_header(rte_pktmbuf_mtod(seg, char *),
+ offset, seg->pkt_len, id);
+ id++;
+
+ /* Update inner TCP header */
+ offset += seg->l3_len;
+ update_tcp_header(rte_pktmbuf_mtod(seg, char *),
+ offset, sent_seq, i < tail_seg_idx);
+
+ sent_seq += seg->next->data_len;
+ }
+ break;
case ETHER_VLAN_IPv4_TCP_PKT:
case ETHER_IPv4_TCP_PKT:
ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
diff --git a/lib/librte_gso/gso_common.h b/lib/librte_gso/gso_common.h
index 0ad95d3..2ed264a 100644
--- a/lib/librte_gso/gso_common.h
+++ b/lib/librte_gso/gso_common.h
@@ -87,6 +87,21 @@
RTE_PTYPE_INNER_L3_IPV4 | \
RTE_PTYPE_INNER_L4_TCP)

+/* GRE packet. */
+#define ETHER_IPv4_GRE_IPv4_TCP_PKT (\
+ ETHER_IPv4_PKT | \
+ RTE_PTYPE_TUNNEL_GRE | \
+ RTE_PTYPE_INNER_L3_IPV4 | \
+ RTE_PTYPE_INNER_L4_TCP)
+
+/* GRE packet with VLAN tag. */
+#define ETHER_VLAN_IPv4_GRE_IPv4_TCP_PKT (\
+ RTE_PTYPE_L2_ETHER_VLAN | \
+ RTE_PTYPE_L3_IPV4 | \
+ RTE_PTYPE_TUNNEL_GRE | \
+ RTE_PTYPE_INNER_L3_IPV4 | \
+ RTE_PTYPE_INNER_L4_TCP)
+
#define IS_VLAN_PKT(pkt) ((pkt->packet_type & RTE_PTYPE_L2_ETHER_VLAN) == \
RTE_PTYPE_L2_ETHER_VLAN)
#define IS_INNER_VLAN_PKT(pkt) (\
@@ -96,6 +111,12 @@
#define VXLAN_DEFAULT_PORT 4789
#define IS_VXLAN_PKT(pkt) ((pkt->packet_type & RTE_PTYPE_TUNNEL_VXLAN) == \
RTE_PTYPE_TUNNEL_VXLAN)
+
+#define IS_GRE_PKT(pkt) ((pkt->packet_type & RTE_PTYPE_TUNNEL_GRE) == \
+ RTE_PTYPE_TUNNEL_GRE)
+
+#define IS_TUNNEL_PKT(pkt) ((pkt->packet_type & \
+ (RTE_PTYPE_TUNNEL_VXLAN | RTE_PTYPE_TUNNEL_GRE)) != 0)
/**
* Internal function which parses a packet, setting outer_l2/l3_len and
* l2/l3/l4_len and packet_type.
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index f110f18..244bbf6 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -53,7 +53,8 @@ rte_gso_segment(struct rte_mbuf *pkt,
return -EINVAL;

if ((gso_ctx.gso_types & (RTE_GSO_TCP_IPV4 |
- RTE_GSO_IPV4_VXLAN_TCP_IPV4)) == 0 ||
+ RTE_GSO_IPV4_VXLAN_TCP_IPV4 |
+ RTE_GSO_IPV4_GRE_TCP_IPV4)) == 0 ||
gso_ctx.gso_size >= pkt->pkt_len ||
gso_ctx.gso_size == 0)
return 1;
@@ -77,6 +78,8 @@ rte_gso_segment(struct rte_mbuf *pkt,
case ETHER_VLAN_IPv4_UDP_VXLAN_IPv4_TCP_PKT:
case ETHER_IPv4_UDP_VXLAN_VLAN_IPv4_TCP_PKT:
case ETHER_IPv4_UDP_VXLAN_IPv4_TCP_PKT:
+ case ETHER_VLAN_IPv4_GRE_IPv4_TCP_PKT:
+ case ETHER_IPv4_GRE_IPv4_TCP_PKT:
nb_segments = gso_tunnel_segment(pkt, gso_size,
direct_pool, indirect_pool,
pkts_out, nb_pkts_out);
diff --git a/lib/librte_gso/rte_gso.h b/lib/librte_gso/rte_gso.h
index e1b2c23..86ca790 100644
--- a/lib/librte_gso/rte_gso.h
+++ b/lib/librte_gso/rte_gso.h
@@ -52,6 +52,10 @@ extern "C" {
/**< GSO flag for VxLAN packets that contain outer IPv4, and inner
* TCP/IPv4 headers (plus optional inner and/or outer VLAN tags).
*/
+#define RTE_GSO_IPV4_GRE_TCP_IPV4 (1ULL << 2)
+/**< GSO flag for GRE packets that contain outer IPv4, and inner
+ * TCP/IPv4 headers (with optional outer VLAN tag).
+ */

/**
* GSO context structure.
--
2.7.4
Jiayu Hu
2017-08-24 14:15:44 UTC
This patch adds GSO support to the csum forwarding engine. Oversized
packets transmitted over a GSO-enabled port will undergo segmentation
(with the exception of packet types unsupported by the GSO library).
GSO support is disabled by default.

GSO support may be toggled on a per-port basis, using the command

"set port <port_id> gso on|off".

The maximum packet length for GSO segments may be set with the command

"set port <port_id> gso_segsz <length>"

Signed-off-by: Jiayu Hu <***@intel.com>
Signed-off-by: Mark Kavanagh <***@intel.com>
---
app/test-pmd/cmdline.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++
app/test-pmd/config.c | 25 ++++++++++
app/test-pmd/csumonly.c | 68 +++++++++++++++++++++++++--
app/test-pmd/testpmd.c | 9 ++++
app/test-pmd/testpmd.h | 10 ++++
5 files changed, 228 insertions(+), 5 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index cd8c358..754e249 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -431,6 +431,13 @@ static void cmd_help_long_parsed(void *parsed_result,
" Set max flow number and max packet number per-flow"
" for GRO.\n\n"

+ "set port (port_id) gso (on|off)"
+ " Enable or disable Generic Segmentation Offload in"
+ " csum forwarding engine.\n\n"
+
+ "set port <port_id> gso_segsz <length>\n"
+ " Set max packet length for GSO segment.\n\n"
+
"set fwd (%s)\n"
" Set packet forwarding mode.\n\n"

@@ -3963,6 +3970,118 @@ cmdline_parse_inst_t cmd_gro_set = {
},
};

+/* *** ENABLE/DISABLE GSO FOR PORTS *** */
+struct cmd_gso_enable_result {
+ cmdline_fixed_string_t cmd_set;
+ cmdline_fixed_string_t cmd_port;
+ cmdline_fixed_string_t cmd_keyword;
+ cmdline_fixed_string_t cmd_mode;
+ uint8_t cmd_pid;
+};
+
+static void
+cmd_gso_enable_parsed(void *parsed_result,
+ __attribute__((unused)) struct cmdline *cl,
+ __attribute__((unused)) void *data)
+{
+ struct cmd_gso_enable_result *res;
+
+ res = parsed_result;
+ setup_gso(res->cmd_mode, res->cmd_pid);
+}
+
+cmdline_parse_token_string_t cmd_gso_enable_set =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_enable_result,
+ cmd_set, "set");
+cmdline_parse_token_string_t cmd_gso_enable_port =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_enable_result,
+ cmd_port, "port");
+cmdline_parse_token_string_t cmd_gso_enable_keyword =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_enable_result,
+ cmd_keyword, "gso");
+cmdline_parse_token_string_t cmd_gso_enable_mode =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_enable_result,
+ cmd_mode, "on#off");
+cmdline_parse_token_num_t cmd_gso_enable_pid =
+ TOKEN_NUM_INITIALIZER(struct cmd_gso_enable_result,
+ cmd_pid, UINT8);
+
+cmdline_parse_inst_t cmd_gso_enable = {
+ .f = cmd_gso_enable_parsed,
+ .data = NULL,
+ .help_str = "set port <port_id> gso on|off",
+ .tokens = {
+ (void *)&cmd_gso_enable_set,
+ (void *)&cmd_gso_enable_port,
+ (void *)&cmd_gso_enable_pid,
+ (void *)&cmd_gso_enable_keyword,
+ (void *)&cmd_gso_enable_mode,
+ NULL,
+ },
+};
+
+/* *** SET MAX PACKET LENGTH FOR GSO SEGMENT *** */
+struct cmd_gso_size_result {
+ cmdline_fixed_string_t cmd_set;
+ cmdline_fixed_string_t cmd_port;
+ cmdline_fixed_string_t cmd_keyword;
+ uint16_t cmd_segsz;
+ uint8_t cmd_pid;
+};
+
+static void
+cmd_gso_size_parsed(void *parsed_result,
+ __attribute__((unused)) struct cmdline *cl,
+ __attribute__((unused)) void *data)
+{
+ struct cmd_gso_size_result *res = parsed_result;
+
+ if (port_id_is_invalid(res->cmd_pid, ENABLED_WARN))
+ return;
+
+ if (!strcmp(res->cmd_keyword, "gso_segsz")) {
+ if (res->cmd_segsz == 0) {
+ gso_ports[res->cmd_pid].enable = 0;
+ gso_ports[res->cmd_pid].gso_segsz = 0;
+ printf("Input gso_segsz is 0. Disable GSO for"
+ " port %u\n", res->cmd_pid);
+ } else
+ gso_ports[res->cmd_pid].gso_segsz = res->cmd_segsz;
+
+ }
+}
+
+cmdline_parse_token_string_t cmd_gso_size_set =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_size_result,
+ cmd_set, "set");
+cmdline_parse_token_string_t cmd_gso_size_port =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_size_result,
+ cmd_port, "port");
+cmdline_parse_token_string_t cmd_gso_size_keyword =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_size_result,
+ cmd_keyword, "gso_segsz");
+cmdline_parse_token_num_t cmd_gso_size_segsz =
+ TOKEN_NUM_INITIALIZER(struct cmd_gso_size_result,
+ cmd_segsz, UINT16);
+cmdline_parse_token_num_t cmd_gso_size_pid =
+ TOKEN_NUM_INITIALIZER(struct cmd_gso_size_result,
+ cmd_pid, UINT8);
+
+cmdline_parse_inst_t cmd_gso_size = {
+ .f = cmd_gso_size_parsed,
+ .data = NULL,
+ .help_str = "set port <port_id> gso_segsz <length>: set max "
+ "packet length for GSO segment (0 to disable GSO)",
+ .tokens = {
+ (void *)&cmd_gso_size_set,
+ (void *)&cmd_gso_size_port,
+ (void *)&cmd_gso_size_pid,
+ (void *)&cmd_gso_size_keyword,
+ (void *)&cmd_gso_size_segsz,
+ NULL,
+ },
+};
+
/* *** ENABLE/DISABLE FLUSH ON RX STREAMS *** */
struct cmd_set_flush_rx {
cmdline_fixed_string_t set;
@@ -14251,6 +14370,8 @@ cmdline_parse_ctx_t main_ctx[] = {
(cmdline_parse_inst_t *)&cmd_tunnel_tso_show,
(cmdline_parse_inst_t *)&cmd_enable_gro,
(cmdline_parse_inst_t *)&cmd_gro_set,
+ (cmdline_parse_inst_t *)&cmd_gso_enable,
+ (cmdline_parse_inst_t *)&cmd_gso_size,
(cmdline_parse_inst_t *)&cmd_link_flow_control_set,
(cmdline_parse_inst_t *)&cmd_link_flow_control_set_rx,
(cmdline_parse_inst_t *)&cmd_link_flow_control_set_tx,
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 3ae3e1c..1837fb1 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -2454,6 +2454,31 @@ setup_gro(const char *mode, uint8_t port_id)
}
}

+void
+setup_gso(const char *mode, uint8_t port_id)
+{
+ if (!rte_eth_dev_is_valid_port(port_id)) {
+ printf("invalid port id %u\n", port_id);
+ return;
+ }
+ if (strcmp(mode, "on") == 0) {
+ if (test_done == 0) {
+ printf("before enable GSO,"
+ " please stop forwarding first\n");
+ return;
+ }
+ gso_ports[port_id].enable = 1;
+ gso_ports[port_id].gso_segsz = ETHER_MAX_LEN;
+ } else if (strcmp(mode, "off") == 0) {
+ if (test_done == 0) {
+ printf("before disable GSO,"
+ " please stop forwarding first\n");
+ return;
+ }
+ gso_ports[port_id].enable = 0;
+ }
+}
+
char*
list_pkt_forwarding_modes(void)
{
diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
index 90c8119..f55bb0f 100644
--- a/app/test-pmd/csumonly.c
+++ b/app/test-pmd/csumonly.c
@@ -70,6 +70,7 @@
#include <rte_string_fns.h>
#include <rte_flow.h>
#include <rte_gro.h>
+#include <rte_gso.h>
#include "testpmd.h"

#define IP_DEFTTL 64 /* from RFC 1340. */
@@ -627,6 +628,9 @@ static void
pkt_burst_checksum_forward(struct fwd_stream *fs)
{
struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
+ struct rte_mbuf *gso_segments[GSO_MAX_PKT_BURST];
+ struct rte_gso_ctx *gso_ctx;
+ struct rte_mbuf **tx_pkts_burst;
struct rte_port *txp;
struct rte_mbuf *m, *p;
struct ether_hdr *eth_hdr;
@@ -641,6 +645,9 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
uint32_t rx_bad_ip_csum;
uint32_t rx_bad_l4_csum;
struct testpmd_offload_info info;
+ uint8_t no_gso[GSO_MAX_PKT_BURST] = {0};
+ uint16_t nb_segments = 0;
+ int ret;

#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
uint64_t start_tsc;
@@ -851,13 +858,54 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
}
}

+ if (unlikely(gso_ports[fs->tx_port].enable)) {
+ gso_ctx = &(current_fwd_lcore()->gso_ctx);
+ gso_ctx->gso_size = gso_ports[fs->tx_port].gso_segsz > 0 ?
+ gso_ports[fs->tx_port].gso_segsz : gso_ctx->gso_size;
+ for (i = 0; i < nb_rx; i++) {
+ ret = rte_gso_segment(pkts_burst[i], *gso_ctx,
+ &gso_segments[nb_segments],
+ GSO_MAX_PKT_BURST - nb_segments);
+ if (ret > 1)
+ nb_segments += ret;
+ else if (ret == 1) {
+ gso_segments[nb_segments] = pkts_burst[i];
+ no_gso[nb_segments++] = 1;
+ } else {
+ /* insufficient MBUFs, stop GSO */
+ memcpy(&gso_segments[nb_segments],
+ &pkts_burst[i],
+ sizeof(struct rte_mbuf *) *
+ (nb_rx - i));
+ nb_segments += (nb_rx - i);
+ break;
+ }
+ if (unlikely(nb_rx - i >= GSO_MAX_PKT_BURST -
+ nb_segments)) {
+ /*
+ * insufficient space in gso_segments,
+ * stop GSO.
+ */
+ memcpy(&gso_segments[nb_segments],
+ &pkts_burst[i],
+ sizeof(struct rte_mbuf *) *
+ (nb_rx - i));
+ nb_segments += (nb_rx - i);
+ break;
+ }
+ }
+ tx_pkts_burst = gso_segments;
+ nb_rx = nb_segments;
+ } else
+ tx_pkts_burst = pkts_burst;
+
nb_prep = rte_eth_tx_prepare(fs->tx_port, fs->tx_queue,
- pkts_burst, nb_rx);
+ tx_pkts_burst, nb_rx);
if (nb_prep != nb_rx)
printf("Preparing packet burst to transmit failed: %s\n",
rte_strerror(rte_errno));

- nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst,
+ nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, tx_pkts_burst,
nb_prep);

/*
@@ -868,7 +916,7 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
rte_delay_us(burst_tx_delay_time);
nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
- &pkts_burst[nb_tx], nb_rx - nb_tx);
+ &tx_pkts_burst[nb_tx], nb_rx - nb_tx);
}
}
fs->tx_packets += nb_tx;
@@ -878,12 +926,22 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
#ifdef RTE_TEST_PMD_RECORD_BURST_STATS
fs->tx_burst_stats.pkt_burst_spread[nb_tx]++;
#endif
- if (unlikely(nb_tx < nb_rx)) {
+
+ if (unlikely(nb_tx < nb_rx) &&
+ unlikely(gso_ports[fs->tx_port].enable)) {
fs->fwd_dropped += (nb_rx - nb_tx);
do {
- rte_pktmbuf_free(pkts_burst[nb_tx]);
+ if (no_gso[nb_tx] == 0)
+ rte_pktmbuf_detach(tx_pkts_burst[nb_tx]->next);
+ rte_pktmbuf_free(tx_pkts_burst[nb_tx]);
+ } while (++nb_tx < nb_rx);
+ } else if (unlikely(nb_tx < nb_rx)) {
+ fs->fwd_dropped += (nb_rx - nb_tx);
+ do {
+ rte_pktmbuf_free(tx_pkts_burst[nb_tx]);
} while (++nb_tx < nb_rx);
}
+
#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
end_tsc = rte_rdtsc();
core_cycles = (end_tsc - start_tsc);
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 7d40139..16c60f0 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -400,6 +400,8 @@ static int eth_event_callback(uint8_t port_id,
*/
static int all_ports_started(void);

+struct gso_status gso_ports[RTE_MAX_ETHPORTS];
+
/*
* Helper function to check if socket is already discovered.
* If yes, return positive value. If not, return zero.
@@ -664,6 +666,13 @@ init_config(void)
if (mbp == NULL)
mbp = mbuf_pool_find(0);
fwd_lcores[lc_id]->mbp = mbp;
+ /* initialize GSO context */
+ fwd_lcores[lc_id]->gso_ctx.direct_pool = mbp;
+ fwd_lcores[lc_id]->gso_ctx.indirect_pool = mbp;
+ fwd_lcores[lc_id]->gso_ctx.gso_types = RTE_GSO_TCP_IPV4 |
+ RTE_GSO_IPV4_VXLAN_TCP_IPV4 |
+ RTE_GSO_IPV4_GRE_TCP_IPV4;
+ fwd_lcores[lc_id]->gso_ctx.gso_size = ETHER_MAX_LEN;
}

/* Configuration of packet forwarding streams. */
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index c9d7739..3697d3f 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -36,6 +36,7 @@

#include <rte_pci.h>
#include <rte_gro.h>
+#include <rte_gso.h>

#define RTE_PORT_ALL (~(portid_t)0x0)

@@ -205,6 +206,7 @@ struct rte_port {
* CPU id. configuration table.
*/
struct fwd_lcore {
+ struct rte_gso_ctx gso_ctx; /**< GSO context */
struct rte_mempool *mbp; /**< The mbuf pool to use by this core */
streamid_t stream_idx; /**< index of 1st stream in "fwd_streams" */
streamid_t stream_nb; /**< number of streams in "fwd_streams" */
@@ -442,6 +444,13 @@ struct gro_status {
};
extern struct gro_status gro_ports[RTE_MAX_ETHPORTS];

+#define GSO_MAX_PKT_BURST 2048
+struct gso_status {
+ uint16_t gso_segsz;
+ uint8_t enable;
+};
+extern struct gso_status gso_ports[RTE_MAX_ETHPORTS];
+
static inline unsigned int
lcore_num(void)
{
@@ -641,6 +650,7 @@ void get_5tuple_filter(uint8_t port_id, uint16_t index);
int rx_queue_id_is_invalid(queueid_t rxq_id);
int tx_queue_id_is_invalid(queueid_t txq_id);
void setup_gro(const char *mode, uint8_t port_id);
+void setup_gso(const char *mode, uint8_t port_id);

/* Functions to manage the set of filtered Multicast MAC addresses */
void mcast_addr_add(uint8_t port_id, struct ether_addr *mc_addr);
--
2.7.4
Ananyev, Konstantin
2017-08-30 01:37:42 UTC
Hi Jiayu,
A few questions/comments from me below and in the next few mails.
Thanks
Konstantin
Post by Jiayu Hu
Generic Segmentation Offload (GSO) is a SW technique to split large
packets into small ones. Akin to TSO, GSO enables applications to
operate on large packets, thus reducing per-packet processing overhead.
To enable more flexibility to applications, DPDK GSO is implemented
as a standalone library. Applications explicitly use the GSO library
to segment packets. This patch adds GSO support to DPDK for specific
packet types: specifically, TCP/IPv4, VxLAN, and GRE.
The first patch introduces the GSO API framework. The second patch
adds GSO support for TCP/IPv4 packets (containing an optional VLAN
tag). The third patch adds GSO support for VxLAN packets that contain
outer IPv4, and inner TCP/IPv4 headers (plus optional inner and/or
outer VLAN tags). The fourth patch adds GSO support for GRE packets
that contain outer IPv4, and inner TCP/IPv4 headers (with optional
outer VLAN tag). The last patch in the series enables TCP/IPv4, VxLAN,
and GRE GSO in testpmd's checksum forwarding engine.
The performance of TCP/IPv4 GSO on a 10Gbps link is demonstrated using
a. Connect 2 x 10Gbps physical ports (P0, P1), together physically.
b. Launch testpmd with P0 and a vhost-user port, and use csum
forwarding engine.
c. Select IP and TCP HW checksum calculation for P0; select TCP HW
checksum calculation for vhost-user port.
d. Launch a VM with csum and tso offloading enabled.
e. Run iperf-client on virtio-net port in the VM to send TCP packets.
Not sure I understand the setup correctly:
So testpmd forwards packets between P0 and vhost-user port, right?
And who uses P1? iperf-server over linux kernel?
Also is P1 on another box or not?
Post by Jiayu Hu
With GSO enabled for P0 in testpmd, observed iperf throughput is ~9Gbps.
Ok, and if GSO is disabled what is the throughput?
Another stupid question: if P0 is a physical 10G port (ixgbe?), we can just enable TSO on it, right?
If so, what would be the TSO numbers here?

In fact, could you probably explain a bit more what is supposed to be the main usage model for that library?
Is that to perform segmentation on (virtual) devices that don't support HW TSO, or ...?
Again, would it be for a termination point (packets were just formed and filled) by the caller,
or is that for a box in the middle which just forwards packets between nodes?
If the latter, then we'll probably already have most of our packets segmented properly, no?
Post by Jiayu Hu
The experimental data of VxLAN and GRE will be shown later.
lib: add Generic Segmentation Offload API framework
gso/lib: add TCP/IPv4 GSO support
app/testpmd: enable TCP/IPv4, VxLAN and GRE GSO
lib/gso: add VxLAN GSO support
lib/gso: add GRE GSO support
app/test-pmd/cmdline.c | 121 +++++++++
app/test-pmd/config.c | 25 ++
app/test-pmd/csumonly.c | 68 ++++-
app/test-pmd/testpmd.c | 9 +
app/test-pmd/testpmd.h | 10 +
config/common_base | 5 +
lib/Makefile | 2 +
lib/librte_eal/common/include/rte_log.h | 1 +
lib/librte_gso/Makefile | 52 ++++
lib/librte_gso/gso_common.c | 431 ++++++++++++++++++++++++++++++++
lib/librte_gso/gso_common.h | 180 +++++++++++++
lib/librte_gso/gso_tcp.c | 82 ++++++
lib/librte_gso/gso_tcp.h | 73 ++++++
lib/librte_gso/gso_tunnel.c | 62 +++++
lib/librte_gso/gso_tunnel.h | 46 ++++
lib/librte_gso/rte_gso.c | 100 ++++++++
lib/librte_gso/rte_gso.h | 122 +++++++++
lib/librte_gso/rte_gso_version.map | 7 +
mk/rte.app.mk | 1 +
19 files changed, 1392 insertions(+), 5 deletions(-)
create mode 100644 lib/librte_gso/Makefile
create mode 100644 lib/librte_gso/gso_common.c
create mode 100644 lib/librte_gso/gso_common.h
create mode 100644 lib/librte_gso/gso_tcp.c
create mode 100644 lib/librte_gso/gso_tcp.h
create mode 100644 lib/librte_gso/gso_tunnel.c
create mode 100644 lib/librte_gso/gso_tunnel.h
create mode 100644 lib/librte_gso/rte_gso.c
create mode 100644 lib/librte_gso/rte_gso.h
create mode 100644 lib/librte_gso/rte_gso_version.map
--
2.7.4
Jiayu Hu
2017-08-30 07:36:56 UTC
Hi Konstantin,

Thanks for your suggestions. Feedbacks are inline.

Thanks,
Jiayu
Post by Ananyev, Konstantin
Hi Jiayu,
A few questions/comments from me below and in the next few mails.
Thanks
Konstantin
Post by Jiayu Hu
Generic Segmentation Offload (GSO) is a SW technique to split large
packets into small ones. Akin to TSO, GSO enables applications to
operate on large packets, thus reducing per-packet processing overhead.
To enable more flexibility to applications, DPDK GSO is implemented
as a standalone library. Applications explicitly use the GSO library
to segment packets. This patch adds GSO support to DPDK for specific
packet types: specifically, TCP/IPv4, VxLAN, and GRE.
The first patch introduces the GSO API framework. The second patch
adds GSO support for TCP/IPv4 packets (containing an optional VLAN
tag). The third patch adds GSO support for VxLAN packets that contain
outer IPv4, and inner TCP/IPv4 headers (plus optional inner and/or
outer VLAN tags). The fourth patch adds GSO support for GRE packets
that contain outer IPv4, and inner TCP/IPv4 headers (with optional
outer VLAN tag). The last patch in the series enables TCP/IPv4, VxLAN,
and GRE GSO in testpmd's checksum forwarding engine.
The performance of TCP/IPv4 GSO on a 10Gbps link is demonstrated using
a. Connect 2 x 10Gbps physical ports (P0, P1), together physically.
b. Launch testpmd with P0 and a vhost-user port, and use csum
forwarding engine.
c. Select IP and TCP HW checksum calculation for P0; select TCP HW
checksum calculation for vhost-user port.
d. Launch a VM with csum and tso offloading enabled.
e. Run iperf-client on virtio-net port in the VM to send TCP packets.
So testpmd forwards packets between P0 and vhost-user port, right?
Yes.
Post by Ananyev, Konstantin
And who uses P1? iperf-server over linux kernel?
P1 is owned by the Linux kernel.
Post by Ananyev, Konstantin
Also is P1 on another box or not?
P0 and P1 are in the same machine and are connected physically.
Post by Ananyev, Konstantin
Post by Jiayu Hu
With GSO enabled for P0 in testpmd, observed iperf throughput is ~9Gbps.
Ok, and if GSO is disabled what is the throughput?
Another stupid question: if P0 is a physical 10G port (ixgbe?), we can just enable TSO on it, right?
If so, what would be the TSO numbers here?
Here is more detailed information about the experiments:

test1: only enable GSO for p0, GSO size is 1518, use two iperf-clients (i.e. "-P 2")
test2: only enable TSO for p0, TSO size is 1518, use two iperf-clients
test3: disable TSO and GSO, use two iperf-clients

test1 throughput: 8.6Gbps
test2 throughput: 9.5Gbps
test3 throughput: 3Mbps
Post by Ananyev, Konstantin
In fact, could you probably explain a bit more what is supposed to be the main usage model for that library?
The GSO library is just a SW segmentation method, which can be used by applications, like OVS.
Currently, most NICs support segmenting TCP and UDP packets, but not all of them do. So current
OVS doesn't enable TSO, for lack of a SW segmentation fallback. Besides, the protocol types
supported by HW segmentation are limited. So it's necessary to provide a SW segmentation solution.

With the GSO library, OVS and other applications are able to receive large packets from VMs and
process these large packets, instead of standard ones (i.e. 1518B). So the per-packet overhead is
reduced, since the number of packets that need processing is much smaller.
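
To illustrate the intended calling pattern (just a sketch, not OVS
code; GSO_BURST and the surrounding names are made up, and error
handling is omitted):

#include <rte_ethdev.h>
#include <rte_gso.h>

#define GSO_BURST 64	/* hypothetical size of the output array */

/* Segment one oversized packet in SW and transmit the GSO segments on
 * a port without HW TSO. Assumes rte_gso_segment() returns the number
 * of packets stored in segs (1 if the input was left unsegmented), or
 * a negative value on invalid parameters. */
static void
tx_with_sw_gso(uint8_t port, uint16_t queue, struct rte_gso_ctx ctx,
		struct rte_mbuf *pkt)
{
	struct rte_mbuf *segs[GSO_BURST];
	int n = rte_gso_segment(pkt, ctx, segs, GSO_BURST);

	if (n >= 1)
		rte_eth_tx_burst(port, queue, segs, (uint16_t)n);
	/* n < 0: pkt is unchanged and still owned by the caller. */
}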
Post by Ananyev, Konstantin
Is that to perform segmentation on (virtual) devices that don't support HW TSO, or ...?
When qemu is launched with TSO or GSO enabled, the virtual device doesn't really do segmentation.
It directly sends large packets. Therefore, testpmd can receive large packets from the VM and
then perform GSO. The GSO/TSO behavior of virtual devices differs from that of physical NICs.
Post by Ananyev, Konstantin
Again, would it be for a termination point (packets were just formed and filled) by the caller,
or is that for a box in the middle which just forwards packets between nodes?
If the latter, then we'll probably already have most of our packets segmented properly, no?
Post by Jiayu Hu
The experimental data of VxLAN and GRE will be shown later.
lib: add Generic Segmentation Offload API framework
gso/lib: add TCP/IPv4 GSO support
app/testpmd: enable TCP/IPv4, VxLAN and GRE GSO
lib/gso: add VxLAN GSO support
lib/gso: add GRE GSO support
app/test-pmd/cmdline.c | 121 +++++++++
app/test-pmd/config.c | 25 ++
app/test-pmd/csumonly.c | 68 ++++-
app/test-pmd/testpmd.c | 9 +
app/test-pmd/testpmd.h | 10 +
config/common_base | 5 +
lib/Makefile | 2 +
lib/librte_eal/common/include/rte_log.h | 1 +
lib/librte_gso/Makefile | 52 ++++
lib/librte_gso/gso_common.c | 431 ++++++++++++++++++++++++++++++++
lib/librte_gso/gso_common.h | 180 +++++++++++++
lib/librte_gso/gso_tcp.c | 82 ++++++
lib/librte_gso/gso_tcp.h | 73 ++++++
lib/librte_gso/gso_tunnel.c | 62 +++++
lib/librte_gso/gso_tunnel.h | 46 ++++
lib/librte_gso/rte_gso.c | 100 ++++++++
lib/librte_gso/rte_gso.h | 122 +++++++++
lib/librte_gso/rte_gso_version.map | 7 +
mk/rte.app.mk | 1 +
19 files changed, 1392 insertions(+), 5 deletions(-)
create mode 100644 lib/librte_gso/Makefile
create mode 100644 lib/librte_gso/gso_common.c
create mode 100644 lib/librte_gso/gso_common.h
create mode 100644 lib/librte_gso/gso_tcp.c
create mode 100644 lib/librte_gso/gso_tcp.h
create mode 100644 lib/librte_gso/gso_tunnel.c
create mode 100644 lib/librte_gso/gso_tunnel.h
create mode 100644 lib/librte_gso/rte_gso.c
create mode 100644 lib/librte_gso/rte_gso.h
create mode 100644 lib/librte_gso/rte_gso_version.map
--
2.7.4
Ananyev, Konstantin
2017-08-30 10:49:15 UTC
-----Original Message-----
From: Hu, Jiayu
Sent: Wednesday, August 30, 2017 8:37 AM
Subject: Re: [PATCH 0/5] Support TCP/IPv4, VxLAN and GRE GSO in DPDK
Hi Konstantin,
Thanks for your suggestions. Feedbacks are inline.
Thanks,
Jiayu
Post by Ananyev, Konstantin
Hi Jiayu,
A few questions/comments from me below and in the next few mails.
Thanks
Konstantin
Post by Jiayu Hu
Generic Segmentation Offload (GSO) is a SW technique to split large
packets into small ones. Akin to TSO, GSO enables applications to
operate on large packets, thus reducing per-packet processing overhead.
To enable more flexibility to applications, DPDK GSO is implemented
as a standalone library. Applications explicitly use the GSO library
to segment packets. This patch adds GSO support to DPDK for specific
packet types: specifically, TCP/IPv4, VxLAN, and GRE.
The first patch introduces the GSO API framework. The second patch
adds GSO support for TCP/IPv4 packets (containing an optional VLAN
tag). The third patch adds GSO support for VxLAN packets that contain
outer IPv4, and inner TCP/IPv4 headers (plus optional inner and/or
outer VLAN tags). The fourth patch adds GSO support for GRE packets
that contain outer IPv4, and inner TCP/IPv4 headers (with optional
outer VLAN tag). The last patch in the series enables TCP/IPv4, VxLAN,
and GRE GSO in testpmd's checksum forwarding engine.
The performance of TCP/IPv4 GSO on a 10Gbps link is demonstrated using
a. Connect 2 x 10Gbps physical ports (P0, P1), together physically.
b. Launch testpmd with P0 and a vhost-user port, and use csum
forwarding engine.
c. Select IP and TCP HW checksum calculation for P0; select TCP HW
checksum calculation for vhost-user port.
d. Launch a VM with csum and tso offloading enabled.
e. Run iperf-client on virtio-net port in the VM to send TCP packets.
So testpmd forwards packets between P0 and vhost-user port, right?
Yes.
Post by Ananyev, Konstantin
And who uses P1? iperf-server over linux kernel?
P1 is owned by the Linux kernel.
Post by Ananyev, Konstantin
Also is P1 on another box or not?
P0 and P1 are in the same machine and are connected physically.
Post by Ananyev, Konstantin
Post by Jiayu Hu
With GSO enabled for P0 in testpmd, observed iperf throughput is ~9Gbps.
Ok, and if GSO is disabled what is the throughput?
Another stupid question: if P0 is physical 10G (ixgbe?) we can just enable a TSO on it, right?
If so, what would be the TSO numbers here?
test1: only enable GSO for p0, GSO size is 1518, use two iperf-clients (i.e. "-P 2")
test2: only enable TSO for p0, TSO size is 1518, use two iperf-clients
test3: disable TSO and GSO, use two iperf-clients
test1 throughput: 8.6Gbps
test2 throughput: 9.5Gbps
test3 throughput: 3Mbps
Ok, thanks for the detailed explanation.
I'd suggest you put it into the next version's cover letter.
Post by Ananyev, Konstantin
In fact, could you probably explain a bit more, what supposed to be a main usage model for that library?
The GSO library is just a SW segmentation method, which can be used by applications, like OVS.
Currently, most of NICs supports to segment TCP and UDP packets, but not for all NICs. So current
OVS doesn't enable TSO, as a result of lacking a SW segmentation fallback. Besides, the protocol
types in HW segmentation are limited. So it's necessary to provide a SW segmentation solution.
With the GSO library, OVS and other applications are able to receive large packets from VMs and
process these large packets, instead of standard ones (i.e. 1518B). So the per-packet overhead is
reduced, since the number of packets needed processing is much fewer.
Ok, just out of curiosity, what is the size of the packets coming from the VM?
Konstantin
Post by Ananyev, Konstantin
Is that to perform segmentation on (virtual) devices that doesn't support HW TSO or ...?
When launch qemu with enabling TSO or GSO, the virtual device doesn't really do segmentation.
It directly sends large packets. Therefore, testpmd can receive large packets from the VM and
then perform GSO. The GSO/TSO behavior of virtual devices is different from physical NICs.
Post by Ananyev, Konstantin
Again would it be for a termination point (packets were just formed and filled) by the caller,
or is that for box in the middle which just forwards packets between nodes?
If the later one, then we'll probably already have most of our packets segmented properly, no?
Post by Jiayu Hu
The experimental data of VxLAN and GRE will be shown later.
lib: add Generic Segmentation Offload API framework
gso/lib: add TCP/IPv4 GSO support
app/testpmd: enable TCP/IPv4, VxLAN and GRE GSO
lib/gso: add VxLAN GSO support
lib/gso: add GRE GSO support
app/test-pmd/cmdline.c | 121 +++++++++
app/test-pmd/config.c | 25 ++
app/test-pmd/csumonly.c | 68 ++++-
app/test-pmd/testpmd.c | 9 +
app/test-pmd/testpmd.h | 10 +
config/common_base | 5 +
lib/Makefile | 2 +
lib/librte_eal/common/include/rte_log.h | 1 +
lib/librte_gso/Makefile | 52 ++++
lib/librte_gso/gso_common.c | 431 ++++++++++++++++++++++++++++++++
lib/librte_gso/gso_common.h | 180 +++++++++++++
lib/librte_gso/gso_tcp.c | 82 ++++++
lib/librte_gso/gso_tcp.h | 73 ++++++
lib/librte_gso/gso_tunnel.c | 62 +++++
lib/librte_gso/gso_tunnel.h | 46 ++++
lib/librte_gso/rte_gso.c | 100 ++++++++
lib/librte_gso/rte_gso.h | 122 +++++++++
lib/librte_gso/rte_gso_version.map | 7 +
mk/rte.app.mk | 1 +
19 files changed, 1392 insertions(+), 5 deletions(-)
create mode 100644 lib/librte_gso/Makefile
create mode 100644 lib/librte_gso/gso_common.c
create mode 100644 lib/librte_gso/gso_common.h
create mode 100644 lib/librte_gso/gso_tcp.c
create mode 100644 lib/librte_gso/gso_tcp.h
create mode 100644 lib/librte_gso/gso_tunnel.c
create mode 100644 lib/librte_gso/gso_tunnel.h
create mode 100644 lib/librte_gso/rte_gso.c
create mode 100644 lib/librte_gso/rte_gso.h
create mode 100644 lib/librte_gso/rte_gso_version.map
--
2.7.4
Kavanagh, Mark B
2017-08-30 13:32:23 UTC
From: Ananyev, Konstantin
Sent: Wednesday, August 30, 2017 11:49 AM
Subject: RE: [PATCH 0/5] Support TCP/IPv4, VxLAN and GRE GSO in DPDK
-----Original Message-----
From: Hu, Jiayu
Sent: Wednesday, August 30, 2017 8:37 AM
Subject: Re: [PATCH 0/5] Support TCP/IPv4, VxLAN and GRE GSO in DPDK
Hi Konstantin,
Thanks for your suggestions. Feedbacks are inline.
Thanks,
Jiayu
Post by Ananyev, Konstantin
Hi Jiayu,
A few questions/comments from me below and in the next few mails.
Thanks
Konstantin
Post by Jiayu Hu
Generic Segmentation Offload (GSO) is a SW technique to split large
packets into small ones. Akin to TSO, GSO enables applications to
operate on large packets, thus reducing per-packet processing overhead.
To enable more flexibility to applications, DPDK GSO is implemented
as a standalone library. Applications explicitly use the GSO library
to segment packets. This patch adds GSO support to DPDK for specific
packet types: specifically, TCP/IPv4, VxLAN, and GRE.
The first patch introduces the GSO API framework. The second patch
adds GSO support for TCP/IPv4 packets (containing an optional VLAN
tag). The third patch adds GSO support for VxLAN packets that contain
outer IPv4, and inner TCP/IPv4 headers (plus optional inner and/or
outer VLAN tags). The fourth patch adds GSO support for GRE packets
that contain outer IPv4, and inner TCP/IPv4 headers (with optional
outer VLAN tag). The last patch in the series enables TCP/IPv4, VxLAN,
and GRE GSO in testpmd's checksum forwarding engine.
The performance of TCP/IPv4 GSO on a 10Gbps link is demonstrated using
a. Connect 2 x 10Gbps physical ports (P0, P1), together physically.
b. Launch testpmd with P0 and a vhost-user port, and use csum
forwarding engine.
c. Select IP and TCP HW checksum calculation for P0; select TCP HW
checksum calculation for vhost-user port.
d. Launch a VM with csum and tso offloading enabled.
e. Run iperf-client on virtio-net port in the VM to send TCP packets.
So testpmd forwards packets between P0 and vhost-user port, right?
Yes.
Post by Ananyev, Konstantin
And who uses P1? iperf-server over linux kernel?
P1 is owned by the Linux kernel.
Post by Ananyev, Konstantin
Also is P1 on another box or not?
P0 and P1 are in the same machine and are connected physically.
Post by Ananyev, Konstantin
Post by Jiayu Hu
With GSO enabled for P0 in testpmd, observed iperf throughput is ~9Gbps.
Ok, and if GSO is disabled what is the throughput?
Another stupid question: if P0 is physical 10G (ixgbe?) we can just enable
a TSO on it, right?
Post by Ananyev, Konstantin
If so, what would be the TSO numbers here?
test1: only enable GSO for p0, GSO size is 1518, use two iperf-clients (i.e.
"-P 2")
test2: only enable TSO for p0, TSO size is 1518, use two iperf-clients
test3: disable TSO and GSO, use two iperf-clients
test1 throughput: 8.6Gbps
test2 throughput: 9.5Gbps
test3 throughput: 3Mbps
Ok, thanks for the detailed explanation.
I'd suggest you put it into the next version's cover letter.
Thanks Konstantin - will do.
Post by Ananyev, Konstantin
In fact, could you probably explain a bit more, what supposed to be a main
usage model for that library?
The GSO library is just a SW segmentation method, which can be used by
applications, like OVS.
Currently, most of NICs supports to segment TCP and UDP packets, but not for
all NICs. So current
OVS doesn't enable TSO, as a result of lacking a SW segmentation fallback.
Besides, the protocol
types in HW segmentation are limited. So it's necessary to provide a SW
segmentation solution.
With the GSO library, OVS and other applications are able to receive large
packets from VMs and
process these large packets, instead of standard ones (i.e. 1518B). So the
per-packet overhead is
reduced, since the number of packets needed processing is much fewer.
Ok, just out of curiosity, what is the size of the packets coming from the VM?
Konstantin
In the case of TSO (and as a corollary, GSO), I guess that the packet size is bounded at ~64KB. In OvS, that packet is dequeued using the rte_vhost_dequeue_burst API, and stored in an mbuf chain. The data capacity of mbufs in OvS is user-defined, up to a limit of 9728B.
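(For instance, a ~64KB packet would span roughly 32 chained mbufs with 2KB data rooms, or about 7 mbufs at the 9728B maximum.)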

Thanks,
Mark
Post by Ananyev, Konstantin
Is that to perform segmentation on (virtual) devices that doesn't support
HW TSO or ...?
When launch qemu with enabling TSO or GSO, the virtual device doesn't really
do segmentation.
It directly sends large packets. Therefore, testpmd can receive large
packets from the VM and
then perform GSO. The GSO/TSO behavior of virtual devices is different from
physical NICs.
Post by Ananyev, Konstantin
Again would it be for a termination point (packets were just formed and
filled) by the caller,
Post by Ananyev, Konstantin
or is that for box in the middle which just forwards packets between
nodes?
Post by Ananyev, Konstantin
If the later one, then we'll probably already have most of our packets
segmented properly, no?
Post by Ananyev, Konstantin
Post by Jiayu Hu
The experimental data of VxLAN and GRE will be shown later.
lib: add Generic Segmentation Offload API framework
gso/lib: add TCP/IPv4 GSO support
app/testpmd: enable TCP/IPv4, VxLAN and GRE GSO
lib/gso: add VxLAN GSO support
lib/gso: add GRE GSO support
app/test-pmd/cmdline.c | 121 +++++++++
app/test-pmd/config.c | 25 ++
app/test-pmd/csumonly.c | 68 ++++-
app/test-pmd/testpmd.c | 9 +
app/test-pmd/testpmd.h | 10 +
config/common_base | 5 +
lib/Makefile | 2 +
lib/librte_eal/common/include/rte_log.h | 1 +
lib/librte_gso/Makefile | 52 ++++
lib/librte_gso/gso_common.c | 431
++++++++++++++++++++++++++++++++
Post by Ananyev, Konstantin
Post by Jiayu Hu
lib/librte_gso/gso_common.h | 180 +++++++++++++
lib/librte_gso/gso_tcp.c | 82 ++++++
lib/librte_gso/gso_tcp.h | 73 ++++++
lib/librte_gso/gso_tunnel.c | 62 +++++
lib/librte_gso/gso_tunnel.h | 46 ++++
lib/librte_gso/rte_gso.c | 100 ++++++++
lib/librte_gso/rte_gso.h | 122 +++++++++
lib/librte_gso/rte_gso_version.map | 7 +
mk/rte.app.mk | 1 +
19 files changed, 1392 insertions(+), 5 deletions(-)
create mode 100644 lib/librte_gso/Makefile
create mode 100644 lib/librte_gso/gso_common.c
create mode 100644 lib/librte_gso/gso_common.h
create mode 100644 lib/librte_gso/gso_tcp.c
create mode 100644 lib/librte_gso/gso_tcp.h
create mode 100644 lib/librte_gso/gso_tunnel.c
create mode 100644 lib/librte_gso/gso_tunnel.h
create mode 100644 lib/librte_gso/rte_gso.c
create mode 100644 lib/librte_gso/rte_gso.h
create mode 100644 lib/librte_gso/rte_gso_version.map
--
2.7.4
Jiayu Hu
2017-09-05 07:57:45 UTC
Generic Segmentation Offload (GSO) is a SW technique to split large
packets into small ones. Akin to TSO, GSO enables applications to
operate on large packets, thus reducing per-packet processing overhead.

To enable more flexibility to applications, DPDK GSO is implemented
as a standalone library. Applications explicitly use the GSO library
to segment packets. This patch adds GSO support to DPDK for specific
packet types: specifically, TCP/IPv4, VxLAN, and GRE.

The first patch introduces the GSO API framework. The second patch
adds GSO support for TCP/IPv4 packets (containing an optional VLAN
tag). The third patch adds GSO support for VxLAN packets that contain
outer IPv4, and inner TCP/IPv4 headers (plus optional inner and/or
outer VLAN tags). The fourth patch adds GSO support for GRE packets
that contain outer IPv4, and inner TCP/IPv4 headers (with optional
outer VLAN tag). The last patch in the series enables TCP/IPv4, VxLAN,
and GRE GSO in testpmd's checksum forwarding engine.

The performance of TCP/IPv4 GSO on a 10Gbps link is demonstrated using
iperf. Setup for the test is described as follows:

a. Connect 2 x 10Gbps physical ports (P0, P1), which are in the same
machine, together physically.
b. Launch testpmd with P0 and a vhost-user port, and use csum
forwarding engine with "retry".
c. Select IP and TCP HW checksum calculation for P0; select TCP HW
checksum calculation for vhost-user port.
d. Launch a VM with csum and tso offloading enabled.
e. Run iperf-client on virtio-net port in the VM to send TCP packets.
With csum and tso enabled, the VM can send large TCP/IPv4 packets
(mss up to 64KB).
f. P1 is assigned to the Linux kernel, with kernel GRO enabled. Run
iperf-server on P1.

We conduct three iperf tests:

test-1: enable GSO for P0 in testpmd, and set max GSO segment length
to 1518B. Run two iperf-clients in the VM.
test-2: enable TSO for P0 in testpmd, and set TSO segsz to 1518B. Run
two iperf-clients in the VM.
test-3: disable GSO and TSO in testpmd. Run two iperf-clients in the VM.

Throughput of the above three tests:

test-1: ~9Gbps
test-2: 9.5Gbps
test-3: 3Mbps

The experimental data of VxLAN and GRE will be shown later.

Change log
==========
v2:
- merge data segments whose data_len is less than mss into a large data
segment in gso_do_segment().
- use mbuf->packet_type/l2_len/l3_len etc. instead of parsing the packet
header in rte_gso_segment().
- provide IP id macros for applications to select fixed or incremental IP
ids.
- change the definition of gso_types in struct rte_gso_ctx.
- replace rte_pktmbuf_detach() with rte_pktmbuf_free().
- refactor gso_update_pkt_headers().
- change the return value of rte_gso_segment().
- remove parameter checks in rte_gso_segment().
- use rte_net_get_ptype() in app/test-pmd/csumonly.c to fill
mbuf->packet_type.
- add a new GSO command in testpmd to show GSO configuration for ports.
- misc: fix typo and optimize function description.

Jiayu Hu (3):
gso: add Generic Segmentation Offload API framework
gso: add TCP/IPv4 GSO support
app/testpmd: enable TCP/IPv4, VxLAN and GRE GSO

Mark Kavanagh (2):
gso: add VxLAN GSO support
gso: add GRE GSO support

app/test-pmd/cmdline.c | 178 ++++++++++++++++++++
app/test-pmd/config.c | 24 +++
app/test-pmd/csumonly.c | 60 ++++++-
app/test-pmd/testpmd.c | 15 ++
app/test-pmd/testpmd.h | 10 ++
config/common_base | 5 +
lib/Makefile | 2 +
lib/librte_eal/common/include/rte_log.h | 1 +
lib/librte_gso/Makefile | 52 ++++++
lib/librte_gso/gso_common.c | 281 ++++++++++++++++++++++++++++++++
lib/librte_gso/gso_common.h | 156 ++++++++++++++++++
lib/librte_gso/gso_tcp.c | 83 ++++++++++
lib/librte_gso/gso_tcp.h | 76 +++++++++
lib/librte_gso/gso_tunnel.c | 61 +++++++
lib/librte_gso/gso_tunnel.h | 75 +++++++++
lib/librte_gso/rte_gso.c | 99 +++++++++++
lib/librte_gso/rte_gso.h | 133 +++++++++++++++
lib/librte_gso/rte_gso_version.map | 7 +
mk/rte.app.mk | 1 +
19 files changed, 1315 insertions(+), 4 deletions(-)
create mode 100644 lib/librte_gso/Makefile
create mode 100644 lib/librte_gso/gso_common.c
create mode 100644 lib/librte_gso/gso_common.h
create mode 100644 lib/librte_gso/gso_tcp.c
create mode 100644 lib/librte_gso/gso_tcp.h
create mode 100644 lib/librte_gso/gso_tunnel.c
create mode 100644 lib/librte_gso/gso_tunnel.h
create mode 100644 lib/librte_gso/rte_gso.c
create mode 100644 lib/librte_gso/rte_gso.h
create mode 100644 lib/librte_gso/rte_gso_version.map
--
2.7.4
Jiayu Hu
2017-09-05 07:57:46 UTC
Generic Segmentation Offload (GSO) is a SW technique to split large
packets into small ones. Akin to TSO, GSO enables applications to
operate on large packets, thus reducing per-packet processing overhead.

To enable more flexibility to applications, DPDK GSO is implemented
as a standalone library. Applications explicitly use the GSO library
to segment packets. This patch introduces the GSO API framework to DPDK.

The GSO library provides a segmentation API, rte_gso_segment(), for
applications. It splits an input packet into small ones in each
invocation. The GSO library refers to these small packets generated
by rte_gso_segment() as GSO segments. Each of the newly-created GSO
segments is organized as a two-segment MBUF, where the first segment is a
standard MBUF, which stores a copy of the packet header, and the second is an
indirect MBUF which points to a section of data in the input packet.
rte_gso_segment() reduces the refcnt of the input packet by 1. Therefore,
when all GSO segments are freed, the input packet is freed automatically.
Additionally, since each GSO segment consists of multiple MBUFs (i.e. 2
MBUFs), the driver of the interface to which the GSO segments are sent
must support transmitting multi-segment packets.
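
A minimal sketch of how an application might drive this API (the
mempool and the 1518B segment size are illustrative assumptions, not
part of this patch):

#include <rte_gso.h>

/* Segment one TCP/IPv4 packet into at most nb_out GSO segments.
 * Returns the number of packets written to out[], 1 if the packet was
 * left unsegmented, or a negative errno value. */
static int
gso_example(struct rte_mempool *pool, struct rte_mbuf *pkt,
		struct rte_mbuf **out, uint16_t nb_out)
{
	struct rte_gso_ctx ctx = {
		.direct_pool = pool,	/* for header copies */
		.indirect_pool = pool,	/* for payload references */
		.gso_types = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4 |
			RTE_PTYPE_L4_TCP,
		.gso_size = 1518,
		.ipid_flag = RTE_GSO_IPID_INCREASE,
	};

	return rte_gso_segment(pkt, ctx, out, nb_out);
}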

Signed-off-by: Jiayu Hu <***@intel.com>
Signed-off-by: Mark Kavanagh <***@intel.com>
---
config/common_base | 5 ++
lib/Makefile | 2 +
lib/librte_gso/Makefile | 49 ++++++++++++++
lib/librte_gso/rte_gso.c | 48 +++++++++++++
lib/librte_gso/rte_gso.h | 133 +++++++++++++++++++++++++++++++++++++
lib/librte_gso/rte_gso_version.map | 7 ++
mk/rte.app.mk | 1 +
7 files changed, 245 insertions(+)
create mode 100644 lib/librte_gso/Makefile
create mode 100644 lib/librte_gso/rte_gso.c
create mode 100644 lib/librte_gso/rte_gso.h
create mode 100644 lib/librte_gso/rte_gso_version.map

diff --git a/config/common_base b/config/common_base
index 5e97a08..603e340 100644
--- a/config/common_base
+++ b/config/common_base
@@ -652,6 +652,11 @@ CONFIG_RTE_LIBRTE_IP_FRAG_TBL_STAT=n
CONFIG_RTE_LIBRTE_GRO=y

#
+# Compile GSO library
+#
+CONFIG_RTE_LIBRTE_GSO=y
+
+#
# Compile librte_meter
#
CONFIG_RTE_LIBRTE_METER=y
diff --git a/lib/Makefile b/lib/Makefile
index 86caba1..3d123f4 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -108,6 +108,8 @@ DIRS-$(CONFIG_RTE_LIBRTE_REORDER) += librte_reorder
DEPDIRS-librte_reorder := librte_eal librte_mempool librte_mbuf
DIRS-$(CONFIG_RTE_LIBRTE_PDUMP) += librte_pdump
DEPDIRS-librte_pdump := librte_eal librte_mempool librte_mbuf librte_ether
+DIRS-$(CONFIG_RTE_LIBRTE_GSO) += librte_gso
+DEPDIRS-librte_gso := librte_eal librte_mbuf librte_ether librte_net

ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y)
DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni
diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile
new file mode 100644
index 0000000..aeaacbc
--- /dev/null
+++ b/lib/librte_gso/Makefile
@@ -0,0 +1,49 @@
+# BSD LICENSE
+#
+# Copyright(c) 2017 Intel Corporation. All rights reserved.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = librte_gso.a
+
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3
+
+EXPORT_MAP := rte_gso_version.map
+
+LIBABIVER := 1
+
+#source files
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c
+
+# install this header file
+SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
new file mode 100644
index 0000000..fef6725
--- /dev/null
+++ b/lib/librte_gso/rte_gso.c
@@ -0,0 +1,48 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "rte_gso.h"
+
+int
+rte_gso_segment(struct rte_mbuf *pkt,
+ struct rte_gso_ctx gso_ctx __rte_unused,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ if (pkt == NULL || pkts_out == NULL || nb_pkts_out < 1)
+ return -EINVAL;
+
+ pkts_out[0] = pkt;
+
+ return 1;
+}
diff --git a/lib/librte_gso/rte_gso.h b/lib/librte_gso/rte_gso.h
new file mode 100644
index 0000000..eb4ac4b
--- /dev/null
+++ b/lib/librte_gso/rte_gso.h
@@ -0,0 +1,133 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_GSO_H_
+#define _RTE_GSO_H_
+
+/**
+ * @file
+ * Interface to GSO library
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+/* GSO IP id flags for the IPv4 header */
+#define RTE_GSO_IPID_FIXED (1 << 0)
+/**< Use fixed IP ids for output GSO segments */
+#define RTE_GSO_IPID_INCREASE (1 << 1)
+/**< Use incremental IP ids for output GSO segments */
+
+/**
+ * GSO context structure.
+ */
+struct rte_gso_ctx {
+ struct rte_mempool *direct_pool;
+ /**< MBUF pool for allocating direct buffers, which are used
+ * to store packet headers for GSO segments.
+ */
+ struct rte_mempool *indirect_pool;
+ /**< MBUF pool for allocating indirect buffers, which are used
+ * to locate packet payloads for GSO segments. The indirect
+ * buffer doesn't contain any data, but simply points to an
+ * offset within the packet to segment.
+ */
+ uint32_t gso_types;
+ /**< packet types to perform GSO. For example, if applications
+ * want to segment TCP/IPv4 packets, set gso_types to
+ * (RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP).
+ */
+ uint16_t gso_size;
+ /**< maximum size of an output GSO segment, including packet
+ * header and payload, measured in bytes.
+ */
+ uint8_t ipid_flag;
+ /**< flag to indicate GSO uses fixed or incremental IP ids for
+ * IPv4 headers of output GSO segments.
+ */
+};
+
+/**
+ * Segmentation function, which supports processing of both single- and
+ * multi-segment packets.
+ *
+ * Note that we refer to the packets that are segmented from the input
+ * packet as 'GSO segments'. rte_gso_segment() assumes the input packet
+ * has correct checksums, and it doesn't update checksums for output
+ * GSO segments. Additionally, it doesn't process IP fragmented packets.
+ *
+ * Each of the newly-created GSO segments is organized as a two-segment
+ * MBUF, where the first segment is a standard MBUF, which stores a copy
+ * of packet header, and the second is an indirect MBUF which points to
+ * a section of data in the input packet. Since each GSO segment has
+ * multiple MBUFs (i.e. 2 MBUFs), the driver of the interface to which
+ * the GSO segments are sent must support the transmission of
+ * multi-segment packets.
+ *
+ * If the input packet is GSOed, its mbuf refcnt is reduced by 1. Therefore,
+ * when all GSO segments are freed, the input packet is freed automatically.
+ *
+ * If the space in pkts_out or the MBUF pools is insufficient, this
+ * function fails, and it returns (-1) * errno. Otherwise, GSO succeeds,
+ * and this function returns the number of output GSO segments filled in
+ * pkts_out.
+ *
+ * @param pkt
+ * The packet mbuf to segment.
+ * @param ctx
+ * GSO context object.
+ * @param pkts_out
+ * Pointer array used to store the MBUF addresses of output GSO
+ * segments, when rte_gso_segment() succeeds.
+ * @param nb_pkts_out
+ * The max number of items that pkts_out can hold.
+ *
+ * @return
+ * - The number of GSO segments filled in pkts_out on success.
+ * - Return -ENOMEM if the MBUF pools run out of memory.
+ * - Return -EINVAL for invalid parameters.
+ */
+int rte_gso_segment(struct rte_mbuf *pkt,
+ struct rte_gso_ctx ctx,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_GSO_H_ */
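
For illustration, a minimal sketch of how an application might drive this
API follows. The helper name, pool handles, burst size and gso_size are
hypothetical and error handling is trimmed; the pools would be created once
at init time (e.g. with rte_pktmbuf_pool_create(), where the indirect pool
may use a 0B data room):

#include <rte_gso.h>
#include <rte_mbuf.h>
#include <rte_ethdev.h>

/* Hypothetical helper: segment one packet and transmit the results. */
static void
gso_and_tx(struct rte_mbuf *pkt, struct rte_mempool *direct_pool,
	   struct rte_mempool *indirect_pool, uint8_t port_id)
{
	struct rte_mbuf *segs[64];	/* illustrative burst size */
	struct rte_gso_ctx ctx = {
		.direct_pool = direct_pool,
		.indirect_pool = indirect_pool,
		.gso_types = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4 |
			     RTE_PTYPE_L4_TCP,
		.gso_size = 1518,
		.ipid_flag = RTE_GSO_IPID_INCREASE,
	};
	int nb = rte_gso_segment(pkt, ctx, segs, 64);

	if (nb > 0)	/* nb == 1 means pkt was passed through unchanged */
		rte_eth_tx_burst(port_id, 0, segs, nb);
	else
		rte_pktmbuf_free(pkt);	/* segmentation failed */
}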
diff --git a/lib/librte_gso/rte_gso_version.map b/lib/librte_gso/rte_gso_version.map
new file mode 100644
index 0000000..e1fd453
--- /dev/null
+++ b/lib/librte_gso/rte_gso_version.map
@@ -0,0 +1,7 @@
+DPDK_17.11 {
+ global:
+
+ rte_gso_segment;
+
+ local: *;
+};
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index c25fdd9..d4c9873 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -66,6 +66,7 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_PDUMP) += -lrte_pdump
_LDLIBS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) += -lrte_distributor
_LDLIBS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += -lrte_ip_frag
_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO) += -lrte_gro
+_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO) += -lrte_gso
_LDLIBS-$(CONFIG_RTE_LIBRTE_METER) += -lrte_meter
_LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED) += -lrte_sched
_LDLIBS-$(CONFIG_RTE_LIBRTE_LPM) += -lrte_lpm
--
2.7.4
Jiayu Hu
2017-09-05 07:57:47 UTC
Permalink
This patch adds GSO support for TCP/IPv4 packets. Supported packets
may include a single VLAN tag. TCP/IPv4 GSO assumes that all input
packets have correct checksums, and doesn't update checksums for output
packets (the responsibility for this lies with the application).
Additionally, TCP/IPv4 GSO doesn't process IP fragmented packets.

TCP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one indirect
MBUF, to organize an output packet. Note that we refer to these two
chained MBUFs as a two-segment MBUF. The direct MBUF stores the packet
header, while the indirect mbuf simply points to a location within the
original packet's payload. Consequently, use of the GSO library requires
multi-segment MBUF support in the TX functions of the NIC driver.

If a packet is GSOed, TCP/IPv4 GSO reduces its MBUF refcnt by 1. As a
result, when all of its GSOed segments are freed, the packet is freed
automatically.
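
To make the two-segment organization concrete, here is a simplified sketch
of how one output segment is assembled; it mirrors gso_do_segment() below,
with illustrative parameter names and simplified error handling, and
assumes a single-segment input packet:

#include <rte_mbuf.h>
#include <rte_memcpy.h>

/* Sketch: build one [header copy | payload slice] GSO segment. */
static struct rte_mbuf *
build_gso_seg(struct rte_mbuf *pkt, uint16_t hdr_len, uint16_t payload_off,
	      uint16_t payload_len, struct rte_mempool *direct_pool,
	      struct rte_mempool *indirect_pool)
{
	struct rte_mbuf *hdr = rte_pktmbuf_alloc(direct_pool);
	struct rte_mbuf *pyld = rte_pktmbuf_alloc(indirect_pool);

	if (hdr == NULL || pyld == NULL) {
		rte_pktmbuf_free(hdr);	/* free(NULL) is a no-op */
		rte_pktmbuf_free(pyld);
		return NULL;		/* real code reports -ENOMEM */
	}

	/* direct mbuf: private copy of the packet headers */
	rte_memcpy(rte_pktmbuf_mtod(hdr, char *),
		   rte_pktmbuf_mtod(pkt, char *), hdr_len);

	/* indirect mbuf: zero-copy view of pkt's payload; bumps its refcnt */
	rte_pktmbuf_attach(pyld, pkt);
	pyld->data_off = pkt->data_off + payload_off;
	pyld->data_len = payload_len;

	/* chain the two mbufs into one output packet */
	hdr->next = pyld;
	hdr->nb_segs = 2;
	hdr->data_len = hdr_len;
	hdr->pkt_len = hdr_len + payload_len;
	return hdr;
}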

Signed-off-by: Jiayu Hu <***@intel.com>
Signed-off-by: Mark Kavanagh <***@intel.com>
---
lib/librte_eal/common/include/rte_log.h | 1 +
lib/librte_gso/Makefile | 2 +
lib/librte_gso/gso_common.c | 207 ++++++++++++++++++++++++++++++++
lib/librte_gso/gso_common.h | 107 +++++++++++++++++
lib/librte_gso/gso_tcp.c | 83 +++++++++++++
lib/librte_gso/gso_tcp.h | 76 ++++++++++++
lib/librte_gso/rte_gso.c | 46 ++++++-
7 files changed, 519 insertions(+), 3 deletions(-)
create mode 100644 lib/librte_gso/gso_common.c
create mode 100644 lib/librte_gso/gso_common.h
create mode 100644 lib/librte_gso/gso_tcp.c
create mode 100644 lib/librte_gso/gso_tcp.h

diff --git a/lib/librte_eal/common/include/rte_log.h b/lib/librte_eal/common/include/rte_log.h
index ec8dba7..2fa1199 100644
--- a/lib/librte_eal/common/include/rte_log.h
+++ b/lib/librte_eal/common/include/rte_log.h
@@ -87,6 +87,7 @@ extern struct rte_logs rte_logs;
#define RTE_LOGTYPE_CRYPTODEV 17 /**< Log related to cryptodev. */
#define RTE_LOGTYPE_EFD 18 /**< Log related to EFD. */
#define RTE_LOGTYPE_EVENTDEV 19 /**< Log related to eventdev. */
+#define RTE_LOGTYPE_GSO 20 /**< Log related to GSO. */

/* these log types can be used in an application */
#define RTE_LOGTYPE_USER1 24 /**< User-defined log type 1. */
diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile
index aeaacbc..0f8e38f 100644
--- a/lib/librte_gso/Makefile
+++ b/lib/librte_gso/Makefile
@@ -42,6 +42,8 @@ LIBABIVER := 1

#source files
SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_common.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tcp.c

# install this header file
SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h
diff --git a/lib/librte_gso/gso_common.c b/lib/librte_gso/gso_common.c
new file mode 100644
index 0000000..4d4c3fd
--- /dev/null
+++ b/lib/librte_gso/gso_common.c
@@ -0,0 +1,207 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdbool.h>
+#include <string.h>
+
+#include <rte_malloc.h>
+
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+
+#include "gso_common.h"
+
+static inline void
+hdr_segment_init(struct rte_mbuf *hdr_segment, struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset)
+{
+ /* copy mbuf metadata */
+ hdr_segment->nb_segs = 1;
+ hdr_segment->port = pkt->port;
+ hdr_segment->ol_flags = pkt->ol_flags;
+ hdr_segment->packet_type = pkt->packet_type;
+ hdr_segment->pkt_len = pkt_hdr_offset;
+ hdr_segment->data_len = pkt_hdr_offset;
+ hdr_segment->tx_offload = pkt->tx_offload;
+
+ /* copy packet header */
+ rte_memcpy(rte_pktmbuf_mtod(hdr_segment, char *),
+ rte_pktmbuf_mtod(pkt, char *),
+ pkt_hdr_offset);
+}
+
+static inline void
+free_gso_segment(struct rte_mbuf **pkts, uint16_t nb_pkts)
+{
+ uint16_t i;
+
+ for (i = 0; i < nb_pkts; i++)
+ rte_pktmbuf_free(pkts[i]);
+}
+
+int
+gso_do_segment(struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset,
+ uint16_t pyld_unit_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ struct rte_mbuf *pkt_in;
+ struct rte_mbuf *hdr_segment, *pyld_segment, *prev_segment;
+ uint16_t pkt_in_data_pos, segment_bytes_remaining;
+ uint16_t pyld_len, nb_segs;
+ bool more_in_pkt, more_out_segs;
+
+ pkt_in = pkt;
+ nb_segs = 0;
+ more_in_pkt = 1;
+ pkt_in_data_pos = pkt_hdr_offset;
+
+ while (more_in_pkt) {
+ if (unlikely(nb_segs >= nb_pkts_out)) {
+ free_gso_segment(pkts_out, nb_segs);
+ return -EINVAL;
+ }
+
+ /* allocate direct mbuf */
+ hdr_segment = rte_pktmbuf_alloc(direct_pool);
+ if (unlikely(hdr_segment == NULL)) {
+ free_gso_segment(pkts_out, nb_segs);
+ return -ENOMEM;
+ }
+ /* fill packet header */
+ hdr_segment_init(hdr_segment, pkt, pkt_hdr_offset);
+
+ prev_segment = hdr_segment;
+ segment_bytes_remaining = pyld_unit_size;
+ more_out_segs = 1;
+
+ while (more_out_segs && more_in_pkt) {
+ /* allocate indirect MBUF */
+ pyld_segment = rte_pktmbuf_alloc(indirect_pool);
+ if (unlikely(pyld_segment == NULL)) {
+ rte_pktmbuf_free(hdr_segment);
+ free_gso_segment(pkts_out, nb_segs);
+ return -ENOMEM;
+ }
+ /* attach to current MBUF segment of pkt */
+ rte_pktmbuf_attach(pyld_segment, pkt_in);
+
+ prev_segment->next = pyld_segment;
+ prev_segment = pyld_segment;
+
+ pyld_len = segment_bytes_remaining;
+ if (pyld_len + pkt_in_data_pos > pkt_in->data_len)
+ pyld_len = pkt_in->data_len - pkt_in_data_pos;
+
+ pyld_segment->data_off = pkt_in_data_pos +
+ pkt_in->data_off;
+ pyld_segment->data_len = pyld_len;
+
+ /* update header segment */
+ hdr_segment->pkt_len += pyld_len;
+ hdr_segment->nb_segs++;
+
+ pkt_in_data_pos += pyld_len;
+ segment_bytes_remaining -= pyld_len;
+
+ /* finish processing a MBUF segment of pkt */
+ if (pkt_in_data_pos == pkt_in->data_len) {
+ pkt_in = pkt_in->next;
+ pkt_in_data_pos = 0;
+ if (pkt_in == NULL)
+ more_in_pkt = 0;
+ }
+
+ /* finish generating a GSO segment */
+ if (segment_bytes_remaining == 0)
+ more_out_segs = 0;
+ }
+ pkts_out[nb_segs++] = hdr_segment;
+ }
+ return nb_segs;
+}
+
+static inline void
+update_inner_tcp4_header(struct rte_mbuf *pkt, uint8_t ipid_delta,
+ struct rte_mbuf **segs, uint16_t nb_segs)
+{
+ struct tcp_hdr *tcp_hdr;
+ struct ipv4_hdr *ipv4_hdr;
+ struct rte_mbuf *seg;
+ uint32_t sent_seq;
+ uint16_t inner_l2_offset;
+ uint16_t id, i;
+
+ inner_l2_offset = pkt->outer_l2_len + pkt->outer_l3_len + pkt->l2_len;
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ inner_l2_offset);
+ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
+ id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
+ sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
+
+ for (i = 0; i < nb_segs; i++) {
+ seg = segs[i];
+ /* update inner IPv4 header */
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(seg, char *) +
+ inner_l2_offset);
+ ipv4_hdr->total_length = rte_cpu_to_be_16(seg->pkt_len -
+ inner_l2_offset);
+ ipv4_hdr->packet_id = rte_cpu_to_be_16(id);
+ id += ipid_delta;
+
+ /* update inner TCP header */
+ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + seg->l3_len);
+ tcp_hdr->sent_seq = rte_cpu_to_be_32(sent_seq);
+ if (likely(i < nb_segs - 1))
+ tcp_hdr->tcp_flags &= (~(TCP_HDR_PSH_MASK |
+ TCP_HDR_FIN_MASK));
+
+ sent_seq += (seg->pkt_len - seg->data_len);
+ }
+}
+
+void
+gso_update_pkt_headers(struct rte_mbuf *pkt, uint8_t ipid_delta,
+ struct rte_mbuf **segs, uint16_t nb_segs)
+{
+ switch (pkt->packet_type) {
+ case ETHER_VLAN_IPv4_TCP_PKT:
+ case ETHER_IPv4_TCP_PKT:
+ update_inner_tcp4_header(pkt, ipid_delta, segs, nb_segs);
+ break;
+ }
+}
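
As a worked example of update_inner_tcp4_header(): segmenting a TCP/IPv4
packet with TCP sequence number 1000, IP id 10 and 4380B of TCP payload
into three GSO segments of 1460B payload each yields sequence numbers
1000, 2460 and 3920; with RTE_GSO_IPID_INCREASE the IP ids become 10, 11
and 12, PSH/FIN are cleared on all but the last segment, and each
total_length is recomputed from the segment's pkt_len.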
diff --git a/lib/librte_gso/gso_common.h b/lib/librte_gso/gso_common.h
new file mode 100644
index 0000000..ce3b955
--- /dev/null
+++ b/lib/librte_gso/gso_common.h
@@ -0,0 +1,107 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_COMMON_H_
+#define _GSO_COMMON_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+#define IPV4_HDR_DF_SHIFT 14
+#define IPV4_HDR_DF_MASK (1 << IPV4_HDR_DF_SHIFT)
+
+#define TCP_HDR_PSH_MASK ((uint8_t)0x08)
+#define TCP_HDR_FIN_MASK ((uint8_t)0x01)
+
+#define ETHER_IPv4_PKT (RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4)
+/* TCP/IPv4 packet. */
+#define ETHER_IPv4_TCP_PKT (ETHER_IPv4_PKT | RTE_PTYPE_L4_TCP)
+
+/* TCP/IPv4 packet with VLAN tag. */
+#define ETHER_VLAN_IPv4_TCP_PKT (RTE_PTYPE_L2_ETHER_VLAN | \
+ RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP)
+
+/**
+ * Internal function which updates relevant packet headers, following
+ * segmentation. This is required to update, for example, the IPv4
+ * 'total_length' field, to reflect the reduced length of the now-
+ * segmented packet.
+ *
+ * @param pkt
+ * The original packet.
+ * @param ipid_delta
+ * The increment applied to the IP id of each output segment.
+ * @param segs
+ * Pointer array used for storing mbuf addresses for GSO segments.
+ * @param nb_segs
+ * The number of GSO segments placed in segs.
+ */
+void gso_update_pkt_headers(struct rte_mbuf *pkt, uint8_t ipid_delta,
+ struct rte_mbuf **segs, uint16_t nb_segs);
+
+/**
+ * Internal function which divides the input packet into small segments.
+ * Each of the newly-created segments is organized as a two-segment mbuf,
+ * where the first segment is a standard mbuf, which stores a copy of
+ * packet header, and the second is an indirect mbuf which points to a
+ * section of data in the input packet.
+ *
+ * @param pkt
+ * Packet to segment.
+ * @param pkt_hdr_offset
+ * Packet header offset, measured in byte.
+ * @param pyld_unit_size
+ * The max payload length of a GSO segment.
+ * @param direct_pool
+ * MBUF pool used for allocating direct buffers for output segments.
+ * @param indirect_pool
+ * MBUF pool used for allocating indirect buffers for output segments.
+ * @param pkts_out
+ * Pointer array used to keep the mbuf addresses of output segments.
+ * @param nb_pkts_out
+ * The max number of items that pkts_out can hold.
+ *
+ * @return
+ * - The number of segments created on success.
+ * - Return 1 if no GSO is performed.
+ * - Return -ENOMEM if available memory in the mempools is insufficient.
+ * - Return -EINVAL for invalid parameters.
+ */
+int gso_do_segment(struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset,
+ uint16_t pyld_unit_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+#endif
diff --git a/lib/librte_gso/gso_tcp.c b/lib/librte_gso/gso_tcp.c
new file mode 100644
index 0000000..d52cf28
--- /dev/null
+++ b/lib/librte_gso/gso_tcp.c
@@ -0,0 +1,83 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+
+#include "gso_common.h"
+#include "gso_tcp.h"
+
+int
+gso_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ uint8_t ipid_delta,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ struct ether_hdr *eth_hdr;
+ struct ipv4_hdr *ipv4_hdr;
+ uint16_t tcp_dl;
+ uint16_t pyld_unit_size;
+ uint16_t hdr_offset;
+ int ret = 1;
+
+ eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+ ipv4_hdr = (struct ipv4_hdr *)((char *)eth_hdr + pkt->l2_len);
+
+ /* don't process the packet unless the DF bit is set, i.e. skip
+ * potential IP fragments
+ */
+ if ((ipv4_hdr->fragment_offset &
+ rte_cpu_to_be_16(IPV4_HDR_DF_MASK)) == 0)
+ return ret;
+
+ tcp_dl = rte_be_to_cpu_16(ipv4_hdr->total_length) -
+ pkt->l3_len - pkt->l4_len;
+ /* don't process packet without data */
+ if (tcp_dl == 0)
+ return ret;
+
+ hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
+
+ /* segment the payload */
+ ret = gso_do_segment(pkt, hdr_offset, pyld_unit_size, direct_pool,
+ indirect_pool, pkts_out, nb_pkts_out);
+
+ if (ret > 1)
+ gso_update_pkt_headers(pkt, ipid_delta, pkts_out, ret);
+
+ return ret;
+}
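
As a quick check of the arithmetic above: with the default gso_size of
1518B and an untagged TCP/IPv4 packet (14B Ethernet + 20B IPv4 + 20B TCP
= 54B of headers), pyld_unit_size = 1518 - 54 - 4 (ETHER_CRC_LEN) = 1460B,
i.e. the standard Ethernet MSS.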
diff --git a/lib/librte_gso/gso_tcp.h b/lib/librte_gso/gso_tcp.h
new file mode 100644
index 0000000..a578535
--- /dev/null
+++ b/lib/librte_gso/gso_tcp.h
@@ -0,0 +1,76 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_TCP_H_
+#define _GSO_TCP_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+/**
+ * Segment an IPv4/TCP packet. This function assumes the input packet has
+ * correct checksums and doesn't update checksums for GSO segments.
+ * Furthermore, it doesn't process IP fragmented packets.
+ *
+ * @param pkt
+ * The packet mbuf to segment.
+ * @param gso_size
+ * The max length of a GSO segment, measured in bytes.
+ * @param ipid_delta
+ * The increment applied to the IP id of each output segment.
+ * @param direct_pool
+ * MBUF pool used for allocating direct buffers for output segments.
+ * @param indirect_pool
+ * MBUF pool used for allocating indirect buffers for output segments.
+ * @param pkts_out
+ * Pointer array, which is used to store mbuf addresses of GSO segments.
+ * Caller should guarantee that 'pkts_out' is sufficiently large to store
+ * all GSO segments.
+ * @param nb_pkts_out
+ * The max number of items that 'pkts_out' can hold.
+ *
+ * @return
+ * - The number of GSO segments on success.
+ * - Return 1 if no GSO is performed.
+ * - Return -ENOMEM if available memory in mempools is insufficient.
+ * - Return -EINVAL for invalid parameters.
+ */
+int gso_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ uint8_t ipid_delta,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+
+#endif
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index fef6725..ef03375 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -31,18 +31,58 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

+#include <rte_log.h>
+
#include "rte_gso.h"
+#include "gso_common.h"
+#include "gso_tcp.h"

int
rte_gso_segment(struct rte_mbuf *pkt,
- struct rte_gso_ctx gso_ctx __rte_unused,
+ struct rte_gso_ctx gso_ctx,
struct rte_mbuf **pkts_out,
uint16_t nb_pkts_out)
{
+ struct rte_mempool *direct_pool, *indirect_pool;
+ struct rte_mbuf *pkt_seg;
+ uint16_t gso_size;
+ uint8_t ipid_delta;
+ int ret = 1;
+
if (pkt == NULL || pkts_out == NULL || nb_pkts_out < 1)
return -EINVAL;

- pkts_out[0] = pkt;
+ if (gso_ctx.gso_size >= pkt->pkt_len ||
+ (pkt->packet_type & gso_ctx.gso_types) !=
+ pkt->packet_type) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ direct_pool = gso_ctx.direct_pool;
+ indirect_pool = gso_ctx.indirect_pool;
+ gso_size = gso_ctx.gso_size;
+ ipid_delta = gso_ctx.ipid_flag == RTE_GSO_IPID_INCREASE;
+
+ switch (pkt->packet_type) {
+ case ETHER_VLAN_IPv4_TCP_PKT:
+ case ETHER_IPv4_TCP_PKT:
+ ret = gso_tcp4_segment(pkt, gso_size, ipid_delta,
+ direct_pool, indirect_pool,
+ pkts_out, nb_pkts_out);
+ break;
+ default:
+ RTE_LOG(WARNING, GSO, "Unsupported packet type\n");
+ }
+
+ if (ret > 1) {
+ pkt_seg = pkt;
+ while (pkt_seg) {
+ rte_mbuf_refcnt_update(pkt_seg, -1);
+ pkt_seg = pkt_seg->next;
+ }
+ } else if (ret == 1)
+ pkts_out[0] = pkt;

- return 1;
+ return ret;
}
--
2.7.4
Jiayu Hu
2017-09-05 07:57:48 UTC
Permalink
From: Mark Kavanagh <***@intel.com>

This patch adds GSO support for VxLAN-encapsulated packets. Supported
VxLAN packets must have an outer IPv4 header (prepended by an optional
VLAN tag), and contain an inner TCP/IPv4 packet (with an optional inner
VLAN tag).

VxLAN GSO assumes that all input packets have correct checksums and
doesn't update checksums for output packets. Additionally, it doesn't
process IP fragmented packets.

As with TCP/IPv4 GSO, VxLAN GSO uses a two-segment MBUF to organize each
output packet, which mandates support for multi-segment mbufs in the TX
functions of the NIC driver. Also, if a packet is GSOed, VxLAN GSO
reduces its MBUF refcnt by 1. As a result, when all of its GSOed
segments are freed, the packet is freed automatically.

Signed-off-by: Mark Kavanagh <***@intel.com>
Signed-off-by: Jiayu Hu <***@intel.com>
---
lib/librte_gso/Makefile | 1 +
lib/librte_gso/gso_common.c | 50 ++++++++++++++++++++++++++++++
lib/librte_gso/gso_common.h | 36 +++++++++++++++++++++-
lib/librte_gso/gso_tunnel.c | 61 ++++++++++++++++++++++++++++++++++++
lib/librte_gso/gso_tunnel.h | 75 +++++++++++++++++++++++++++++++++++++++++++++
lib/librte_gso/rte_gso.c | 9 ++++++
6 files changed, 231 insertions(+), 1 deletion(-)
create mode 100644 lib/librte_gso/gso_tunnel.c
create mode 100644 lib/librte_gso/gso_tunnel.h

diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile
index 0f8e38f..a4d1a81 100644
--- a/lib/librte_gso/Makefile
+++ b/lib/librte_gso/Makefile
@@ -44,6 +44,7 @@ LIBABIVER := 1
SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c
SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_common.c
SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tcp.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tunnel.c

# install this header file
SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h
diff --git a/lib/librte_gso/gso_common.c b/lib/librte_gso/gso_common.c
index 4d4c3fd..1e16c9c 100644
--- a/lib/librte_gso/gso_common.c
+++ b/lib/librte_gso/gso_common.c
@@ -39,6 +39,7 @@
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_tcp.h>
+#include <rte_udp.h>

#include "gso_common.h"

@@ -194,11 +195,60 @@ update_inner_tcp4_header(struct rte_mbuf *pkt, uint8_t ipid_delta,
}
}

+static inline void
+update_outer_ipv4_header(struct rte_mbuf *pkt, uint16_t id)
+{
+ struct ipv4_hdr *ipv4_hdr;
+
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ pkt->outer_l2_len);
+ ipv4_hdr->total_length = rte_cpu_to_be_16(pkt->pkt_len -
+ pkt->outer_l2_len);
+ ipv4_hdr->packet_id = rte_cpu_to_be_16(id);
+}
+
+static inline void
+update_outer_udp_header(struct rte_mbuf *pkt)
+{
+ struct udp_hdr *udp_hdr;
+ uint16_t length;
+
+ length = pkt->outer_l2_len + pkt->outer_l3_len;
+ udp_hdr = (struct udp_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ length);
+ udp_hdr->dgram_len = rte_cpu_to_be_16(pkt->pkt_len - length);
+}
+
+static inline void
+update_ipv4_vxlan_tcp4_header(struct rte_mbuf *pkt, uint8_t ipid_delta,
+ struct rte_mbuf **segs, uint16_t nb_segs)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ uint16_t i, id;
+
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ pkt->outer_l2_len);
+ id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
+ for (i = 0; i < nb_segs; i++) {
+ update_outer_ipv4_header(segs[i], id);
+ id += ipid_delta;
+ update_outer_udp_header(segs[i]);
+ }
+ /* update inner TCP/IPv4 headers */
+ update_inner_tcp4_header(pkt, ipid_delta, segs, nb_segs);
+}
+
void
gso_update_pkt_headers(struct rte_mbuf *pkt, uint8_t ipid_delta,
struct rte_mbuf **segs, uint16_t nb_segs)
{
switch (pkt->packet_type) {
+ case ETHER_VLAN_IPv4_UDP_VXLAN_VLAN_IPv4_TCP_PKT:
+ case ETHER_VLAN_IPv4_UDP_VXLAN_IPv4_TCP_PKT:
+ case ETHER_IPv4_UDP_VXLAN_VLAN_IPv4_TCP_PKT:
+ case ETHER_IPv4_UDP_VXLAN_IPv4_TCP_PKT:
+ update_ipv4_vxlan_tcp4_header(pkt, ipid_delta, segs, nb_segs);
+ break;
case ETHER_VLAN_IPv4_TCP_PKT:
case ETHER_IPv4_TCP_PKT:
update_inner_tcp4_header(pkt, ipid_delta, segs, nb_segs);
diff --git a/lib/librte_gso/gso_common.h b/lib/librte_gso/gso_common.h
index ce3b955..3f76fd1 100644
--- a/lib/librte_gso/gso_common.h
+++ b/lib/librte_gso/gso_common.h
@@ -44,6 +44,13 @@
#define TCP_HDR_FIN_MASK ((uint8_t)0x01)

#define ETHER_IPv4_PKT (RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4)
+#define INNER_ETHER_IPv4_TCP_PKT (RTE_PTYPE_INNER_L2_ETHER | \
+ RTE_PTYPE_INNER_L3_IPV4 | \
+ RTE_PTYPE_INNER_L4_TCP)
+#define INNER_ETHER_VLAN_IPv4_TCP_PKT (RTE_PTYPE_INNER_L2_ETHER_VLAN | \
+ RTE_PTYPE_INNER_L3_IPV4 | \
+ RTE_PTYPE_INNER_L4_TCP)
+
/* TCP/IPv4 packet. */
#define ETHER_IPv4_TCP_PKT (ETHER_IPv4_PKT | RTE_PTYPE_L4_TCP)

@@ -51,6 +58,33 @@
#define ETHER_VLAN_IPv4_TCP_PKT (RTE_PTYPE_L2_ETHER_VLAN | \
RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP)

+/* VxLAN packet */
+#define ETHER_IPv4_UDP_VXLAN_IPv4_TCP_PKT (ETHER_IPv4_PKT | \
+ RTE_PTYPE_L4_UDP | \
+ RTE_PTYPE_TUNNEL_VXLAN | \
+ INNER_ETHER_IPv4_TCP_PKT)
+
+/* VxLAN packet with outer VLAN tag. */
+#define ETHER_VLAN_IPv4_UDP_VXLAN_IPv4_TCP_PKT (RTE_PTYPE_L2_ETHER_VLAN | \
+ RTE_PTYPE_L3_IPV4 | \
+ RTE_PTYPE_L4_UDP | \
+ RTE_PTYPE_TUNNEL_VXLAN | \
+ INNER_ETHER_IPv4_TCP_PKT)
+
+/* VxLAN packet with inner VLAN tag. */
+#define ETHER_IPv4_UDP_VXLAN_VLAN_IPv4_TCP_PKT (ETHER_IPv4_PKT | \
+ RTE_PTYPE_L4_UDP | \
+ RTE_PTYPE_TUNNEL_VXLAN | \
+ INNER_ETHER_VLAN_IPv4_TCP_PKT)
+
+/* VxLAN packet with both outer and inner VLAN tags. */
+#define ETHER_VLAN_IPv4_UDP_VXLAN_VLAN_IPv4_TCP_PKT (\
+ RTE_PTYPE_L2_ETHER_VLAN | \
+ RTE_PTYPE_L3_IPV4 | \
+ RTE_PTYPE_L4_UDP | \
+ RTE_PTYPE_TUNNEL_VXLAN | \
+ INNER_ETHER_VLAN_IPv4_TCP_PKT)
+
/**
* Internal function which updates relevant packet headers, following
* segmentation. This is required to update, for example, the IPv4
@@ -79,7 +113,7 @@ void gso_update_pkt_headers(struct rte_mbuf *pkt, uint8_t ipid_delta,
* @param pkt
* Packet to segment.
* @param pkt_hdr_offset
- * Packet header offset, measured in byte.
+ * Packet header offset, measured in bytes.
* @param pyld_unit_size
* The max payload length of a GSO segment.
* @param direct_pool
diff --git a/lib/librte_gso/gso_tunnel.c b/lib/librte_gso/gso_tunnel.c
new file mode 100644
index 0000000..69aa91f
--- /dev/null
+++ b/lib/librte_gso/gso_tunnel.c
@@ -0,0 +1,61 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <rte_ether.h>
+
+#include "gso_common.h"
+#include "gso_tunnel.h"
+
+int
+gso_tunnel_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ uint8_t ipid_delta,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ uint16_t pyld_unit_size, hdr_offset;
+ int ret;
+
+ hdr_offset = pkt->outer_l2_len + pkt->outer_l3_len + pkt->l2_len +
+ pkt->l3_len + pkt->l4_len;
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
+
+ ret = gso_do_segment(pkt, hdr_offset, pyld_unit_size, direct_pool,
+ indirect_pool, pkts_out, nb_pkts_out);
+ if (ret > 1)
+ gso_update_pkt_headers(pkt, ipid_delta, pkts_out, ret);
+
+ return ret;
+}
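
For a sense of the per-segment overhead: assuming the usual DPDK tunnel
TX-offload convention, where l2_len covers the outer UDP, VxLAN and inner
Ethernet headers (8 + 8 + 14 = 30B), an untagged VxLAN packet carries 14B
(outer L2) + 20B (outer L3) + 30B + 20B (inner IPv4) + 20B (inner TCP) =
104B of headers, so a 1518B gso_size leaves 1518 - 104 - 4 = 1410B of
payload per output segment.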
diff --git a/lib/librte_gso/gso_tunnel.h b/lib/librte_gso/gso_tunnel.h
new file mode 100644
index 0000000..80bd0c5
--- /dev/null
+++ b/lib/librte_gso/gso_tunnel.h
@@ -0,0 +1,75 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_TUNNEL_H_
+#define _GSO_TUNNEL_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+/**
+ * Segment a tunneled packet. This function assumes the input packet
+ * has correct checksums and doesn't update checksums for GSO segments.
+ * Furthermore, it doesn't process IP fragmented packets.
+ *
+ * @param pkt
+ * The packet mbuf to segment.
+ * @param gso_size
+ * The max length of a GSO segment, measured in bytes.
+ * @param ipid_delta
+ * The increment applied to the IP id of each output segment.
+ * @param direct_pool
+ * MBUF pool used for allocating direct buffers for output segments.
+ * @param indirect_pool
+ * MBUF pool used for allocating indirect buffers for output segments.
+ * @param pkts_out
+ * Pointer array, which is used to store mbuf addresses of GSO segments.
+ * Caller should guarantee that 'pkts_out' is sufficiently large to store
+ * all GSO segments.
+ * @param nb_pkts_out
+ * The max number of items that 'pkts_out' can hold.
+ *
+ * @return
+ * - The number of GSO segments on success.
+ * - Return 1 if no GSO is performed.
+ * - Return -ENOMEM if available memory in mempools is insufficient.
+ * - Return -EINVAL for invalid parameters.
+ */
+int gso_tunnel_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ uint8_t ipid_delta,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+#endif
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index ef03375..0170abc 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -36,6 +36,7 @@
#include "rte_gso.h"
#include "gso_common.h"
#include "gso_tcp.h"
+#include "gso_tunnel.h"

int
rte_gso_segment(struct rte_mbuf *pkt,
@@ -71,6 +72,14 @@ rte_gso_segment(struct rte_mbuf *pkt,
direct_pool, indirect_pool,
pkts_out, nb_pkts_out);
break;
+ case ETHER_VLAN_IPv4_UDP_VXLAN_VLAN_IPv4_TCP_PKT:
+ case ETHER_VLAN_IPv4_UDP_VXLAN_IPv4_TCP_PKT:
+ case ETHER_IPv4_UDP_VXLAN_VLAN_IPv4_TCP_PKT:
+ case ETHER_IPv4_UDP_VXLAN_IPv4_TCP_PKT:
+ ret = gso_tunnel_segment(pkt, gso_size, ipid_delta,
+ direct_pool, indirect_pool,
+ pkts_out, nb_pkts_out);
+ break;
default:
RTE_LOG(WARNING, GSO, "Unsupported packet type\n");
}
--
2.7.4
Jiayu Hu
2017-09-05 07:57:49 UTC
Permalink
From: Mark Kavanagh <***@intel.com>

This patch adds GSO support for GRE-tunneled packets. Supported GRE
packets must contain an outer IPv4 header, and inner TCP/IPv4 headers.
They may also contain a single VLAN tag. GRE GSO assumes that all input
packets have correct checksums and doesn't update checksums for output
packets. Additionally, it doesn't process IP fragmented packets.

As with VxLAN GSO, GRE GSO uses a two-segment MBUF to organize each
output packet, which requires multi-segment mbuf support in the TX
functions of the NIC driver. Also, if a packet is GSOed, GRE GSO reduces
its MBUF refcnt by 1. As a result, when all of its GSOed segments are
freed, the packet is freed automatically.

Signed-off-by: Mark Kavanagh <***@intel.com>
Signed-off-by: Jiayu Hu <***@intel.com>
---
lib/librte_gso/gso_common.c | 24 ++++++++++++++++++++++++
lib/librte_gso/gso_common.h | 15 +++++++++++++++
lib/librte_gso/rte_gso.c | 2 ++
3 files changed, 41 insertions(+)

diff --git a/lib/librte_gso/gso_common.c b/lib/librte_gso/gso_common.c
index 1e16c9c..668d2d0 100644
--- a/lib/librte_gso/gso_common.c
+++ b/lib/librte_gso/gso_common.c
@@ -37,6 +37,7 @@
#include <rte_malloc.h>

#include <rte_ether.h>
+#include <rte_gre.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
@@ -238,6 +239,25 @@ update_ipv4_vxlan_tcp4_header(struct rte_mbuf *pkt, uint8_t ipid_delta,
update_inner_tcp4_header(pkt, ipid_delta, segs, nb_segs);
}

+static inline void
+update_ipv4_gre_tcp4_header(struct rte_mbuf *pkt, uint8_t ipid_delta,
+ struct rte_mbuf **segs, uint16_t nb_segs)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ uint16_t i, id;
+
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ pkt->outer_l2_len);
+ id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
+ for (i = 0; i < nb_segs; i++) {
+ update_outer_ipv4_header(segs[i], id);
+ id += ipid_delta;
+ }
+
+ /* update inner TCP/IPv4 headers */
+ update_inner_tcp4_header(pkt, ipid_delta, segs, nb_segs);
+}
+
void
gso_update_pkt_headers(struct rte_mbuf *pkt, uint8_t ipid_delta,
struct rte_mbuf **segs, uint16_t nb_segs)
@@ -249,6 +269,10 @@ gso_update_pkt_headers(struct rte_mbuf *pkt, uint8_t ipid_delta,
case ETHER_IPv4_UDP_VXLAN_IPv4_TCP_PKT:
update_ipv4_vxlan_tcp4_header(pkt, ipid_delta, segs, nb_segs);
break;
+ case ETHER_VLAN_IPv4_GRE_IPv4_TCP_PKT:
+ case ETHER_IPv4_GRE_IPv4_TCP_PKT:
+ update_ipv4_gre_tcp4_header(pkt, ipid_delta, segs, nb_segs);
+ break;
case ETHER_VLAN_IPv4_TCP_PKT:
case ETHER_IPv4_TCP_PKT:
update_inner_tcp4_header(pkt, ipid_delta, segs, nb_segs);
diff --git a/lib/librte_gso/gso_common.h b/lib/librte_gso/gso_common.h
index 3f76fd1..bd53bde 100644
--- a/lib/librte_gso/gso_common.h
+++ b/lib/librte_gso/gso_common.h
@@ -85,6 +85,21 @@
RTE_PTYPE_TUNNEL_VXLAN | \
INNER_ETHER_VLAN_IPv4_TCP_PKT)

+/* GRE packet. */
+#define ETHER_IPv4_GRE_IPv4_TCP_PKT (\
+ ETHER_IPv4_PKT | \
+ RTE_PTYPE_TUNNEL_GRE | \
+ RTE_PTYPE_INNER_L3_IPV4 | \
+ RTE_PTYPE_INNER_L4_TCP)
+
+/* GRE packet with VLAN tag. */
+#define ETHER_VLAN_IPv4_GRE_IPv4_TCP_PKT (\
+ RTE_PTYPE_L2_ETHER_VLAN | \
+ RTE_PTYPE_L3_IPV4 | \
+ RTE_PTYPE_TUNNEL_GRE | \
+ RTE_PTYPE_INNER_L3_IPV4 | \
+ RTE_PTYPE_INNER_L4_TCP)
+
/**
* Internal function which updates relevant packet headers, following
* segmentation. This is required to update, for example, the IPv4
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index 0170abc..d40fda9 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -76,6 +76,8 @@ rte_gso_segment(struct rte_mbuf *pkt,
case ETHER_VLAN_IPv4_UDP_VXLAN_IPv4_TCP_PKT:
case ETHER_IPv4_UDP_VXLAN_VLAN_IPv4_TCP_PKT:
case ETHER_IPv4_UDP_VXLAN_IPv4_TCP_PKT:
+ case ETHER_VLAN_IPv4_GRE_IPv4_TCP_PKT:
+ case ETHER_IPv4_GRE_IPv4_TCP_PKT:
ret = gso_tunnel_segment(pkt, gso_size, ipid_delta,
direct_pool, indirect_pool,
pkts_out, nb_pkts_out);
--
2.7.4
Jiayu Hu
2017-09-05 07:57:50 UTC
Permalink
This patch adds GSO support to the csum forwarding engine. Oversized
packets transmitted over a GSO-enabled port will undergo segmentation
(with the exception of packet types unsupported by the GSO library).
GSO support is disabled by default.

GSO support may be toggled on a per-port basis, using the command:

"set port <port_id> gso on|off"

The maximum packet length (including the packet header and payload) for
GSO segments may be set with the command:

"set gso segsz <length>"

Show GSO configuration for a given port with the command:

"show port <port_id> gso"

Signed-off-by: Jiayu Hu <***@intel.com>
Signed-off-by: Mark Kavanagh <***@intel.com>
---
app/test-pmd/cmdline.c | 178 ++++++++++++++++++++++++++++++++++++++++++++++++
app/test-pmd/config.c | 24 +++++++
app/test-pmd/csumonly.c | 60 ++++++++++++++--
app/test-pmd/testpmd.c | 15 ++++
app/test-pmd/testpmd.h | 10 +++
5 files changed, 283 insertions(+), 4 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index cd8c358..03b98a3 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -431,6 +431,17 @@ static void cmd_help_long_parsed(void *parsed_result,
" Set max flow number and max packet number per-flow"
" for GRO.\n\n"

+ "set port (port_id) gso (on|off)"
+ " Enable or disable Generic Segmentation Offload in"
+ " csum forwarding engine.\n\n"
+
+ "set gso segsz (length)\n"
+ " Set max packet length for output GSO segments,"
+ " including packet header and payload.\n\n"
+
+ "show port (port_id) gso\n"
+ " Show GSO configuration.\n\n"
+
"set fwd (%s)\n"
" Set packet forwarding mode.\n\n"

@@ -3963,6 +3974,170 @@ cmdline_parse_inst_t cmd_gro_set = {
},
};

+/* *** ENABLE/DISABLE GSO *** */
+struct cmd_gso_enable_result {
+ cmdline_fixed_string_t cmd_set;
+ cmdline_fixed_string_t cmd_port;
+ cmdline_fixed_string_t cmd_keyword;
+ cmdline_fixed_string_t cmd_mode;
+ uint8_t cmd_pid;
+};
+
+static void
+cmd_gso_enable_parsed(void *parsed_result,
+ __attribute__((unused)) struct cmdline *cl,
+ __attribute__((unused)) void *data)
+{
+ struct cmd_gso_enable_result *res;
+
+ res = parsed_result;
+ if (!strcmp(res->cmd_keyword, "gso"))
+ setup_gso(res->cmd_mode, res->cmd_pid);
+}
+
+cmdline_parse_token_string_t cmd_gso_enable_set =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_enable_result,
+ cmd_set, "set");
+cmdline_parse_token_string_t cmd_gso_enable_port =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_enable_result,
+ cmd_port, "port");
+cmdline_parse_token_string_t cmd_gso_enable_keyword =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_enable_result,
+ cmd_keyword, "gso");
+cmdline_parse_token_string_t cmd_gso_enable_mode =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_enable_result,
+ cmd_mode, "on#off");
+cmdline_parse_token_num_t cmd_gso_enable_pid =
+ TOKEN_NUM_INITIALIZER(struct cmd_gso_enable_result,
+ cmd_pid, UINT8);
+
+cmdline_parse_inst_t cmd_gso_enable = {
+ .f = cmd_gso_enable_parsed,
+ .data = NULL,
+ .help_str = "set port <port_id> gso on|off",
+ .tokens = {
+ (void *)&cmd_gso_enable_set,
+ (void *)&cmd_gso_enable_port,
+ (void *)&cmd_gso_enable_pid,
+ (void *)&cmd_gso_enable_keyword,
+ (void *)&cmd_gso_enable_mode,
+ NULL,
+ },
+};
+
+/* *** SET MAX PACKET LENGTH FOR GSO SEGMENTS *** */
+struct cmd_gso_size_result {
+ cmdline_fixed_string_t cmd_set;
+ cmdline_fixed_string_t cmd_keyword;
+ cmdline_fixed_string_t cmd_segsz;
+ uint16_t cmd_size;
+};
+
+static void
+cmd_gso_size_parsed(void *parsed_result,
+ __attribute__((unused)) struct cmdline *cl,
+ __attribute__((unused)) void *data)
+{
+ struct cmd_gso_size_result *res = parsed_result;
+
+ if (test_done == 0) {
+ printf("Before set GSO segsz, please stop fowarding first\n");
+ return;
+ }
+
+ if (!strcmp(res->cmd_keyword, "gso") &&
+ !strcmp(res->cmd_segsz, "segsz")) {
+ if (res->cmd_size == 0) {
+ printf("gso_size should be larger than 0."
+ " Please input a legal value\n");
+ } else
+ gso_max_segment_size = res->cmd_size;
+ }
+}
+
+cmdline_parse_token_string_t cmd_gso_size_set =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_size_result,
+ cmd_set, "set");
+cmdline_parse_token_string_t cmd_gso_size_keyword =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_size_result,
+ cmd_keyword, "gso");
+cmdline_parse_token_string_t cmd_gso_size_segsz =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_size_result,
+ cmd_segsz, "segsz");
+cmdline_parse_token_num_t cmd_gso_size_size =
+ TOKEN_NUM_INITIALIZER(struct cmd_gso_size_result,
+ cmd_size, UINT16);
+
+cmdline_parse_inst_t cmd_gso_size = {
+ .f = cmd_gso_size_parsed,
+ .data = NULL,
+ .help_str = "set gso segsz <length>",
+ .tokens = {
+ (void *)&cmd_gso_size_set,
+ (void *)&cmd_gso_size_keyword,
+ (void *)&cmd_gso_size_segsz,
+ (void *)&cmd_gso_size_size,
+ NULL,
+ },
+};
+
+/* *** SHOW GSO CONFIGURATION *** */
+struct cmd_gso_show_result {
+ cmdline_fixed_string_t cmd_show;
+ cmdline_fixed_string_t cmd_port;
+ cmdline_fixed_string_t cmd_keyword;
+ uint8_t cmd_pid;
+};
+
+static void
+cmd_gso_show_parsed(void *parsed_result,
+ __attribute__((unused)) struct cmdline *cl,
+ __attribute__((unused)) void *data)
+{
+ struct cmd_gso_show_result *res = parsed_result;
+
+ if (!rte_eth_dev_is_valid_port(res->cmd_pid)) {
+ printf("invalid port id %u\n", res->cmd_pid);
+ return;
+ }
+
+ if (!strcmp(res->cmd_keyword, "gso")) {
+ if (gso_ports[res->cmd_pid].enable) {
+ printf("Max GSO segment size: %uB\n"
+ "Support GSO protocols: TCP/IPv4,"
+ " VxlAN and GRE\n",
+ gso_max_segment_size);
+ } else
+ printf("Port %u doesn't enable GSO\n", res->cmd_pid);
+ }
+}
+
+cmdline_parse_token_string_t cmd_gso_show_show =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_show_result,
+ cmd_show, "show");
+cmdline_parse_token_string_t cmd_gso_show_port =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_show_result,
+ cmd_port, "port");
+cmdline_parse_token_string_t cmd_gso_show_keyword =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_show_result,
+ cmd_keyword, "gso");
+cmdline_parse_token_num_t cmd_gso_show_pid =
+ TOKEN_NUM_INITIALIZER(struct cmd_gso_show_result,
+ cmd_pid, UINT8);
+
+cmdline_parse_inst_t cmd_gso_show = {
+ .f = cmd_gso_show_parsed,
+ .data = NULL,
+ .help_str = "show port <port_id> gso",
+ .tokens = {
+ (void *)&cmd_gso_show_show,
+ (void *)&cmd_gso_show_port,
+ (void *)&cmd_gso_show_pid,
+ (void *)&cmd_gso_show_keyword,
+ NULL,
+ },
+};
+
/* *** ENABLE/DISABLE FLUSH ON RX STREAMS *** */
struct cmd_set_flush_rx {
cmdline_fixed_string_t set;
@@ -14251,6 +14426,9 @@ cmdline_parse_ctx_t main_ctx[] = {
(cmdline_parse_inst_t *)&cmd_tunnel_tso_show,
(cmdline_parse_inst_t *)&cmd_enable_gro,
(cmdline_parse_inst_t *)&cmd_gro_set,
+ (cmdline_parse_inst_t *)&cmd_gso_enable,
+ (cmdline_parse_inst_t *)&cmd_gso_size,
+ (cmdline_parse_inst_t *)&cmd_gso_show,
(cmdline_parse_inst_t *)&cmd_link_flow_control_set,
(cmdline_parse_inst_t *)&cmd_link_flow_control_set_rx,
(cmdline_parse_inst_t *)&cmd_link_flow_control_set_tx,
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 3ae3e1c..3434346 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -2454,6 +2454,30 @@ setup_gro(const char *mode, uint8_t port_id)
}
}

+void
+setup_gso(const char *mode, uint8_t port_id)
+{
+ if (!rte_eth_dev_is_valid_port(port_id)) {
+ printf("invalid port id %u\n", port_id);
+ return;
+ }
+ if (strcmp(mode, "on") == 0) {
+ if (test_done == 0) {
+ printf("before enable GSO,"
+ " please stop forwarding first\n");
+ return;
+ }
+ gso_ports[port_id].enable = 1;
+ } else if (strcmp(mode, "off") == 0) {
+ if (test_done == 0) {
+ printf("before disable GSO,"
+ " please stop forwarding first\n");
+ return;
+ }
+ gso_ports[port_id].enable = 0;
+ }
+}
+
char*
list_pkt_forwarding_modes(void)
{
diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
index 90c8119..30ae709 100644
--- a/app/test-pmd/csumonly.c
+++ b/app/test-pmd/csumonly.c
@@ -66,10 +66,12 @@
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_sctp.h>
+#include <rte_net.h>
#include <rte_prefetch.h>
#include <rte_string_fns.h>
#include <rte_flow.h>
#include <rte_gro.h>
+#include <rte_gso.h>
#include "testpmd.h"

#define IP_DEFTTL 64 /* from RFC 1340. */
@@ -627,6 +629,9 @@ static void
pkt_burst_checksum_forward(struct fwd_stream *fs)
{
struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
+ struct rte_mbuf *gso_segments[GSO_MAX_PKT_BURST];
+ struct rte_gso_ctx *gso_ctx;
+ struct rte_mbuf **tx_pkts_burst;
struct rte_port *txp;
struct rte_mbuf *m, *p;
struct ether_hdr *eth_hdr;
@@ -641,6 +646,9 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
uint32_t rx_bad_ip_csum;
uint32_t rx_bad_l4_csum;
struct testpmd_offload_info info;
+ uint16_t nb_segments = 0;
+ struct rte_net_hdr_lens hdr_lens;
+ int ret;

#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
uint64_t start_tsc;
@@ -851,13 +859,56 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
}
}

+ if (gso_ports[fs->tx_port].enable == 0)
+ tx_pkts_burst = pkts_burst;
+ else {
+ gso_ctx = &(current_fwd_lcore()->gso_ctx);
+ gso_ctx->gso_size = gso_max_segment_size;
+ for (i = 0; i < nb_rx; i++) {
+ /* fill packet_type for the packet to segment */
+ pkts_burst[i]->packet_type = rte_net_get_ptype(
+ pkts_burst[i], &hdr_lens,
+ RTE_PTYPE_ALL_MASK);
+
+ ret = rte_gso_segment(pkts_burst[i], *gso_ctx,
+ &gso_segments[nb_segments],
+ GSO_MAX_PKT_BURST - nb_segments);
+ if (ret >= 1)
+ nb_segments += ret;
+ else if (ret < 0) {
+ /* insufficient MBUFs, stop GSO */
+ memcpy(&gso_segments[nb_segments],
+ &pkts_burst[i],
+ sizeof(struct rte_mbuf *) *
+ (nb_rx - i));
+ nb_segments += (nb_rx - i);
+ break;
+ }
+ if (unlikely(nb_rx - i >= GSO_MAX_PKT_BURST -
+ nb_segments)) {
+ /*
+ * insufficient space in gso_segments,
+ * stop GSO.
+ */
+ memcpy(&gso_segments[nb_segments],
+ &pkts_burst[i],
+ sizeof(struct rte_mbuf *) *
+ (nb_rx - i));
+ nb_segments += (nb_rx - i);
+ break;
+ }
+ }
+ tx_pkts_burst = gso_segments;
+ nb_rx = nb_segments;
+ }
+
nb_prep = rte_eth_tx_prepare(fs->tx_port, fs->tx_queue,
- pkts_burst, nb_rx);
+ tx_pkts_burst, nb_rx);
if (nb_prep != nb_rx)
printf("Preparing packet burst to transmit failed: %s\n",
rte_strerror(rte_errno));

- nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst,
+ nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, tx_pkts_burst,
nb_prep);

/*
@@ -868,7 +919,7 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
rte_delay_us(burst_tx_delay_time);
nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
- &pkts_burst[nb_tx], nb_rx - nb_tx);
+ &tx_pkts_burst[nb_tx], nb_rx - nb_tx);
}
}
fs->tx_packets += nb_tx;
@@ -881,9 +932,10 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
if (unlikely(nb_tx < nb_rx)) {
fs->fwd_dropped += (nb_rx - nb_tx);
do {
- rte_pktmbuf_free(pkts_burst[nb_tx]);
+ rte_pktmbuf_free(tx_pkts_burst[nb_tx]);
} while (++nb_tx < nb_rx);
}
+
#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
end_tsc = rte_rdtsc();
core_cycles = (end_tsc - start_tsc);
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 7d40139..e83fc95 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -400,6 +400,9 @@ static int eth_event_callback(uint8_t port_id,
*/
static int all_ports_started(void);

+struct gso_status gso_ports[RTE_MAX_ETHPORTS];
+uint16_t gso_max_segment_size = ETHER_MAX_LEN;
+
/*
* Helper function to check if socket is already discovered.
* If yes, return positive value. If not, return zero.
@@ -570,6 +573,7 @@ init_config(void)
unsigned int nb_mbuf_per_pool;
lcoreid_t lc_id;
uint8_t port_per_socket[RTE_MAX_NUMA_NODES];
+ uint32_t gso_types = 0;

memset(port_per_socket,0,RTE_MAX_NUMA_NODES);

@@ -654,6 +658,11 @@ init_config(void)

init_port_config();

+ gso_types = RTE_PTYPE_L2_ETHER_VLAN | RTE_PTYPE_L2_ETHER |
+ RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP |
+ RTE_PTYPE_L4_UDP | RTE_PTYPE_TUNNEL_VXLAN |
+ RTE_PTYPE_INNER_L3_IPV4 | RTE_PTYPE_INNER_L4_TCP |
+ RTE_PTYPE_TUNNEL_GRE;
/*
* Records which Mbuf pool to use by each logical core, if needed.
*/
@@ -664,6 +673,12 @@ init_config(void)
if (mbp == NULL)
mbp = mbuf_pool_find(0);
fwd_lcores[lc_id]->mbp = mbp;
+ /* initialize GSO context */
+ fwd_lcores[lc_id]->gso_ctx.direct_pool = mbp;
+ fwd_lcores[lc_id]->gso_ctx.indirect_pool = mbp;
+ fwd_lcores[lc_id]->gso_ctx.gso_types = gso_types;
+ fwd_lcores[lc_id]->gso_ctx.gso_size = ETHER_MAX_LEN;
+ fwd_lcores[lc_id]->gso_ctx.ipid_flag = RTE_GSO_IPID_INCREASE;
}

/* Configuration of packet forwarding streams. */
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index c9d7739..725af1a 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -36,6 +36,7 @@

#include <rte_pci.h>
#include <rte_gro.h>
+#include <rte_gso.h>

#define RTE_PORT_ALL (~(portid_t)0x0)

@@ -205,6 +206,7 @@ struct rte_port {
* CPU id. configuration table.
*/
struct fwd_lcore {
+ struct rte_gso_ctx gso_ctx; /**< GSO context */
struct rte_mempool *mbp; /**< The mbuf pool to use by this core */
streamid_t stream_idx; /**< index of 1st stream in "fwd_streams" */
streamid_t stream_nb; /**< number of streams in "fwd_streams" */
@@ -442,6 +444,13 @@ struct gro_status {
};
extern struct gro_status gro_ports[RTE_MAX_ETHPORTS];

+#define GSO_MAX_PKT_BURST 2048
+struct gso_status {
+ uint8_t enable;
+};
+extern struct gso_status gso_ports[RTE_MAX_ETHPORTS];
+extern uint16_t gso_max_segment_size;
+
static inline unsigned int
lcore_num(void)
{
@@ -641,6 +650,7 @@ void get_5tuple_filter(uint8_t port_id, uint16_t index);
int rx_queue_id_is_invalid(queueid_t rxq_id);
int tx_queue_id_is_invalid(queueid_t txq_id);
void setup_gro(const char *mode, uint8_t port_id);
+void setup_gso(const char *mode, uint8_t port_id);

/* Functions to manage the set of filtered Multicast MAC addresses */
void mcast_addr_add(uint8_t port_id, struct ether_addr *mc_addr);
--
2.7.4
Jiayu Hu
2017-09-12 02:43:26 UTC
Permalink
Generic Segmentation Offload (GSO) is a SW technique to split large
packets into small ones. Akin to TSO, GSO enables applications to
operate on large packets, thus reducing per-packet processing overhead.

To enable more flexibility to applications, DPDK GSO is implemented
as a standalone library. Applications explicitly use the GSO library
to segment packets. This patch set adds GSO support to DPDK for specific
packet types: TCP/IPv4, VxLAN, and GRE.

The first patch introduces the GSO API framework. The second patch
adds GSO support for TCP/IPv4 packets (containing an optional VLAN
tag). The third patch adds GSO support for VxLAN packets that contain
outer IPv4, and inner TCP/IPv4 headers (plus optional inner and/or
outer VLAN tags). The fourth patch adds GSO support for GRE packets
that contain outer IPv4, and inner TCP/IPv4 headers (with optional
outer VLAN tag). The last patch in the series enables TCP/IPv4, VxLAN,
and GRE GSO in testpmd's checksum forwarding engine.

The performance of TCP/IPv4 GSO on a 10Gbps link is demonstrated using
iperf. Setup for the test is described as follows:

a. Connect 2 x 10Gbps physical ports (P0, P1), which are in the same
machine, back-to-back.
b. Launch testpmd with P0 and a vhost-user port, and use the csum
forwarding engine with "retry" enabled.
c. Select IP and TCP HW checksum calculation for P0; select TCP HW
checksum calculation for vhost-user port.
d. Launch a VM with csum and tso offloading enabled.
e. Run iperf-client on the virtio-net port in the VM to send TCP packets.
With csum and tso enabled, the VM can send large TCP/IPv4 packets
(MSS of up to 64KB).
f. Assign P1 to the Linux kernel and enable kernel GRO. Run
iperf-server on P1.

We conduct three iperf tests:

test-1: enable GSO for P0 in testpmd, and set the max GSO segment length
to 1518B. Run two iperf clients in the VM.
test-2: enable TSO for P0 in testpmd, and set TSO segsz to 1518B. Run
two iperf clients in the VM.
test-3: disable GSO and TSO in testpmd. Run two iperf clients in the VM.

Throughput of the above three tests:

test-1: ~9Gbps
test-2: 9.5Gbps
test-3: 3Mbps

The experimental data of VxLAN and GRE will be shown later.

Change log
==========
v3:
- support all IPv4 header flags, including RTE_PTYPE_(INNER_)L3_IPV4,
RTE_PTYPE_(INNER_)L3_IPV4_EXT and RTE_PTYPE_(INNER_)L3_IPV4_EXT_UNKNOWN.
- fill mbuf->packet_type instead of using rte_net_get_ptype() in
csumonly.c, since rte_net_get_ptype() doesn't support vxlan.
- store the input packet into pkts_out inside gso_tcp4_segment() and
gso_tunnel_tcp4_segment() instead of rte_gso_segment(), when no GSO
is performed.
- add missing includes.
- optimize file names, function names and function description.
- fix one bug in testpmd.
v2:
- merge data segments whose data_len is less than mss into a large data
segment in gso_do_segment().
- use mbuf->packet_type/l2_len/l3_len etc. instead of parsing the packet
header in rte_gso_segment().
- provide IP id macros for applications to select fixed or incremental IP
ids.
- change the definition of gso_types in struct rte_gso_ctx.
- replace rte_pktmbuf_detach() with rte_pktmbuf_free().
- refactor gso_update_pkt_headers().
- change the return value of rte_gso_segment().
- remove parameter checks in rte_gso_segment().
- use rte_net_get_ptype() in app/test-pmd/csumonly.c to fill
mbuf->packet_type.
- add a new GSO command in testpmd to show GSO configuration for ports.
- misc: fix typo and optimize function description.

Jiayu Hu (3):
gso: add Generic Segmentation Offload API framework
gso: add TCP/IPv4 GSO support
app/testpmd: enable TCP/IPv4, VxLAN and GRE GSO

Mark Kavanagh (2):
gso: add VxLAN GSO support
gso: add GRE GSO support

app/test-pmd/cmdline.c | 178 +++++++++++++++++++++
app/test-pmd/config.c | 24 +++
app/test-pmd/csumonly.c | 102 +++++++++++-
app/test-pmd/testpmd.c | 16 ++
app/test-pmd/testpmd.h | 10 ++
config/common_base | 5 +
lib/Makefile | 2 +
lib/librte_eal/common/include/rte_log.h | 1 +
lib/librte_gso/Makefile | 52 ++++++
lib/librte_gso/gso_common.c | 270 ++++++++++++++++++++++++++++++++
lib/librte_gso/gso_common.h | 165 +++++++++++++++++++
lib/librte_gso/gso_tcp4.c | 83 ++++++++++
lib/librte_gso/gso_tcp4.h | 76 +++++++++
lib/librte_gso/gso_tunnel_tcp4.c | 85 ++++++++++
lib/librte_gso/gso_tunnel_tcp4.h | 76 +++++++++
lib/librte_gso/rte_gso.c | 91 +++++++++++
lib/librte_gso/rte_gso.h | 133 ++++++++++++++++
lib/librte_gso/rte_gso_version.map | 7 +
mk/rte.app.mk | 1 +
19 files changed, 1373 insertions(+), 4 deletions(-)
create mode 100644 lib/librte_gso/Makefile
create mode 100644 lib/librte_gso/gso_common.c
create mode 100644 lib/librte_gso/gso_common.h
create mode 100644 lib/librte_gso/gso_tcp4.c
create mode 100644 lib/librte_gso/gso_tcp4.h
create mode 100644 lib/librte_gso/gso_tunnel_tcp4.c
create mode 100644 lib/librte_gso/gso_tunnel_tcp4.h
create mode 100644 lib/librte_gso/rte_gso.c
create mode 100644 lib/librte_gso/rte_gso.h
create mode 100644 lib/librte_gso/rte_gso_version.map
--
2.7.4
Jiayu Hu
2017-09-12 02:43:27 UTC
Permalink
Generic Segmentation Offload (GSO) is a SW technique to split large
packets into small ones. Akin to TSO, GSO enables applications to
operate on large packets, thus reducing per-packet processing overhead.

To enable more flexibility to applications, DPDK GSO is implemented
as a standalone library. Applications explicitly use the GSO library
to segment packets. This patch introduces the GSO API framework to DPDK.

The GSO library provides a segmentation API, rte_gso_segment(), for
applications. It splits an input packet into small ones in each
invocation. The GSO library refers to these small packets generated
by rte_gso_segment() as GSO segments. Each of the newly-created GSO
segments is organized as a two-segment MBUF, where the first segment is a
standard MBUF, which stores a copy of the packet header, and the second is
an indirect MBUF which points to a section of data in the input packet.
rte_gso_segment() reduces the refcnt of the input packet by 1. Therefore,
when all GSO segments are freed, the input packet is freed automatically.
Additionally, since each GSO segment consists of multiple MBUFs (i.e. 2
MBUFs), the driver of the interface to which the GSO segments are sent
must support transmitting multi-segment packets.
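
A minimal usage sketch of the intended API (the mbuf pool "pool", the
prepared input mbuf "pkt", and the burst size of 64 are illustrative
assumptions, not part of this patch):

	/* Sketch: segment one large TCP/IPv4 packet with rte_gso_segment(). */
	struct rte_mbuf *segs[64];
	struct rte_gso_ctx ctx = {
		.direct_pool = pool,	/* direct mbufs hold header copies */
		.indirect_pool = pool,	/* indirect mbufs point into the payload */
		.gso_types = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4 |
				RTE_PTYPE_L4_TCP,
		.gso_size = 1518,	/* max output segment size, in bytes */
		.ipid_flag = RTE_GSO_IPID_INCREASE,
	};
	int nb = rte_gso_segment(pkt, ctx, segs, 64);

	if (nb < 0)
		printf("GSO failed: %s\n", rte_strerror(-nb));
	/* on success, transmit segs[0..nb-1]; freeing them also frees pkt */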

Signed-off-by: Jiayu Hu <***@intel.com>
Signed-off-by: Mark Kavanagh <***@intel.com>
---
config/common_base | 5 ++
lib/Makefile | 2 +
lib/librte_gso/Makefile | 49 ++++++++++++++
lib/librte_gso/rte_gso.c | 50 ++++++++++++++
lib/librte_gso/rte_gso.h | 133 +++++++++++++++++++++++++++++++++++++
lib/librte_gso/rte_gso_version.map | 7 ++
mk/rte.app.mk | 1 +
7 files changed, 247 insertions(+)
create mode 100644 lib/librte_gso/Makefile
create mode 100644 lib/librte_gso/rte_gso.c
create mode 100644 lib/librte_gso/rte_gso.h
create mode 100644 lib/librte_gso/rte_gso_version.map

diff --git a/config/common_base b/config/common_base
index 5e97a08..603e340 100644
--- a/config/common_base
+++ b/config/common_base
@@ -652,6 +652,11 @@ CONFIG_RTE_LIBRTE_IP_FRAG_TBL_STAT=n
CONFIG_RTE_LIBRTE_GRO=y

#
+# Compile GSO library
+#
+CONFIG_RTE_LIBRTE_GSO=y
+
+#
# Compile librte_meter
#
CONFIG_RTE_LIBRTE_METER=y
diff --git a/lib/Makefile b/lib/Makefile
index 86caba1..3d123f4 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -108,6 +108,8 @@ DIRS-$(CONFIG_RTE_LIBRTE_REORDER) += librte_reorder
DEPDIRS-librte_reorder := librte_eal librte_mempool librte_mbuf
DIRS-$(CONFIG_RTE_LIBRTE_PDUMP) += librte_pdump
DEPDIRS-librte_pdump := librte_eal librte_mempool librte_mbuf librte_ether
+DIRS-$(CONFIG_RTE_LIBRTE_GSO) += librte_gso
+DEPDIRS-librte_gso := librte_eal librte_mbuf librte_ether librte_net

ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y)
DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni
diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile
new file mode 100644
index 0000000..aeaacbc
--- /dev/null
+++ b/lib/librte_gso/Makefile
@@ -0,0 +1,49 @@
+# BSD LICENSE
+#
+# Copyright(c) 2017 Intel Corporation. All rights reserved.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = librte_gso.a
+
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3
+
+EXPORT_MAP := rte_gso_version.map
+
+LIBABIVER := 1
+
+#source files
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c
+
+# install this header file
+SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
new file mode 100644
index 0000000..dda50ee
--- /dev/null
+++ b/lib/librte_gso/rte_gso.c
@@ -0,0 +1,50 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+
+#include "rte_gso.h"
+
+int
+rte_gso_segment(struct rte_mbuf *pkt,
+ struct rte_gso_ctx gso_ctx __rte_unused,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ if (pkt == NULL || pkts_out == NULL || nb_pkts_out < 1)
+ return -EINVAL;
+
+ pkts_out[0] = pkt;
+
+ return 1;
+}
diff --git a/lib/librte_gso/rte_gso.h b/lib/librte_gso/rte_gso.h
new file mode 100644
index 0000000..db757d6
--- /dev/null
+++ b/lib/librte_gso/rte_gso.h
@@ -0,0 +1,133 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_GSO_H_
+#define _RTE_GSO_H_
+
+/**
+ * @file
+ * Interface to GSO library
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+/* GSO IP id flags for the IPv4 header */
+#define RTE_GSO_IPID_FIXED (1 << 0)
+/**< Use fixed IP ids for output GSO segments */
+#define RTE_GSO_IPID_INCREASE (1 << 1)
+/**< Use incremental IP ids for output GSO segments */
+
+/**
+ * GSO context structure.
+ */
+struct rte_gso_ctx {
+ struct rte_mempool *direct_pool;
+ /**< MBUF pool for allocating direct buffers, which are used
+ * to store packet headers for GSO segments.
+ */
+ struct rte_mempool *indirect_pool;
+ /**< MBUF pool for allocating indirect buffers, which are used
+ * to locate packet payloads for GSO segments. The indirect
+ * buffer doesn't contain any data, but simply points to an
+ * offset within the packet to segment.
+ */
+ uint32_t gso_types;
+ /**< packet types to perform GSO. For example, applications that
+ * want to segment TCP/IPv4 packets may set (RTE_PTYPE_L2_ETHER |
+ * RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP) in gso_types.
+ */
+ uint16_t gso_size;
+ /**< maximum size of an output GSO segment, including packet
+ * header and payload, measured in bytes.
+ */
+ uint8_t ipid_flag;
+ /**< flag to indicate GSO uses fixed or incremental IP ids for
+ * IPv4 headers of output GSO segments.
+ */
+};
+
+/**
+ * Segmentation function, which supports processing of both single- and
+ * multi- segment packets.
+ *
+ * Note that we refer to the packets that are segmented from the input
+ * packet as 'GSO segments'. rte_gso_segment() assumes the input packet
+ * has correct checksums, and it doesn't update checksums for output
+ * GSO segments. Additionally, it doesn't process IP fragment packets.
+ *
+ * Each of the newly-created GSO segments is organized as a two-segment
+ * MBUF, where the first segment is a standard MBUF, which stores a copy
+ * of the packet header, and the second is an indirect MBUF which points
+ * to a section of data in the input packet. Since each GSO segment
+ * consists of multiple MBUFs (i.e. 2 MBUFs), the driver of the interface
+ * to which the GSO segments are sent must support transmitting
+ * multi-segment packets.
+ *
+ * If the input packet is GSOed, its mbuf refcnt reduces by 1. Therefore,
+ * when all GSO segments are freed, the input packet is freed automatically.
+ *
+ * If the memory space in pkts_out or the MBUF pools is insufficient, this
+ * function fails, and it returns (-1) * errno. Otherwise, GSO succeeds,
+ * and this function returns the number of output GSO segments filled in
+ * pkts_out.
+ *
+ * @param pkt
+ * The packet mbuf to segment.
+ * @param ctx
+ * GSO context object.
+ * @param pkts_out
+ * Pointer array used to store the MBUF addresses of output GSO
+ * segments, when rte_gso_segment() succeeds.
+ * @param nb_pkts_out
+ * The max number of items that pkts_out can keep.
+ *
+ * @return
+ * - The number of GSO segments filled in pkts_out on success.
+ * - Return -ENOMEM if the MBUF pools run out of memory.
+ * - Return -EINVAL for invalid parameters.
+ */
+int rte_gso_segment(struct rte_mbuf *pkt,
+ struct rte_gso_ctx ctx,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_GSO_H_ */
diff --git a/lib/librte_gso/rte_gso_version.map b/lib/librte_gso/rte_gso_version.map
new file mode 100644
index 0000000..e1fd453
--- /dev/null
+++ b/lib/librte_gso/rte_gso_version.map
@@ -0,0 +1,7 @@
+DPDK_17.11 {
+ global:
+
+ rte_gso_segment;
+
+ local: *;
+};
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index c25fdd9..d4c9873 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -66,6 +66,7 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_PDUMP) += -lrte_pdump
_LDLIBS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) += -lrte_distributor
_LDLIBS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += -lrte_ip_frag
_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO) += -lrte_gro
+_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO) += -lrte_gso
_LDLIBS-$(CONFIG_RTE_LIBRTE_METER) += -lrte_meter
_LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED) += -lrte_sched
_LDLIBS-$(CONFIG_RTE_LIBRTE_LPM) += -lrte_lpm
--
2.7.4
Ananyev, Konstantin
2017-09-12 10:36:41 UTC
Permalink
Hi Jiayu,
Few comments from me inline.
Konstantin
Post by Jiayu Hu
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
new file mode 100644
index 0000000..dda50ee
--- /dev/null
+++ b/lib/librte_gso/rte_gso.c
@@ -0,0 +1,50 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+
+#include "rte_gso.h"
+
+int
+rte_gso_segment(struct rte_mbuf *pkt,
+ struct rte_gso_ctx gso_ctx __rte_unused,
No need to pass parameter by value here.
struct rte_gso_ctx *gso_ctx would do.
Even better - const struct rte_gso_ctx *, in case it doesn't need
to be updated inside that function.
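For clarity, a sketch of the suggested signature:

	int rte_gso_segment(struct rte_mbuf *pkt,
			const struct rte_gso_ctx *gso_ctx,
			struct rte_mbuf **pkts_out,
			uint16_t nb_pkts_out);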
Post by Jiayu Hu
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ if (pkt == NULL || pkts_out == NULL || nb_pkts_out < 1)
+ return -EINVAL;
+
+ pkts_out[0] = pkt;
+
+ return 1;
+}
diff --git a/lib/librte_gso/rte_gso.h b/lib/librte_gso/rte_gso.h
new file mode 100644
index 0000000..db757d6
--- /dev/null
+++ b/lib/librte_gso/rte_gso.h
@@ -0,0 +1,133 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_GSO_H_
+#define _RTE_GSO_H_
+
+/**
+ * Interface to GSO library
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+/* GSO IP id flags for the IPv4 header */
+#define RTE_GSO_IPID_FIXED (1 << 0)
+/**< Use fixed IP ids for output GSO segments */
+#define RTE_GSO_IPID_INCREASE (1 << 1)
+/**< Use incremental IP ids for output GSO segments */
As values above are mutually exclusive, I think you don't need both flags.
Just one seems enough.
Post by Jiayu Hu
+
+/**
+ * GSO context structure.
+ */
+struct rte_gso_ctx {
+ struct rte_mempool *direct_pool;
+ /**< MBUF pool for allocating direct buffers, which are used
+ * to store packet headers for GSO segments.
+ */
+ struct rte_mempool *indirect_pool;
+ /**< MBUF pool for allocating indirect buffers, which are used
+ * to locate packet payloads for GSO segments. The indirect
+ * buffer doesn't contain any data, but simply points to an
+ * offset within the packet to segment.
+ */
+ uint32_t gso_types;
+ /**< packet types to perform GSO. For example, if applications
+ * want to segment TCP/IPv4 packets, may set (RTE_PTYPE_L2_ETHER |
+ * RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP) to gso_types.
Actually, after another thought - it probably should be not a ptype mask, but a
mask of the rte_ethdev DEV_TX_OFFLOAD_*_TSO flags that are used to advertise
real HW TSO offloads.
Let's say, for GSO that supports TSO over IPv4, it would be:
PKT_TX_TCP_SEG | PKT_TX_IPV4.
That would allow the user to use GSO and TSO in a transparent way.
Plus, ptype is not actually a proper bitmask, but a set of enums,
so it is not always possible to distinguish what ptype is supported just by a
bitmask.
Sorry for causing confusion here.
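As a sketch of that idea (proposed semantics only, not what this patch
implements), the application would fill gso_types with the same flags a
PMD uses to advertise HW TSO:

	/* Proposed: request SW segmentation of plain and VxLAN-tunneled TCP. */
	ctx.gso_types = DEV_TX_OFFLOAD_TCP_TSO | DEV_TX_OFFLOAD_VXLAN_TNL_TSO;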
Post by Jiayu Hu
+ */
+ uint16_t gso_size;
+ /**< maximum size of an output GSO segment, including packet
+ * header and payload, measured in bytes.
+ */
+ uint8_t ipid_flag;
I'd suggest uint32_t flags (or even uint64_t).
Who knows what extra flags we'll need in future here.
Post by Jiayu Hu
+ /**< flag to indicate GSO uses fixed or incremental IP ids for
+ * IPv4 headers of output GSO segments.
+ */
+};
+
+/**
+ * Segmentation function, which supports processing of both single- and
+ * multi- segment packets.
+ *
+ * Note that we refer to the packets that are segmented from the input
+ * packet as 'GSO segments'. rte_gso_segment() assumes the input packet
+ * has correct checksums, and it doesn't update checksums for output
+ * GSO segments. Additionally, it doesn't process IP fragment packets.
+ *
+ * Each of the newly-created GSO segments is organized as a two-segment
+ * MBUF, where the first segment is a standard MBUF, which stores a copy
+ * of packet header, and the second is an indirect MBUF which points to
+ * a section of data in the input packet. Since each GSO segment has
+ * multiple MBUFs (i.e. 2 MBUFs), the driver of the interface which the
+ * GSO segments are sent to should support to transmit multi-segment
+ * packets.
+ *
+ * If the input packet is GSOed, its mbuf refcnt reduces by 1. Therefore,
+ * when all GSO segments are freed, the input packet is freed automatically.
+ *
+ * If the memory space in pkts_out or MBUF pools is insufficient, this
+ * function fails, and it returns (-1) * errno. Otherwise, GSO successes,
+ * and this function returns the number of output GSO segments filled in
+ * pkts_out.
+ *
+ * The packet mbuf to segment.
+ * GSO context object.
+ * Pointer array used to store the MBUF addresses of output GSO
+ * segments, when rte_gso_segment() successes.
+ * The max number of items that pkts_out can keep.
+ *
+ * - The number of GSO segments filled in pkts_out on success.
+ * - Return -ENOMEM if run out of memory in MBUF pools.
+ * - Return -EINVAL for invalid parameters.
+ */
+int rte_gso_segment(struct rte_mbuf *pkt,
+ struct rte_gso_ctx ctx,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_GSO_H_ */
Jiayu Hu
2017-09-13 02:11:07 UTC
Permalink
Hi Konstantin,

Thanks for your quick feedbacks. Replies are inline.

Thanks,
Jiayu
Post by Ananyev, Konstantin
Hi Jiayu,
Few comments from me inline.
Konstantin
Post by Jiayu Hu
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
new file mode 100644
index 0000000..dda50ee
--- /dev/null
+++ b/lib/librte_gso/rte_gso.c
@@ -0,0 +1,50 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+
+#include "rte_gso.h"
+
+int
+rte_gso_segment(struct rte_mbuf *pkt,
+ struct rte_gso_ctx gso_ctx __rte_unused,
No need to pass parameter by value here.
struct rte_gso_ctx *gso_ctx would do.
Even better - const struct rte_gso_ctx *, in case it doesn't need
to be updated inside that function.
Yes, agree. I will use rte_gso_ctx *gso_ctx instead.
Post by Ananyev, Konstantin
Post by Jiayu Hu
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ if (pkt == NULL || pkts_out == NULL || nb_pkts_out < 1)
+ return -EINVAL;
+
+ pkts_out[0] = pkt;
+
+ return 1;
+}
diff --git a/lib/librte_gso/rte_gso.h b/lib/librte_gso/rte_gso.h
new file mode 100644
index 0000000..db757d6
--- /dev/null
+++ b/lib/librte_gso/rte_gso.h
@@ -0,0 +1,133 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_GSO_H_
+#define _RTE_GSO_H_
+
+/**
+ * Interface to GSO library
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+/* GSO IP id flags for the IPv4 header */
+#define RTE_GSO_IPID_FIXED (1 << 0)
+/**< Use fixed IP ids for output GSO segments */
+#define RTE_GSO_IPID_INCREASE (1 << 1)
+/**< Use incremental IP ids for output GSO segments */
As values above are mutually exclusive, I think you don't need both flags.
Just one seems enough.
Agree, I will remove RTE_GSO_IPID_INCREASE.
Post by Ananyev, Konstantin
Post by Jiayu Hu
+
+/**
+ * GSO context structure.
+ */
+struct rte_gso_ctx {
+ struct rte_mempool *direct_pool;
+ /**< MBUF pool for allocating direct buffers, which are used
+ * to store packet headers for GSO segments.
+ */
+ struct rte_mempool *indirect_pool;
+ /**< MBUF pool for allocating indirect buffers, which are used
+ * to locate packet payloads for GSO segments. The indirect
+ * buffer doesn't contain any data, but simply points to an
+ * offset within the packet to segment.
+ */
+ uint32_t gso_types;
+ /**< packet types to perform GSO. For example, if applications
+ * want to segment TCP/IPv4 packets, may set (RTE_PTYPE_L2_ETHER |
+ * RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP) to gso_types.
Actually, after another thought - it probably should be not a ptype mask, but a
mask of the rte_ethdev DEV_TX_OFFLOAD_*_TSO flags that are used to advertise
real HW TSO offloads.
Let's say, for GSO that supports TSO over IPv4, it would be:
PKT_TX_TCP_SEG | PKT_TX_IPV4.
That would allow the user to use GSO and TSO in a transparent way.
Plus, ptype is not actually a proper bitmask, but a set of enums,
so it is not always possible to distinguish what ptype is supported just by a
bitmask.
Sorry for causing confusion here.
Yes, agree. Reusing packet_type indeed introduces lots of macros for
applications. DEV_TX_OFFLOAD_*_TSO is a better choice, and it also makes
HW offload and SW offload consistent.
Post by Ananyev, Konstantin
Post by Jiayu Hu
+ */
+ uint16_t gso_size;
+ /**< maximum size of an output GSO segment, including packet
+ * header and payload, measured in bytes.
+ */
+ uint8_t ipid_flag;
I'd suggest uint32_t flags (or even uint64_t).
Who knows what extra flags we'll need in future here.
Makes sense. I will use uint64_t.
Post by Ananyev, Konstantin
Post by Jiayu Hu
+ /**< flag to indicate GSO uses fixed or incremental IP ids for
+ * IPv4 headers of output GSO segments.
+ */
+};
+
+/**
+ * Segmentation function, which supports processing of both single- and
+ * multi- segment packets.
+ *
+ * Note that we refer to the packets that are segmented from the input
+ * packet as 'GSO segments'. rte_gso_segment() assumes the input packet
+ * has correct checksums, and it doesn't update checksums for output
+ * GSO segments. Additionally, it doesn't process IP fragment packets.
+ *
+ * Each of the newly-created GSO segments is organized as a two-segment
+ * MBUF, where the first segment is a standard MBUF, which stores a copy
+ * of packet header, and the second is an indirect MBUF which points to
+ * a section of data in the input packet. Since each GSO segment has
+ * multiple MBUFs (i.e. 2 MBUFs), the driver of the interface which the
+ * GSO segments are sent to should support to transmit multi-segment
+ * packets.
+ *
+ * If the input packet is GSOed, its mbuf refcnt reduces by 1. Therefore,
+ * when all GSO segments are freed, the input packet is freed automatically.
+ *
+ * If the memory space in pkts_out or MBUF pools is insufficient, this
+ * function fails, and it returns (-1) * errno. Otherwise, GSO successes,
+ * and this function returns the number of output GSO segments filled in
+ * pkts_out.
+ *
+ * The packet mbuf to segment.
+ * GSO context object.
+ * Pointer array used to store the MBUF addresses of output GSO
+ * segments, when rte_gso_segment() successes.
+ * The max number of items that pkts_out can keep.
+ *
+ * - The number of GSO segments filled in pkts_out on success.
+ * - Return -ENOMEM if run out of memory in MBUF pools.
+ * - Return -EINVAL for invalid parameters.
+ */
+int rte_gso_segment(struct rte_mbuf *pkt,
+ struct rte_gso_ctx ctx,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_GSO_H_ */
Ferruh Yigit
2017-09-14 18:33:12 UTC
Permalink
Post by Jiayu Hu
Generic Segmentation Offload (GSO) is a SW technique to split large
packets into small ones. Akin to TSO, GSO enables applications to
operate on large packets, thus reducing per-packet processing overhead.
To enable more flexibility to applications, DPDK GSO is implemented
as a standalone library. Applications explicitly use the GSO library
to segment packets. This patch introduces the GSO API framework to DPDK.
The GSO library provides a segmentation API, rte_gso_segment(), for
applications. It splits an input packet into small ones in each
invocation. The GSO library refers to these small packets generated
by rte_gso_segment() as GSO segments. Each of the newly-created GSO
segments is organized as a two-segment MBUF, where the first segment is a
standard MBUF, which stores a copy of packet header, and the second is an
indirect MBUF which points to a section of data in the input packet.
rte_gso_segment() reduces the refcnt of the input packet by 1. Therefore,
when all GSO segments are freed, the input packet is freed automatically.
Additionally, since each GSO segment has multiple MBUFs (i.e. 2 MBUFs),
the driver of the interface which the GSO segments are sent to should
support to transmit multi-segment packets.
---
config/common_base | 5 ++
lib/Makefile | 2 +
lib/librte_gso/Makefile | 49 ++++++++++++++
lib/librte_gso/rte_gso.c | 50 ++++++++++++++
lib/librte_gso/rte_gso.h | 133 +++++++++++++++++++++++++++++++++++++
lib/librte_gso/rte_gso_version.map | 7 ++
mk/rte.app.mk | 1 +
Can you please update the documentation for the new library:

- library documentation "doc/guides/prog_guide/xxx.rst"
- api documentation: doc/api/doxy-api.conf, doc/api/doxy-api-index.md
- release notes update to announce new library
- release notes, "Shared Library Versions" section with new library.

<...>
Hu, Jiayu
2017-09-15 01:12:16 UTC
Permalink
Hi Ferruh,
-----Original Message-----
From: Yigit, Ferruh
Sent: Friday, September 15, 2017 2:33 AM
Subject: Re: [dpdk-dev] [PATCH v3 1/5] gso: add Generic Segmentation
Offload API framework
Post by Jiayu Hu
Generic Segmentation Offload (GSO) is a SW technique to split large
packets into small ones. Akin to TSO, GSO enables applications to
operate on large packets, thus reducing per-packet processing overhead.
To enable more flexibility to applications, DPDK GSO is implemented
as a standalone library. Applications explicitly use the GSO library
to segment packets. This patch introduces the GSO API framework to DPDK.
The GSO library provides a segmentation API, rte_gso_segment(), for
applications. It splits an input packet into small ones in each
invocation. The GSO library refers to these small packets generated
by rte_gso_segment() as GSO segments. Each of the newly-created GSO
segments is organized as a two-segment MBUF, where the first segment is
a
Post by Jiayu Hu
standard MBUF, which stores a copy of packet header, and the second is an
indirect MBUF which points to a section of data in the input packet.
rte_gso_segment() reduces the refcnt of the input packet by 1. Therefore,
when all GSO segments are freed, the input packet is freed automatically.
Additionally, since each GSO segment has multiple MBUFs (i.e. 2 MBUFs),
the driver of the interface which the GSO segments are sent to should
support to transmit multi-segment packets.
---
config/common_base | 5 ++
lib/Makefile | 2 +
lib/librte_gso/Makefile | 49 ++++++++++++++
lib/librte_gso/rte_gso.c | 50 ++++++++++++++
lib/librte_gso/rte_gso.h | 133
+++++++++++++++++++++++++++++++++++++
Post by Jiayu Hu
lib/librte_gso/rte_gso_version.map | 7 ++
mk/rte.app.mk | 1 +
- library documentation "doc/guides/prog_guide/xxx.rst"
- api documentation: doc/api/doxy-api.conf, doc/api/doxy-api-index.md
- release notes update to announce new library
- release notes, "Shared Library Versions" section with new library.
Thanks for your reminder. I will update them in the next version.
Jiayu Hu
2017-09-12 02:43:29 UTC
Permalink
From: Mark Kavanagh <***@intel.com>

This patch adds GSO support for VxLAN-encapsulated packets. Supported
VxLAN packets must have an outer IPv4 header (prepended by an optional
VLAN tag), and contain an inner TCP/IPv4 packet (with an optional inner
VLAN tag).

VxLAN GSO assumes that all input packets have correct checksums and
doesn't update checksums for output packets. Additionally, it doesn't
process IP fragmented packets.

As with TCP/IPv4 GSO, VxLAN GSO uses a two-segment MBUF to organize each
output packet, which mandates support for multi-segment mbufs in the TX
functions of the NIC driver. Also, if a packet is GSOed, VxLAN GSO
reduces its MBUF refcnt by 1. As a result, when all of its GSOed
segments are freed, the packet is freed automatically.
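
A sketch of the corresponding application setup (the ctx variable is an
assumed, pre-existing rte_gso_ctx; the mbuf length-field convention follows
gso_tunnel_tcp4_segment() in this patch):

	/* Enable VxLAN GSO in addition to plain TCP/IPv4 GSO. */
	ctx.gso_types = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4 |
			RTE_PTYPE_L4_UDP | RTE_PTYPE_TUNNEL_VXLAN |
			RTE_PTYPE_INNER_L2_ETHER | RTE_PTYPE_INNER_L3_IPV4 |
			RTE_PTYPE_INNER_L4_TCP;

	/* Each input mbuf must also carry correct metadata: packet_type,
	 * outer_l2_len/outer_l3_len (outer Ethernet/IPv4), l2_len (UDP +
	 * VxLAN + inner Ethernet, per this library's convention), and
	 * l3_len/l4_len (inner IPv4/TCP).
	 */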

Signed-off-by: Mark Kavanagh <***@intel.com>
Signed-off-by: Jiayu Hu <***@intel.com>
---
lib/librte_gso/Makefile | 1 +
lib/librte_gso/gso_common.c | 48 ++++++++++++++++++++++-
lib/librte_gso/gso_common.h | 33 ++++++++++++++++
lib/librte_gso/gso_tunnel_tcp4.c | 85 ++++++++++++++++++++++++++++++++++++++++
lib/librte_gso/gso_tunnel_tcp4.h | 76 +++++++++++++++++++++++++++++++++++
lib/librte_gso/rte_gso.c | 7 +++-
6 files changed, 248 insertions(+), 2 deletions(-)
create mode 100644 lib/librte_gso/gso_tunnel_tcp4.c
create mode 100644 lib/librte_gso/gso_tunnel_tcp4.h

diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile
index 2be64d1..e6d41df 100644
--- a/lib/librte_gso/Makefile
+++ b/lib/librte_gso/Makefile
@@ -44,6 +44,7 @@ LIBABIVER := 1
SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c
SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_common.c
SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tcp4.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tunnel_tcp4.c

# install this header file
SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h
diff --git a/lib/librte_gso/gso_common.c b/lib/librte_gso/gso_common.c
index 7c32e03..c6779d0 100644
--- a/lib/librte_gso/gso_common.c
+++ b/lib/librte_gso/gso_common.c
@@ -39,6 +39,7 @@
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_tcp.h>
+#include <rte_udp.h>

#include "gso_common.h"

@@ -193,10 +194,55 @@ update_inner_tcp4_header(struct rte_mbuf *pkt, uint8_t ipid_delta,
}
}

+static inline void
+update_outer_ipv4_header(struct rte_mbuf *pkt, uint16_t id)
+{
+ struct ipv4_hdr *ipv4_hdr;
+
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ pkt->outer_l2_len);
+ ipv4_hdr->total_length = rte_cpu_to_be_16(pkt->pkt_len -
+ pkt->outer_l2_len);
+ ipv4_hdr->packet_id = rte_cpu_to_be_16(id);
+}
+
+static inline void
+update_outer_udp_header(struct rte_mbuf *pkt)
+{
+ struct udp_hdr *udp_hdr;
+ uint16_t length;
+
+ length = pkt->outer_l2_len + pkt->outer_l3_len;
+ udp_hdr = (struct udp_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ length);
+ udp_hdr->dgram_len = rte_cpu_to_be_16(pkt->pkt_len - length);
+}
+
+static inline void
+update_ipv4_vxlan_tcp4_header(struct rte_mbuf *pkt, uint8_t ipid_delta,
+ struct rte_mbuf **segs, uint16_t nb_segs)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ uint16_t i, id;
+
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ pkt->outer_l2_len);
+ id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
+ for (i = 0; i < nb_segs; i++) {
+ update_outer_ipv4_header(segs[i], id);
+ id += ipid_delta;
+ update_outer_udp_header(segs[i]);
+ }
+ /* Update inner TCP/IPv4 headers */
+ update_inner_tcp4_header(pkt, ipid_delta, segs, nb_segs);
+}
+
void
gso_update_pkt_headers(struct rte_mbuf *pkt, uint8_t ipid_delta,
struct rte_mbuf **segs, uint16_t nb_segs)
{
- if (is_ipv4_tcp(pkt->packet_type))
+ if (is_ipv4_vxlan_ipv4_tcp(pkt->packet_type))
+ update_ipv4_vxlan_tcp4_header(pkt, ipid_delta, segs, nb_segs);
+ else if (is_ipv4_tcp(pkt->packet_type))
update_inner_tcp4_header(pkt, ipid_delta, segs, nb_segs);
}
diff --git a/lib/librte_gso/gso_common.h b/lib/librte_gso/gso_common.h
index 3c76520..2377a1d 100644
--- a/lib/librte_gso/gso_common.h
+++ b/lib/librte_gso/gso_common.h
@@ -56,6 +56,39 @@ static inline uint8_t is_ipv4_tcp(uint32_t ptype)
}
}

+#define IS_INNER_IPV4_HDR(ptype) (((ptype) == RTE_PTYPE_INNER_L3_IPV4) | \
+ ((ptype) == RTE_PTYPE_INNER_L3_IPV4_EXT) | \
+ ((ptype) == RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN))
+
+#define ETHER_UDP_VXLAN_ETHER_TCP_PKT (RTE_PTYPE_L2_ETHER | \
+ RTE_PTYPE_L4_UDP | RTE_PTYPE_TUNNEL_VXLAN | \
+ RTE_PTYPE_INNER_L2_ETHER | RTE_PTYPE_INNER_L4_TCP)
+#define ETHER_VLAN_UDP_VXLAN_ETHER_TCP_PKT (RTE_PTYPE_L2_ETHER_VLAN | \
+ RTE_PTYPE_L4_UDP | RTE_PTYPE_TUNNEL_VXLAN | \
+ RTE_PTYPE_INNER_L2_ETHER | RTE_PTYPE_INNER_L4_TCP)
+#define ETHER_UDP_VXLAN_ETHER_VLAN_TCP_PKT (RTE_PTYPE_L2_ETHER | \
+ RTE_PTYPE_L4_UDP | RTE_PTYPE_TUNNEL_VXLAN | \
+ RTE_PTYPE_INNER_L2_ETHER_VLAN | RTE_PTYPE_INNER_L4_TCP)
+#define ETHER_VLAN_UDP_VXLAN_ETHER_VLAN_TCP_PKT (RTE_PTYPE_L2_ETHER_VLAN | \
+ RTE_PTYPE_L4_UDP | RTE_PTYPE_TUNNEL_VXLAN | \
+ RTE_PTYPE_INNER_L2_ETHER_VLAN | RTE_PTYPE_INNER_L4_TCP)
+static inline uint8_t is_ipv4_vxlan_ipv4_tcp(uint32_t ptype)
+{
+ uint32_t type;
+
+ type = ptype & (~(RTE_PTYPE_L3_MASK | RTE_PTYPE_INNER_L3_MASK));
+ switch (type) {
+ case ETHER_UDP_VXLAN_ETHER_TCP_PKT:
+ case ETHER_VLAN_UDP_VXLAN_ETHER_TCP_PKT:
+ case ETHER_UDP_VXLAN_ETHER_VLAN_TCP_PKT:
+ case ETHER_VLAN_UDP_VXLAN_ETHER_VLAN_TCP_PKT:
+ return (RTE_ETH_IS_IPV4_HDR(ptype) > 0) ?
+ IS_INNER_IPV4_HDR(ptype & RTE_PTYPE_INNER_L3_MASK) : 0;
+ default:
+ return 0;
+ }
+}
+
/**
* Internal function which updates relevant packet headers, following
* segmentation. This is required to update, for example, the IPv4
diff --git a/lib/librte_gso/gso_tunnel_tcp4.c b/lib/librte_gso/gso_tunnel_tcp4.c
new file mode 100644
index 0000000..8ca52d1
--- /dev/null
+++ b/lib/librte_gso/gso_tunnel_tcp4.c
@@ -0,0 +1,85 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <rte_ether.h>
+#include <rte_ip.h>
+
+#include "gso_common.h"
+#include "gso_tunnel_tcp4.h"
+
+int
+gso_tunnel_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ uint8_t ipid_delta,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ struct ipv4_hdr *inner_ipv4_hdr;
+ uint16_t pyld_unit_size, hdr_offset;
+ uint16_t tcp_dl;
+ int ret = 1;
+
+ hdr_offset = pkt->outer_l2_len + pkt->outer_l3_len + pkt->l2_len;
+ inner_ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ hdr_offset);
+ /*
+ * Don't process the packet whose DF bit of the inner IPv4
+ * header isn't set.
+ */
+ if (unlikely((inner_ipv4_hdr->fragment_offset & rte_cpu_to_be_16(
+ IPV4_HDR_DF_MASK)) == 0)) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ tcp_dl = rte_be_to_cpu_16(inner_ipv4_hdr->total_length) -
+ pkt->l3_len - pkt->l4_len;
+ /* Don't process the packet without data */
+ if (unlikely(tcp_dl == 0)) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ hdr_offset += pkt->l3_len + pkt->l4_len;
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
+
+ /* Segment the payload */
+ ret = gso_do_segment(pkt, hdr_offset, pyld_unit_size, direct_pool,
+ indirect_pool, pkts_out, nb_pkts_out);
+ if (ret > 1)
+ gso_update_pkt_headers(pkt, ipid_delta, pkts_out, ret);
+
+ return ret;
+}
diff --git a/lib/librte_gso/gso_tunnel_tcp4.h b/lib/librte_gso/gso_tunnel_tcp4.h
new file mode 100644
index 0000000..0280b38
--- /dev/null
+++ b/lib/librte_gso/gso_tunnel_tcp4.h
@@ -0,0 +1,76 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_TUNNEL_TCP4_H_
+#define _GSO_TUNNEL_TCP4_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+/**
+ * Segment a tunneled packet with inner TCP/IPv4 headers. This function
+ * assumes the input packet has correct checksums, and doesn't update
+ * checksums for GSO segments. Furthermore, it doesn't process IP
+ * fragmented packets.
+ *
+ * @param pkt
+ * The packet mbuf to segment.
+ * @param gso_size
+ * The max length of a GSO segment, measured in bytes.
+ * @param ipid_delta
+ * The increment unit for IP ids.
+ * @param direct_pool
+ * MBUF pool used for allocating direct buffers for output segments.
+ * @param indirect_pool
+ * MBUF pool used for allocating indirect buffers for output segments.
+ * @param pkts_out
+ * Pointer array used to store the MBUF addresses of output GSO
+ * segments, when gso_tunnel_tcp4_segment() succeeds. If the memory
+ * space in pkts_out is insufficient, gso_tunnel_tcp4_segment() fails
+ * and returns -EINVAL.
+ * @param nb_pkts_out
+ * The max number of items that 'pkts_out' can keep.
+ *
+ * @return
+ * - The number of GSO segments filled in pkts_out on success.
+ * - Return -ENOMEM if the MBUF pools run out of memory.
+ * - Return -EINVAL for invalid parameters.
+ */
+int gso_tunnel_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ uint8_t ipid_delta,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+#endif
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index 95f6ea6..226c75a 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -38,6 +38,7 @@
#include "rte_gso.h"
#include "gso_common.h"
#include "gso_tcp4.h"
+#include "gso_tunnel_tcp4.h"

int
rte_gso_segment(struct rte_mbuf *pkt,
@@ -66,7 +67,11 @@ rte_gso_segment(struct rte_mbuf *pkt,
gso_size = gso_ctx.gso_size;
ipid_delta = gso_ctx.ipid_flag == RTE_GSO_IPID_INCREASE;

- if (is_ipv4_tcp(pkt->packet_type)) {
+ if (is_ipv4_vxlan_ipv4_tcp(pkt->packet_type)) {
+ ret = gso_tunnel_tcp4_segment(pkt, gso_size, ipid_delta,
+ direct_pool, indirect_pool,
+ pkts_out, nb_pkts_out);
+ } else if (is_ipv4_tcp(pkt->packet_type)) {
ret = gso_tcp4_segment(pkt, gso_size, ipid_delta,
direct_pool, indirect_pool,
pkts_out, nb_pkts_out);
--
2.7.4
Jiayu Hu
2017-09-12 02:43:28 UTC
Permalink
This patch adds GSO support for TCP/IPv4 packets. Supported packets
may include a single VLAN tag. TCP/IPv4 GSO assumes that all input
packets have correct checksums, and doesn't update checksums for output
packets (the responsibility for this lies with the application).
Additionally, TCP/IPv4 GSO doesn't process IP fragmented packets.

TCP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one indirect
MBUF, to organize an output packet. Note that we refer to these two
chained MBUFs as a two-segment MBUF. The direct MBUF stores the packet
header, while the indirect mbuf simply points to a location within the
original packet's payload. Consequently, use of the GSO library requires
multi-segment MBUF support in the TX functions of the NIC driver.

If a packet is GSOed, TCP/IPv4 GSO reduces its MBUF refcnt by 1. As a
result, when all of its GSOed segments are freed, the packet is freed
automatically.
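
As a back-of-envelope example (assumed sizes): with gso_size = 1518B and
14B Ethernet + 20B IPv4 + 20B TCP headers, plus the 4B Ethernet CRC
allowance seen in the tunnel variant, each output segment carries up to
1460B of TCP payload, so a 64KB input packet produces about
ceil(65536 / 1460) = 45 two-segment MBUFs.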

Signed-off-by: Jiayu Hu <***@intel.com>
Signed-off-by: Mark Kavanagh <***@intel.com>
---
lib/librte_eal/common/include/rte_log.h | 1 +
lib/librte_gso/Makefile | 2 +
lib/librte_gso/gso_common.c | 202 ++++++++++++++++++++++++++++++++
lib/librte_gso/gso_common.h | 113 ++++++++++++++++++
lib/librte_gso/gso_tcp4.c | 83 +++++++++++++
lib/librte_gso/gso_tcp4.h | 76 ++++++++++++
lib/librte_gso/rte_gso.c | 41 ++++++-
7 files changed, 515 insertions(+), 3 deletions(-)
create mode 100644 lib/librte_gso/gso_common.c
create mode 100644 lib/librte_gso/gso_common.h
create mode 100644 lib/librte_gso/gso_tcp4.c
create mode 100644 lib/librte_gso/gso_tcp4.h

diff --git a/lib/librte_eal/common/include/rte_log.h b/lib/librte_eal/common/include/rte_log.h
index ec8dba7..2fa1199 100644
--- a/lib/librte_eal/common/include/rte_log.h
+++ b/lib/librte_eal/common/include/rte_log.h
@@ -87,6 +87,7 @@ extern struct rte_logs rte_logs;
#define RTE_LOGTYPE_CRYPTODEV 17 /**< Log related to cryptodev. */
#define RTE_LOGTYPE_EFD 18 /**< Log related to EFD. */
#define RTE_LOGTYPE_EVENTDEV 19 /**< Log related to eventdev. */
+#define RTE_LOGTYPE_GSO 20 /**< Log related to GSO. */

/* these log types can be used in an application */
#define RTE_LOGTYPE_USER1 24 /**< User-defined log type 1. */
diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile
index aeaacbc..2be64d1 100644
--- a/lib/librte_gso/Makefile
+++ b/lib/librte_gso/Makefile
@@ -42,6 +42,8 @@ LIBABIVER := 1

#source files
SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_common.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tcp4.c

# install this header file
SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h
diff --git a/lib/librte_gso/gso_common.c b/lib/librte_gso/gso_common.c
new file mode 100644
index 0000000..7c32e03
--- /dev/null
+++ b/lib/librte_gso/gso_common.c
@@ -0,0 +1,202 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdbool.h>
+#include <errno.h>
+
+#include <rte_memcpy.h>
+#include <rte_mempool.h>
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+
+#include "gso_common.h"
+
+static inline void
+hdr_segment_init(struct rte_mbuf *hdr_segment, struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset)
+{
+ /* Copy MBUF metadata */
+ hdr_segment->nb_segs = 1;
+ hdr_segment->port = pkt->port;
+ hdr_segment->ol_flags = pkt->ol_flags;
+ hdr_segment->packet_type = pkt->packet_type;
+ hdr_segment->pkt_len = pkt_hdr_offset;
+ hdr_segment->data_len = pkt_hdr_offset;
+ hdr_segment->tx_offload = pkt->tx_offload;
+
+ /* Copy the packet header */
+ rte_memcpy(rte_pktmbuf_mtod(hdr_segment, char *),
+ rte_pktmbuf_mtod(pkt, char *),
+ pkt_hdr_offset);
+}
+
+static inline void
+free_gso_segment(struct rte_mbuf **pkts, uint16_t nb_pkts)
+{
+ uint16_t i;
+
+ for (i = 0; i < nb_pkts; i++)
+ rte_pktmbuf_free(pkts[i]);
+}
+
+int
+gso_do_segment(struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset,
+ uint16_t pyld_unit_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ struct rte_mbuf *pkt_in;
+ struct rte_mbuf *hdr_segment, *pyld_segment, *prev_segment;
+ uint16_t pkt_in_data_pos, segment_bytes_remaining;
+ uint16_t pyld_len, nb_segs;
+ bool more_in_pkt, more_out_segs;
+
+ pkt_in = pkt;
+ nb_segs = 0;
+ more_in_pkt = 1;
+ pkt_in_data_pos = pkt_hdr_offset;
+
+ while (more_in_pkt) {
+ if (unlikely(nb_segs >= nb_pkts_out)) {
+ free_gso_segment(pkts_out, nb_segs);
+ return -EINVAL;
+ }
+
+ /* Allocate a direct MBUF */
+ hdr_segment = rte_pktmbuf_alloc(direct_pool);
+ if (unlikely(hdr_segment == NULL)) {
+ free_gso_segment(pkts_out, nb_segs);
+ return -ENOMEM;
+ }
+ /* Fill the packet header */
+ hdr_segment_init(hdr_segment, pkt, pkt_hdr_offset);
+
+ prev_segment = hdr_segment;
+ segment_bytes_remaining = pyld_unit_size;
+ more_out_segs = 1;
+
+ while (more_out_segs && more_in_pkt) {
+ /* Allocate an indirect MBUF */
+ pyld_segment = rte_pktmbuf_alloc(indirect_pool);
+ if (unlikely(pyld_segment == NULL)) {
+ rte_pktmbuf_free(hdr_segment);
+ free_gso_segment(pkts_out, nb_segs);
+ return -ENOMEM;
+ }
+ /* Attach to current MBUF segment of pkt */
+ rte_pktmbuf_attach(pyld_segment, pkt_in);
+
+ prev_segment->next = pyld_segment;
+ prev_segment = pyld_segment;
+
+ pyld_len = segment_bytes_remaining;
+ if (pyld_len + pkt_in_data_pos > pkt_in->data_len)
+ pyld_len = pkt_in->data_len - pkt_in_data_pos;
+
+ pyld_segment->data_off = pkt_in_data_pos +
+ pkt_in->data_off;
+ pyld_segment->data_len = pyld_len;
+
+ /* Update header segment */
+ hdr_segment->pkt_len += pyld_len;
+ hdr_segment->nb_segs++;
+
+ pkt_in_data_pos += pyld_len;
+ segment_bytes_remaining -= pyld_len;
+
+ /* Finish processing a MBUF segment of pkt */
+ if (pkt_in_data_pos == pkt_in->data_len) {
+ pkt_in = pkt_in->next;
+ pkt_in_data_pos = 0;
+ if (pkt_in == NULL)
+ more_in_pkt = 0;
+ }
+
+ /* Finish generating a GSO segment */
+ if (segment_bytes_remaining == 0)
+ more_out_segs = 0;
+ }
+ pkts_out[nb_segs++] = hdr_segment;
+ }
+ return nb_segs;
+}
+
+static inline void
+update_inner_tcp4_header(struct rte_mbuf *pkt, uint8_t ipid_delta,
+ struct rte_mbuf **segs, uint16_t nb_segs)
+{
+ struct tcp_hdr *tcp_hdr;
+ struct ipv4_hdr *ipv4_hdr;
+ struct rte_mbuf *seg;
+ uint32_t sent_seq;
+ uint16_t inner_l2_offset;
+ uint16_t id, i;
+
+ inner_l2_offset = pkt->outer_l2_len + pkt->outer_l3_len + pkt->l2_len;
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ inner_l2_offset);
+ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
+ id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
+ sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
+
+ for (i = 0; i < nb_segs; i++) {
+ seg = segs[i];
+ /* Update the inner IPv4 header */
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(seg, char *) +
+ inner_l2_offset);
+ ipv4_hdr->total_length = rte_cpu_to_be_16(seg->pkt_len -
+ inner_l2_offset);
+ ipv4_hdr->packet_id = rte_cpu_to_be_16(id);
+ id += ipid_delta;
+
+ /* Update the inner TCP header */
+ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + seg->l3_len);
+ tcp_hdr->sent_seq = rte_cpu_to_be_32(sent_seq);
+ if (likely(i < nb_segs - 1))
+ tcp_hdr->tcp_flags &= (~(TCP_HDR_PSH_MASK |
+ TCP_HDR_FIN_MASK));
+ sent_seq += (seg->pkt_len - seg->data_len);
+ }
+}
+
+void
+gso_update_pkt_headers(struct rte_mbuf *pkt, uint8_t ipid_delta,
+ struct rte_mbuf **segs, uint16_t nb_segs)
+{
+ if (is_ipv4_tcp(pkt->packet_type))
+ update_inner_tcp4_header(pkt, ipid_delta, segs, nb_segs);
+}
diff --git a/lib/librte_gso/gso_common.h b/lib/librte_gso/gso_common.h
new file mode 100644
index 0000000..3c76520
--- /dev/null
+++ b/lib/librte_gso/gso_common.h
@@ -0,0 +1,113 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_COMMON_H_
+#define _GSO_COMMON_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+#define IPV4_HDR_DF_SHIFT 14
+#define IPV4_HDR_DF_MASK (1 << IPV4_HDR_DF_SHIFT)
+
+#define TCP_HDR_PSH_MASK ((uint8_t)0x08)
+#define TCP_HDR_FIN_MASK ((uint8_t)0x01)
+
+#define ETHER_TCP_PKT (RTE_PTYPE_L2_ETHER | RTE_PTYPE_L4_TCP)
+#define ETHER_VLAN_TCP_PKT (RTE_PTYPE_L2_ETHER_VLAN | RTE_PTYPE_L4_TCP)
+static inline uint8_t is_ipv4_tcp(uint32_t ptype)
+{
+ switch (ptype & (~RTE_PTYPE_L3_MASK)) {
+ case ETHER_VLAN_TCP_PKT:
+ case ETHER_TCP_PKT:
+ return RTE_ETH_IS_IPV4_HDR(ptype);
+ default:
+ return 0;
+ }
+}
+
+/**
+ * Internal function which updates relevant packet headers, following
+ * segmentation. This is required to update, for example, the IPv4
+ * 'total_length' field, to reflect the reduced length of the now-
+ * segmented packet.
+ *
+ * @param pkt
+ * The original packet.
+ * @param ipid_delta
+ * The increment unit of IP ids.
+ * @param segs
+ * Pointer array used for storing mbuf addresses for GSO segments.
+ * @param nb_segs
+ * The number of GSO segments placed in segs.
+ */
+void gso_update_pkt_headers(struct rte_mbuf *pkt, uint8_t ipid_delta,
+ struct rte_mbuf **segs, uint16_t nb_segs);
+
+/**
+ * Internal function which divides the input packet into small segments.
+ * Each of the newly-created segments is organized as a two-segment MBUF,
+ * where the first segment is a standard mbuf, which stores a copy of
+ * packet header, and the second is an indirect mbuf which points to a
+ * section of data in the input packet.
+ *
+ * @param pkt
+ * Packet to segment.
+ * @param pkt_hdr_offset
+ * Packet header offset, measured in bytes.
+ * @param pyld_unit_size
+ * The max payload length of a GSO segment.
+ * @param direct_pool
+ * MBUF pool used for allocating direct buffers for output segments.
+ * @param indirect_pool
+ * MBUF pool used for allocating indirect buffers for output segments.
+ * @param pkts_out
+ * Pointer array used to keep the mbuf addresses of output segments. If
+ * the memory space in pkts_out is insufficient, gso_do_segment() fails
+ * and returns -EINVAL.
+ * @param nb_pkts_out
+ * The max number of items that pkts_out can keep.
+ *
+ * @return
+ * - The number of segments created in the event of success.
+ * - Return -ENOMEM if the MBUF pools run out of memory.
+ * - Return -EINVAL for invalid parameters.
+ */
+int gso_do_segment(struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset,
+ uint16_t pyld_unit_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+#endif
diff --git a/lib/librte_gso/gso_tcp4.c b/lib/librte_gso/gso_tcp4.c
new file mode 100644
index 0000000..8d4bfb2
--- /dev/null
+++ b/lib/librte_gso/gso_tcp4.c
@@ -0,0 +1,83 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <rte_ether.h>
+#include <rte_ip.h>
+
+#include "gso_common.h"
+#include "gso_tcp4.h"
+
+int
+gso_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ uint8_t ipid_delta,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ uint16_t tcp_dl;
+ uint16_t pyld_unit_size;
+ uint16_t hdr_offset;
+ int ret = 1;
+
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ pkt->l2_len);
+ /* Don't process the fragmented packet */
+ if (unlikely((ipv4_hdr->fragment_offset & rte_cpu_to_be_16(
+ IPV4_HDR_DF_MASK)) == 0)) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ tcp_dl = rte_be_to_cpu_16(ipv4_hdr->total_length) - pkt->l3_len -
+ pkt->l4_len;
+ /* Don't process the packet without data */
+ if (unlikely(tcp_dl == 0)) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
+
+ /* Segment the payload */
+ ret = gso_do_segment(pkt, hdr_offset, pyld_unit_size, direct_pool,
+ indirect_pool, pkts_out, nb_pkts_out);
+ if (ret > 1)
+ gso_update_pkt_headers(pkt, ipid_delta, pkts_out, ret);
+
+ return ret;
+}
diff --git a/lib/librte_gso/gso_tcp4.h b/lib/librte_gso/gso_tcp4.h
new file mode 100644
index 0000000..9c07984
--- /dev/null
+++ b/lib/librte_gso/gso_tcp4.h
@@ -0,0 +1,76 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_TCP4_H_
+#define _GSO_TCP4_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+/**
+ * Segment an IPv4/TCP packet. This function assumes the input packet has
+ * correct checksums and doesn't update checksums for GSO segments.
+ * Furthermore, it doesn't process IP fragment packets.
+ *
+ * @param pkt
+ * The packet mbuf to segment.
+ * @param gso_size
+ * The max length of a GSO segment, measured in bytes.
+ * @param ipid_delta
+ * The increment unit of IP ids.
+ * @param direct_pool
+ * MBUF pool used for allocating direct buffers for output segments.
+ * @param indirect_pool
+ * MBUF pool used for allocating indirect buffers for output segments.
+ * @param pkts_out
+ * Pointer array used to store the MBUF addresses of output GSO
+ * segments, when gso_tcp4_segment() succeeds. If the memory space in
+ * pkts_out is insufficient, gso_tcp4_segment() fails and returns
+ * -EINVAL.
+ * @param nb_pkts_out
+ * The max number of items that 'pkts_out' can keep.
+ *
+ * @return
+ * - The number of GSO segments filled in pkts_out on success.
+ * - Return -ENOMEM if the MBUF pools run out of memory.
+ * - Return -EINVAL for invalid parameters.
+ */
+int gso_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ uint8_t ipid_delta,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+
+#endif
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index dda50ee..95f6ea6 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -33,18 +33,53 @@

#include <errno.h>

+#include <rte_log.h>
+
#include "rte_gso.h"
+#include "gso_common.h"
+#include "gso_tcp4.h"

int
rte_gso_segment(struct rte_mbuf *pkt,
- struct rte_gso_ctx gso_ctx __rte_unused,
+ struct rte_gso_ctx gso_ctx,
struct rte_mbuf **pkts_out,
uint16_t nb_pkts_out)
{
+ struct rte_mempool *direct_pool, *indirect_pool;
+ struct rte_mbuf *pkt_seg;
+ uint16_t gso_size;
+ uint8_t ipid_delta;
+ int ret = 1;
+
if (pkt == NULL || pkts_out == NULL || nb_pkts_out < 1)
return -EINVAL;

- pkts_out[0] = pkt;
+ if (gso_ctx.gso_size >= pkt->pkt_len ||
+ (pkt->packet_type & gso_ctx.gso_types) !=
+ pkt->packet_type) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ direct_pool = gso_ctx.direct_pool;
+ indirect_pool = gso_ctx.indirect_pool;
+ gso_size = gso_ctx.gso_size;
+ ipid_delta = gso_ctx.ipid_flag == RTE_GSO_IPID_INCREASE;
+
+ if (is_ipv4_tcp(pkt->packet_type)) {
+ ret = gso_tcp4_segment(pkt, gso_size, ipid_delta,
+ direct_pool, indirect_pool,
+ pkts_out, nb_pkts_out);
+ } else
+ RTE_LOG(WARNING, GSO, "Unsupported packet type\n");
+
+ if (ret > 1) {
+ pkt_seg = pkt;
+ while (pkt_seg) {
+ rte_mbuf_refcnt_update(pkt_seg, -1);
+ pkt_seg = pkt_seg->next;
+ }
+ }

- return 1;
+ return ret;
}
--
2.7.4
Ananyev, Konstantin
2017-09-12 11:17:49 UTC
Hi Jiayu,
-----Original Message-----
From: Hu, Jiayu
Sent: Tuesday, September 12, 2017 3:43 AM
Subject: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
This patch adds GSO support for TCP/IPv4 packets. Supported packets
may include a single VLAN tag. TCP/IPv4 GSO assumes that all input
packets have correct checksums, and doesn't update checksums for output
packets (the responsibility for this lies with the application).
Probably it shouldn't say that checksums have to be valid, right?
As you don't update checksum(s) inside the lib - it probably doesn't matter.
Additionally, TCP/IPv4 GSO doesn't process IP fragmented packets.
TCP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one indirect
MBUF, to organize an output packet. Note that we refer to these two
chained MBUFs as a two-segment MBUF. The direct MBUF stores the packet
header, while the indirect mbuf simply points to a location within the
original packet's payload. Consequently, use of the GSO library requires
multi-segment MBUF support in the TX functions of the NIC driver.
If a packet is GSOed, TCP/IPv4 GSO reduces its MBUF refcnt by 1. As a
result, when all of its GSOed segments are freed, the packet is freed
automatically.
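(For illustration only -- a minimal usage sketch of this lifetime model,
not part of the patch; GSO_MAX_SEGS and the gso_ctx setup are placeholders
assumed to be provided by the application:)

	struct rte_mbuf *segs[GSO_MAX_SEGS];
	int i, nb;

	nb = rte_gso_segment(pkt, gso_ctx, segs, GSO_MAX_SEGS);
	if (nb > 1) {
		/* The library has already dropped one reference to pkt.
		 * Each output segment holds an indirect reference into
		 * pkt's payload, so freeing all of them (e.g. after
		 * transmission) releases pkt automatically. */
		for (i = 0; i < nb; i++)
			rte_pktmbuf_free(segs[i]);
	}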
---
lib/librte_eal/common/include/rte_log.h | 1 +
lib/librte_gso/Makefile | 2 +
lib/librte_gso/gso_common.c | 202 ++++++++++++++++++++++++++++++++
lib/librte_gso/gso_common.h | 113 ++++++++++++++++++
lib/librte_gso/gso_tcp4.c | 83 +++++++++++++
lib/librte_gso/gso_tcp4.h | 76 ++++++++++++
lib/librte_gso/rte_gso.c | 41 ++++++-
7 files changed, 515 insertions(+), 3 deletions(-)
create mode 100644 lib/librte_gso/gso_common.c
create mode 100644 lib/librte_gso/gso_common.h
create mode 100644 lib/librte_gso/gso_tcp4.c
create mode 100644 lib/librte_gso/gso_tcp4.h
diff --git a/lib/librte_eal/common/include/rte_log.h b/lib/librte_eal/common/include/rte_log.h
index ec8dba7..2fa1199 100644
--- a/lib/librte_eal/common/include/rte_log.h
+++ b/lib/librte_eal/common/include/rte_log.h
@@ -87,6 +87,7 @@ extern struct rte_logs rte_logs;
#define RTE_LOGTYPE_CRYPTODEV 17 /**< Log related to cryptodev. */
#define RTE_LOGTYPE_EFD 18 /**< Log related to EFD. */
#define RTE_LOGTYPE_EVENTDEV 19 /**< Log related to eventdev. */
+#define RTE_LOGTYPE_GSO 20 /**< Log related to GSO. */
/* these log types can be used in an application */
#define RTE_LOGTYPE_USER1 24 /**< User-defined log type 1. */
diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile
index aeaacbc..2be64d1 100644
--- a/lib/librte_gso/Makefile
+++ b/lib/librte_gso/Makefile
@@ -42,6 +42,8 @@ LIBABIVER := 1
#source files
SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_common.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tcp4.c
# install this header file
SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h
diff --git a/lib/librte_gso/gso_common.c b/lib/librte_gso/gso_common.c
new file mode 100644
index 0000000..7c32e03
--- /dev/null
+++ b/lib/librte_gso/gso_common.c
@@ -0,0 +1,202 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdbool.h>
+#include <errno.h>
+
+#include <rte_memcpy.h>
+#include <rte_mempool.h>
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+
+#include "gso_common.h"
+
+static inline void
+hdr_segment_init(struct rte_mbuf *hdr_segment, struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset)
+{
+ /* Copy MBUF metadata */
+ hdr_segment->nb_segs = 1;
+ hdr_segment->port = pkt->port;
+ hdr_segment->ol_flags = pkt->ol_flags;
+ hdr_segment->packet_type = pkt->packet_type;
+ hdr_segment->pkt_len = pkt_hdr_offset;
+ hdr_segment->data_len = pkt_hdr_offset;
+ hdr_segment->tx_offload = pkt->tx_offload;
+
+ /* Copy the packet header */
+ rte_memcpy(rte_pktmbuf_mtod(hdr_segment, char *),
+ rte_pktmbuf_mtod(pkt, char *),
+ pkt_hdr_offset);
+}
+
+static inline void
+free_gso_segment(struct rte_mbuf **pkts, uint16_t nb_pkts)
+{
+ uint16_t i;
+
+ for (i = 0; i < nb_pkts; i++)
+ rte_pktmbuf_free(pkts[i]);
+}
+
+int
+gso_do_segment(struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset,
+ uint16_t pyld_unit_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ struct rte_mbuf *pkt_in;
+ struct rte_mbuf *hdr_segment, *pyld_segment, *prev_segment;
+ uint16_t pkt_in_data_pos, segment_bytes_remaining;
+ uint16_t pyld_len, nb_segs;
+ bool more_in_pkt, more_out_segs;
+
+ pkt_in = pkt;
+ nb_segs = 0;
+ more_in_pkt = 1;
+ pkt_in_data_pos = pkt_hdr_offset;
+
+ while (more_in_pkt) {
+ if (unlikely(nb_segs >= nb_pkts_out)) {
+ free_gso_segment(pkts_out, nb_segs);
+ return -EINVAL;
+ }
+
+ /* Allocate a direct MBUF */
+ hdr_segment = rte_pktmbuf_alloc(direct_pool);
+ if (unlikely(hdr_segment == NULL)) {
+ free_gso_segment(pkts_out, nb_segs);
+ return -ENOMEM;
+ }
+ /* Fill the packet header */
+ hdr_segment_init(hdr_segment, pkt, pkt_hdr_offset);
+
+ prev_segment = hdr_segment;
+ segment_bytes_remaining = pyld_unit_size;
+ more_out_segs = 1;
+
+ while (more_out_segs && more_in_pkt) {
+ /* Allocate an indirect MBUF */
+ pyld_segment = rte_pktmbuf_alloc(indirect_pool);
+ if (unlikely(pyld_segment == NULL)) {
+ rte_pktmbuf_free(hdr_segment);
+ free_gso_segment(pkts_out, nb_segs);
+ return -ENOMEM;
+ }
+ /* Attach to current MBUF segment of pkt */
+ rte_pktmbuf_attach(pyld_segment, pkt_in);
+
+ prev_segment->next = pyld_segment;
+ prev_segment = pyld_segment;
+
+ pyld_len = segment_bytes_remaining;
+ if (pyld_len + pkt_in_data_pos > pkt_in->data_len)
+ pyld_len = pkt_in->data_len - pkt_in_data_pos;
+
+ pyld_segment->data_off = pkt_in_data_pos +
+ pkt_in->data_off;
+ pyld_segment->data_len = pyld_len;
+
+ /* Update header segment */
+ hdr_segment->pkt_len += pyld_len;
+ hdr_segment->nb_segs++;
+
+ pkt_in_data_pos += pyld_len;
+ segment_bytes_remaining -= pyld_len;
+
+ /* Finish processing a MBUF segment of pkt */
+ if (pkt_in_data_pos == pkt_in->data_len) {
+ pkt_in = pkt_in->next;
+ pkt_in_data_pos = 0;
+ if (pkt_in == NULL)
+ more_in_pkt = 0;
+ }
+
+ /* Finish generating a GSO segment */
+ if (segment_bytes_remaining == 0)
+ more_out_segs = 0;
+ }
+ pkts_out[nb_segs++] = hdr_segment;
+ }
+ return nb_segs;
+}
+
+static inline void
+update_inner_tcp4_header(struct rte_mbuf *pkt, uint8_t ipid_delta,
+ struct rte_mbuf **segs, uint16_t nb_segs)
+{
+ struct tcp_hdr *tcp_hdr;
+ struct ipv4_hdr *ipv4_hdr;
+ struct rte_mbuf *seg;
+ uint32_t sent_seq;
+ uint16_t inner_l2_offset;
+ uint16_t id, i;
+
+ inner_l2_offset = pkt->outer_l2_len + pkt->outer_l3_len + pkt->l2_len;
Shouldn't it be: pkt->l2_len here?
Or probably even better to pass l2_len as an input parameter.
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ inner_l2_offset);
+ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
+ id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
+ sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
+
+ for (i = 0; i < nb_segs; i++) {
+ seg = segs[i];
+ /* Update the inner IPv4 header */
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(seg, char *) +
+ inner_l2_offset);
+ ipv4_hdr->total_length = rte_cpu_to_be_16(seg->pkt_len -
+ inner_l2_offset);
+ ipv4_hdr->packet_id = rte_cpu_to_be_16(id);
+ id += ipid_delta;
+
+ /* Update the inner TCP header */
+ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + seg->l3_len);
+ tcp_hdr->sent_seq = rte_cpu_to_be_32(sent_seq);
+ if (likely(i < nb_segs - 1))
+ tcp_hdr->tcp_flags &= (~(TCP_HDR_PSH_MASK |
+ TCP_HDR_FIN_MASK));
+ sent_seq += (seg->pkt_len - seg->data_len);
+ }
+}
+
+void
+gso_update_pkt_headers(struct rte_mbuf *pkt, uint8_t ipid_delta,
+ struct rte_mbuf **segs, uint16_t nb_segs)
+{
+ if (is_ipv4_tcp(pkt->packet_type))
+ update_inner_tcp4_header(pkt, ipid_delta, segs, nb_segs);
+}
diff --git a/lib/librte_gso/gso_common.h b/lib/librte_gso/gso_common.h
new file mode 100644
index 0000000..3c76520
--- /dev/null
+++ b/lib/librte_gso/gso_common.h
@@ -0,0 +1,113 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_COMMON_H_
+#define _GSO_COMMON_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+#define IPV4_HDR_DF_SHIFT 14
We have that already defined in librte_net/rte_ip.h
+#define IPV4_HDR_DF_MASK (1 << IPV4_HDR_DF_SHIFT)
+
+#define TCP_HDR_PSH_MASK ((uint8_t)0x08)
+#define TCP_HDR_FIN_MASK ((uint8_t)0x01)
+
+#define ETHER_TCP_PKT (RTE_PTYPE_L2_ETHER | RTE_PTYPE_L4_TCP)
+#define ETHER_VLAN_TCP_PKT (RTE_PTYPE_L2_ETHER_VLAN | RTE_PTYPE_L4_TCP)
+static inline uint8_t is_ipv4_tcp(uint32_t ptype)
+{
+ switch (ptype & (~RTE_PTYPE_L3_MASK)) {
Why not just:
return RTE_ETH_IS_IPV4_HDR(ptype) && (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP;
?
+ case ETHER_VLAN_TCP_PKT:
+ case ETHER_TCP_PKT:
+ return RTE_ETH_IS_IPV4_HDR(ptype);
+ default:
+ return 0;
+ }
+}
+
+/**
+ * Internal function which updates relevant packet headers, following
+ * segmentation. This is required to update, for example, the IPv4
+ * 'total_length' field, to reflect the reduced length of the now-
+ * segmented packet.
+ *
+ * @param pkt
+ * The original packet.
+ * @param ipid_delta
+ * The increment unit of IP ids.
+ * @param segs
+ * Pointer array used for storing mbuf addresses for GSO segments.
+ * @param nb_segs
+ * The number of GSO segments placed in segs.
+ */
+void gso_update_pkt_headers(struct rte_mbuf *pkt, uint8_t ipid_delta,
+ struct rte_mbuf **segs, uint16_t nb_segs);
+
+/**
+ * Internal function which divides the input packet into small segments.
+ * Each of the newly-created segments is organized as a two-segment MBUF,
+ * where the first segment is a standard mbuf, which stores a copy of
+ * packet header, and the second is an indirect mbuf which points to a
+ * section of data in the input packet.
+ *
+ * @param pkt
+ * Packet to segment.
+ * @param pkt_hdr_offset
+ * Packet header offset, measured in bytes.
+ * @param pyld_unit_size
+ * The max payload length of a GSO segment.
+ * @param direct_pool
+ * MBUF pool used for allocating direct buffers for output segments.
+ * @param indirect_pool
+ * MBUF pool used for allocating indirect buffers for output segments.
+ * @param pkts_out
+ * Pointer array used to keep the mbuf addresses of output segments. If
+ * the memory space in pkts_out is insufficient, gso_do_segment() fails
+ * and returns -EINVAL.
+ * @param nb_pkts_out
+ * The max number of items that pkts_out can keep.
+ *
+ * @return
+ * - The number of segments created in the event of success.
+ * - Return -ENOMEM if the MBUF pools run out of memory.
+ * - Return -EINVAL for invalid parameters.
+ */
+int gso_do_segment(struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset,
+ uint16_t pyld_unit_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+#endif
diff --git a/lib/librte_gso/gso_tcp4.c b/lib/librte_gso/gso_tcp4.c
new file mode 100644
index 0000000..8d4bfb2
--- /dev/null
+++ b/lib/librte_gso/gso_tcp4.c
@@ -0,0 +1,83 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <rte_ether.h>
+#include <rte_ip.h>
+
+#include "gso_common.h"
+#include "gso_tcp4.h"
+
+int
+gso_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ uint8_t ipid_delta,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ uint16_t tcp_dl;
+ uint16_t pyld_unit_size;
+ uint16_t hdr_offset;
+ int ret = 1;
+
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ pkt->l2_len);
+ /* Don't process the fragmented packet */
+ if (unlikely((ipv4_hdr->fragment_offset & rte_cpu_to_be_16(
+ IPV4_HDR_DF_MASK)) == 0)) {
It is not a check for a fragmented packet - it is a check that fragmentation is allowed for that packet.
Should be IPV4_HDR_DF_MASK - 1, I think.
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ tcp_dl = rte_be_to_cpu_16(ipv4_hdr->total_length) - pkt->l3_len -
+ pkt->l4_len;
Why not use pkt->pkt_len - pkt->l2_len - pkt->l3_len - pkt->l4_len?
+ /* Don't process the packet without data */
+ if (unlikely(tcp_dl == 0)) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
Hmm, why do we need to count CRC_LEN here?
+
+ /* Segment the payload */
+ ret = gso_do_segment(pkt, hdr_offset, pyld_unit_size, direct_pool,
+ indirect_pool, pkts_out, nb_pkts_out);
+ if (ret > 1)
+ gso_update_pkt_headers(pkt, ipid_delta, pkts_out, ret);
+
+ return ret;
+}
diff --git a/lib/librte_gso/gso_tcp4.h b/lib/librte_gso/gso_tcp4.h
new file mode 100644
index 0000000..9c07984
--- /dev/null
+++ b/lib/librte_gso/gso_tcp4.h
@@ -0,0 +1,76 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_TCP4_H_
+#define _GSO_TCP4_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+/**
+ * Segment an IPv4/TCP packet. This function assumes the input packet has
+ * correct checksums and doesn't update checksums for GSO segments.
+ * Furthermore, it doesn't process IP fragment packets.
+ *
+ * @param pkt
+ * The packet mbuf to segment.
+ * @param gso_size
+ * The max length of a GSO segment, measured in bytes.
+ * @param ipid_delta
+ * The increment unit of IP ids.
+ * @param direct_pool
+ * MBUF pool used for allocating direct buffers for output segments.
+ * @param indirect_pool
+ * MBUF pool used for allocating indirect buffers for output segments.
+ * @param pkts_out
+ * Pointer array used to store the MBUF addresses of output GSO
+ * segments, when gso_tcp4_segment() succeeds. If the memory space in
+ * pkts_out is insufficient, gso_tcp4_segment() fails and returns
+ * -EINVAL.
+ * @param nb_pkts_out
+ * The max number of items that 'pkts_out' can keep.
+ *
+ * @return
+ * - The number of GSO segments filled in pkts_out on success.
+ * - Return -ENOMEM if the MBUF pools run out of memory.
+ * - Return -EINVAL for invalid parameters.
+ */
+int gso_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ uint8_t ipid_delta,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+
+#endif
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index dda50ee..95f6ea6 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -33,18 +33,53 @@
#include <errno.h>
+#include <rte_log.h>
+
#include "rte_gso.h"
+#include "gso_common.h"
+#include "gso_tcp4.h"
int
rte_gso_segment(struct rte_mbuf *pkt,
- struct rte_gso_ctx gso_ctx __rte_unused,
+ struct rte_gso_ctx gso_ctx,
struct rte_mbuf **pkts_out,
uint16_t nb_pkts_out)
{
+ struct rte_mempool *direct_pool, *indirect_pool;
+ struct rte_mbuf *pkt_seg;
+ uint16_t gso_size;
+ uint8_t ipid_delta;
+ int ret = 1;
+
if (pkt == NULL || pkts_out == NULL || nb_pkts_out < 1)
return -EINVAL;
- pkts_out[0] = pkt;
+ if (gso_ctx.gso_size >= pkt->pkt_len ||
+ (pkt->packet_type & gso_ctx.gso_types) !=
+ pkt->packet_type) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ direct_pool = gso_ctx.direct_pool;
+ indirect_pool = gso_ctx.indirect_pool;
+ gso_size = gso_ctx.gso_size;
+ ipid_delta = gso_ctx.ipid_flag == RTE_GSO_IPID_INCREASE;
+
+ if (is_ipv4_tcp(pkt->packet_type)) {
Probably we need here:
if (is_ipv4_tcp(pkt->packet_type) && (gso_ctx.gso_types & DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
+ ret = gso_tcp4_segment(pkt, gso_size, ipid_delta,
+ direct_pool, indirect_pool,
+ pkts_out, nb_pkts_out);
+ } else
+ RTE_LOG(WARNING, GSO, "Unsupported packet type\n");
Shouldn't we do pkts_out[0] = pkt; here?
+
+ if (ret > 1) {
+ pkt_seg = pkt;
+ while (pkt_seg) {
+ rte_mbuf_refcnt_update(pkt_seg, -1);
+ pkt_seg = pkt_seg->next;
+ }
+ }
- return 1;
+ return ret;
}
--
2.7.4
Jiayu Hu
2017-09-13 02:48:01 UTC
Hi Konstantin,
Post by Ananyev, Konstantin
Hi Jiayu,
-----Original Message-----
From: Hu, Jiayu
Sent: Tuesday, September 12, 2017 3:43 AM
Subject: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
This patch adds GSO support for TCP/IPv4 packets. Supported packets
may include a single VLAN tag. TCP/IPv4 GSO assumes that all input
packets have correct checksums, and doesn't update checksums for output
packets (the responsibility for this lies with the application).
Probably it shouldn't say that checksums have to be valid, right?
As you don't update checksum(s) inside the lib - it probably doesn't matter.
Yes, you are right. It's better to use:
"TCP/IPv4 GSO doesn't check if checksums are correct and doesn't update
checksums for output packets".
Post by Ananyev, Konstantin
Additionally, TCP/IPv4 GSO doesn't process IP fragmented packets.
TCP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one indirect
MBUF, to organize an output packet. Note that we refer to these two
chained MBUFs as a two-segment MBUF. The direct MBUF stores the packet
header, while the indirect mbuf simply points to a location within the
original packet's payload. Consequently, use of the GSO library requires
multi-segment MBUF support in the TX functions of the NIC driver.
If a packet is GSOed, TCP/IPv4 GSO reduces its MBUF refcnt by 1. As a
result, when all of its GSOed segments are freed, the packet is freed
automatically.
---
lib/librte_eal/common/include/rte_log.h | 1 +
lib/librte_gso/Makefile | 2 +
lib/librte_gso/gso_common.c | 202 ++++++++++++++++++++++++++++++++
lib/librte_gso/gso_common.h | 113 ++++++++++++++++++
lib/librte_gso/gso_tcp4.c | 83 +++++++++++++
lib/librte_gso/gso_tcp4.h | 76 ++++++++++++
lib/librte_gso/rte_gso.c | 41 ++++++-
7 files changed, 515 insertions(+), 3 deletions(-)
create mode 100644 lib/librte_gso/gso_common.c
create mode 100644 lib/librte_gso/gso_common.h
create mode 100644 lib/librte_gso/gso_tcp4.c
create mode 100644 lib/librte_gso/gso_tcp4.h
diff --git a/lib/librte_eal/common/include/rte_log.h b/lib/librte_eal/common/include/rte_log.h
index ec8dba7..2fa1199 100644
--- a/lib/librte_eal/common/include/rte_log.h
+++ b/lib/librte_eal/common/include/rte_log.h
@@ -87,6 +87,7 @@ extern struct rte_logs rte_logs;
#define RTE_LOGTYPE_CRYPTODEV 17 /**< Log related to cryptodev. */
#define RTE_LOGTYPE_EFD 18 /**< Log related to EFD. */
#define RTE_LOGTYPE_EVENTDEV 19 /**< Log related to eventdev. */
+#define RTE_LOGTYPE_GSO 20 /**< Log related to GSO. */
/* these log types can be used in an application */
#define RTE_LOGTYPE_USER1 24 /**< User-defined log type 1. */
diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile
index aeaacbc..2be64d1 100644
--- a/lib/librte_gso/Makefile
+++ b/lib/librte_gso/Makefile
@@ -42,6 +42,8 @@ LIBABIVER := 1
#source files
SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_common.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tcp4.c
# install this header file
SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h
diff --git a/lib/librte_gso/gso_common.c b/lib/librte_gso/gso_common.c
new file mode 100644
index 0000000..7c32e03
--- /dev/null
+++ b/lib/librte_gso/gso_common.c
@@ -0,0 +1,202 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdbool.h>
+#include <errno.h>
+
+#include <rte_memcpy.h>
+#include <rte_mempool.h>
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+
+#include "gso_common.h"
+
+static inline void
+hdr_segment_init(struct rte_mbuf *hdr_segment, struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset)
+{
+ /* Copy MBUF metadata */
+ hdr_segment->nb_segs = 1;
+ hdr_segment->port = pkt->port;
+ hdr_segment->ol_flags = pkt->ol_flags;
+ hdr_segment->packet_type = pkt->packet_type;
+ hdr_segment->pkt_len = pkt_hdr_offset;
+ hdr_segment->data_len = pkt_hdr_offset;
+ hdr_segment->tx_offload = pkt->tx_offload;
+
+ /* Copy the packet header */
+ rte_memcpy(rte_pktmbuf_mtod(hdr_segment, char *),
+ rte_pktmbuf_mtod(pkt, char *),
+ pkt_hdr_offset);
+}
+
+static inline void
+free_gso_segment(struct rte_mbuf **pkts, uint16_t nb_pkts)
+{
+ uint16_t i;
+
+ for (i = 0; i < nb_pkts; i++)
+ rte_pktmbuf_free(pkts[i]);
+}
+
+int
+gso_do_segment(struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset,
+ uint16_t pyld_unit_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ struct rte_mbuf *pkt_in;
+ struct rte_mbuf *hdr_segment, *pyld_segment, *prev_segment;
+ uint16_t pkt_in_data_pos, segment_bytes_remaining;
+ uint16_t pyld_len, nb_segs;
+ bool more_in_pkt, more_out_segs;
+
+ pkt_in = pkt;
+ nb_segs = 0;
+ more_in_pkt = 1;
+ pkt_in_data_pos = pkt_hdr_offset;
+
+ while (more_in_pkt) {
+ if (unlikely(nb_segs >= nb_pkts_out)) {
+ free_gso_segment(pkts_out, nb_segs);
+ return -EINVAL;
+ }
+
+ /* Allocate a direct MBUF */
+ hdr_segment = rte_pktmbuf_alloc(direct_pool);
+ if (unlikely(hdr_segment == NULL)) {
+ free_gso_segment(pkts_out, nb_segs);
+ return -ENOMEM;
+ }
+ /* Fill the packet header */
+ hdr_segment_init(hdr_segment, pkt, pkt_hdr_offset);
+
+ prev_segment = hdr_segment;
+ segment_bytes_remaining = pyld_unit_size;
+ more_out_segs = 1;
+
+ while (more_out_segs && more_in_pkt) {
+ /* Allocate an indirect MBUF */
+ pyld_segment = rte_pktmbuf_alloc(indirect_pool);
+ if (unlikely(pyld_segment == NULL)) {
+ rte_pktmbuf_free(hdr_segment);
+ free_gso_segment(pkts_out, nb_segs);
+ return -ENOMEM;
+ }
+ /* Attach to current MBUF segment of pkt */
+ rte_pktmbuf_attach(pyld_segment, pkt_in);
+
+ prev_segment->next = pyld_segment;
+ prev_segment = pyld_segment;
+
+ pyld_len = segment_bytes_remaining;
+ if (pyld_len + pkt_in_data_pos > pkt_in->data_len)
+ pyld_len = pkt_in->data_len - pkt_in_data_pos;
+
+ pyld_segment->data_off = pkt_in_data_pos +
+ pkt_in->data_off;
+ pyld_segment->data_len = pyld_len;
+
+ /* Update header segment */
+ hdr_segment->pkt_len += pyld_len;
+ hdr_segment->nb_segs++;
+
+ pkt_in_data_pos += pyld_len;
+ segment_bytes_remaining -= pyld_len;
+
+ /* Finish processing a MBUF segment of pkt */
+ if (pkt_in_data_pos == pkt_in->data_len) {
+ pkt_in = pkt_in->next;
+ pkt_in_data_pos = 0;
+ if (pkt_in == NULL)
+ more_in_pkt = 0;
+ }
+
+ /* Finish generating a GSO segment */
+ if (segment_bytes_remaining == 0)
+ more_out_segs = 0;
+ }
+ pkts_out[nb_segs++] = hdr_segment;
+ }
+ return nb_segs;
+}
+
+static inline void
+update_inner_tcp4_header(struct rte_mbuf *pkt, uint8_t ipid_delta,
+ struct rte_mbuf **segs, uint16_t nb_segs)
+{
+ struct tcp_hdr *tcp_hdr;
+ struct ipv4_hdr *ipv4_hdr;
+ struct rte_mbuf *seg;
+ uint32_t sent_seq;
+ uint16_t inner_l2_offset;
+ uint16_t id, i;
+
+ inner_l2_offset = pkt->outer_l2_len + pkt->outer_l3_len + pkt->l2_len;
Shouldn't it be: pkt->l2_len here?
Or probably even better to pass l2_len as an input parameter.
Oh, yes. Applications won't guarantee that outer_l2_len and outer_l3_len are 0
for non-tunneled packets. I will add l2_len as a parameter instead, along the
lines sketched below.
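Something like this, for example (a sketch of the planned change, not the
final code -- the new l2_offset parameter is the addition):

	static inline void
	update_inner_tcp4_header(struct rte_mbuf *pkt, uint8_t ipid_delta,
			struct rte_mbuf **segs, uint16_t nb_segs,
			uint16_t l2_offset);

	/* the plain TCP/IPv4 caller passes the offset explicitly */
	update_inner_tcp4_header(pkt, ipid_delta, pkts_out, ret,
			pkt->l2_len);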
Post by Ananyev, Konstantin
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ inner_l2_offset);
+ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
+ id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
+ sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
+
+ for (i = 0; i < nb_segs; i++) {
+ seg = segs[i];
+ /* Update the inner IPv4 header */
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(seg, char *) +
+ inner_l2_offset);
+ ipv4_hdr->total_length = rte_cpu_to_be_16(seg->pkt_len -
+ inner_l2_offset);
+ ipv4_hdr->packet_id = rte_cpu_to_be_16(id);
+ id += ipid_delta;
+
+ /* Update the inner TCP header */
+ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + seg->l3_len);
+ tcp_hdr->sent_seq = rte_cpu_to_be_32(sent_seq);
+ if (likely(i < nb_segs - 1))
+ tcp_hdr->tcp_flags &= (~(TCP_HDR_PSH_MASK |
+ TCP_HDR_FIN_MASK));
+ sent_seq += (seg->pkt_len - seg->data_len);
+ }
+}
+
+void
+gso_update_pkt_headers(struct rte_mbuf *pkt, uint8_t ipid_delta,
+ struct rte_mbuf **segs, uint16_t nb_segs)
+{
+ if (is_ipv4_tcp(pkt->packet_type))
+ update_inner_tcp4_header(pkt, ipid_delta, segs, nb_segs);
+}
diff --git a/lib/librte_gso/gso_common.h b/lib/librte_gso/gso_common.h
new file mode 100644
index 0000000..3c76520
--- /dev/null
+++ b/lib/librte_gso/gso_common.h
@@ -0,0 +1,113 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_COMMON_H_
+#define _GSO_COMMON_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+#define IPV4_HDR_DF_SHIFT 14
We have that already defined in librte_net/rte_ip.h
Yes. I will remove it here.
Post by Ananyev, Konstantin
+#define IPV4_HDR_DF_MASK (1 << IPV4_HDR_DF_SHIFT)
+
+#define TCP_HDR_PSH_MASK ((uint8_t)0x08)
+#define TCP_HDR_FIN_MASK ((uint8_t)0x01)
+
+#define ETHER_TCP_PKT (RTE_PTYPE_L2_ETHER | RTE_PTYPE_L4_TCP)
+#define ETHER_VLAN_TCP_PKT (RTE_PTYPE_L2_ETHER_VLAN | RTE_PTYPE_L4_TCP)
+static inline uint8_t is_ipv4_tcp(uint32_t ptype)
+{
+ switch (ptype & (~RTE_PTYPE_L3_MASK)) {
Why not just:
return RTE_ETH_IS_IPV4_HDR(ptype) && (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP;
?
Yes, we don't need to check if the packet is VLAN encapsulated, so the helper
collapses to the sketch below.
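That is, something like (a sketch of the simplified helper):

	static inline uint8_t
	is_ipv4_tcp(uint32_t ptype)
	{
		return RTE_ETH_IS_IPV4_HDR(ptype) &&
			(ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP;
	}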
Post by Ananyev, Konstantin
+ case ETHER_VLAN_TCP_PKT:
+ case ETHER_TCP_PKT:
+ return RTE_ETH_IS_IPV4_HDR(ptype);
+ default:
+ return 0;
+ }
+}
+
+/**
+ * Internal function which updates relevant packet headers, following
+ * segmentation. This is required to update, for example, the IPv4
+ * 'total_length' field, to reflect the reduced length of the now-
+ * segmented packet.
+ *
+ * @param pkt
+ * The original packet.
+ * @param ipid_delta
+ * The increment unit of IP ids.
+ * @param segs
+ * Pointer array used for storing mbuf addresses for GSO segments.
+ * @param nb_segs
+ * The number of GSO segments placed in segs.
+ */
+void gso_update_pkt_headers(struct rte_mbuf *pkt, uint8_t ipid_delta,
+ struct rte_mbuf **segs, uint16_t nb_segs);
+
+/**
+ * Internal function which divides the input packet into small segments.
+ * Each of the newly-created segments is organized as a two-segment MBUF,
+ * where the first segment is a standard mbuf, which stores a copy of
+ * packet header, and the second is an indirect mbuf which points to a
+ * section of data in the input packet.
+ *
+ * @param pkt
+ * Packet to segment.
+ * @param pkt_hdr_offset
+ * Packet header offset, measured in bytes.
+ * @param pyld_unit_size
+ * The max payload length of a GSO segment.
+ * @param direct_pool
+ * MBUF pool used for allocating direct buffers for output segments.
+ * @param indirect_pool
+ * MBUF pool used for allocating indirect buffers for output segments.
+ * @param pkts_out
+ * Pointer array used to keep the mbuf addresses of output segments. If
+ * the memory space in pkts_out is insufficient, gso_do_segment() fails
+ * and returns -EINVAL.
+ * @param nb_pkts_out
+ * The max number of items that pkts_out can keep.
+ *
+ * @return
+ * - The number of segments created in the event of success.
+ * - Return -ENOMEM if the MBUF pools run out of memory.
+ * - Return -EINVAL for invalid parameters.
+ */
+int gso_do_segment(struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset,
+ uint16_t pyld_unit_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+#endif
diff --git a/lib/librte_gso/gso_tcp4.c b/lib/librte_gso/gso_tcp4.c
new file mode 100644
index 0000000..8d4bfb2
--- /dev/null
+++ b/lib/librte_gso/gso_tcp4.c
@@ -0,0 +1,83 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <rte_ether.h>
+#include <rte_ip.h>
+
+#include "gso_common.h"
+#include "gso_tcp4.h"
+
+int
+gso_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ uint8_t ipid_delta,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ uint16_t tcp_dl;
+ uint16_t pyld_unit_size;
+ uint16_t hdr_offset;
+ int ret = 1;
+
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ pkt->l2_len);
+ /* Don't process the fragmented packet */
+ if (unlikely((ipv4_hdr->fragment_offset & rte_cpu_to_be_16(
+ IPV4_HDR_DF_MASK)) == 0)) {
It is not a check for a fragmented packet - it is a check that fragmentation is allowed for that packet.
Should be IPV4_HDR_DF_MASK - 1, I think.
IMO, IPV4_HDR_DF_MASK, whose value is (1 << 14), is used to extract the DF
bit. It's a CPU-byte-order value, but ipv4_hdr->fragment_offset is in
big-endian order. So the DF bit should be read as "ipv4_hdr->fragment_offset
& rte_cpu_to_be_16(IPV4_HDR_DF_MASK)". If this value is 0, the DF bit is
cleared and the input packet may be fragmented.
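In code terms (a short sketch of the two different checks; 0x3fff is the MF
bit plus the 13-bit fragment offset field):

	/* DF bit cleared -> fragmentation is permitted (current check) */
	int df_clear = (ipv4_hdr->fragment_offset &
			rte_cpu_to_be_16(IPV4_HDR_DF_MASK)) == 0;

	/* MF bit set or non-zero offset -> packet is itself a fragment */
	int is_frag = (ipv4_hdr->fragment_offset &
			rte_cpu_to_be_16(0x3fff)) != 0;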
Post by Ananyev, Konstantin
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ tcp_dl = rte_be_to_cpu_16(ipv4_hdr->total_length) - pkt->l3_len -
+ pkt->l4_len;
Why not use pkt->pkt_len - pkt->l2_len - pkt->l3_len - pkt->l4_len?
Yes, we can use pkt->pkt_len - pkt->l2_len - pkt->l3_len - pkt->l4_len here.
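i.e. (a sketch):

	tcp_dl = pkt->pkt_len - pkt->l2_len - pkt->l3_len - pkt->l4_len;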
Post by Ananyev, Konstantin
+ /* Don't process the packet without data */
+ if (unlikely(tcp_dl == 0)) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
Hmm, why do we need to count CRC_LEN here?
Yes, we shouldn't count ETHER_CRC_LEN here; its length should be treated as
already included in gso_size, as sketched below.
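So the computation reduces to (a sketch):

	pyld_unit_size = gso_size - hdr_offset;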
Post by Ananyev, Konstantin
+
+ /* Segment the payload */
+ ret = gso_do_segment(pkt, hdr_offset, pyld_unit_size, direct_pool,
+ indirect_pool, pkts_out, nb_pkts_out);
+ if (ret > 1)
+ gso_update_pkt_headers(pkt, ipid_delta, pkts_out, ret);
+
+ return ret;
+}
diff --git a/lib/librte_gso/gso_tcp4.h b/lib/librte_gso/gso_tcp4.h
new file mode 100644
index 0000000..9c07984
--- /dev/null
+++ b/lib/librte_gso/gso_tcp4.h
@@ -0,0 +1,76 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_TCP4_H_
+#define _GSO_TCP4_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+/**
+ * Segment an IPv4/TCP packet. This function assumes the input packet has
+ * correct checksums and doesn't update checksums for GSO segments.
+ * Furthermore, it doesn't process IP fragmented packets.
+ *
+ * @param pkt
+ *  The packet mbuf to segment.
+ * @param gso_size
+ *  The max length of a GSO segment, measured in bytes.
+ * @param ipid_delta
+ *  The increment unit of IP ids.
+ * @param direct_pool
+ *  MBUF pool used for allocating direct buffers for output segments.
+ * @param indirect_pool
+ *  MBUF pool used for allocating indirect buffers for output segments.
+ * @param pkts_out
+ *  Pointer array used to store the MBUF addresses of output GSO
+ *  segments, when gso_tcp4_segment() succeeds. If the memory space in
+ *  pkts_out is insufficient, gso_tcp4_segment() fails and returns
+ *  -EINVAL.
+ * @param nb_pkts_out
+ *  The max number of items that 'pkts_out' can keep.
+ *
+ * @return
+ *  - The number of GSO segments filled in pkts_out on success.
+ *  - Return -ENOMEM if the MBUF pools run out of memory.
+ *  - Return -EINVAL for invalid parameters.
+ */
+int gso_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ uint8_t ipid_delta,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+
+#endif
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index dda50ee..95f6ea6 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -33,18 +33,53 @@
#include <errno.h>
+#include <rte_log.h>
+
#include "rte_gso.h"
+#include "gso_common.h"
+#include "gso_tcp4.h"
int
rte_gso_segment(struct rte_mbuf *pkt,
- struct rte_gso_ctx gso_ctx __rte_unused,
+ struct rte_gso_ctx gso_ctx,
struct rte_mbuf **pkts_out,
uint16_t nb_pkts_out)
{
+ struct rte_mempool *direct_pool, *indirect_pool;
+ struct rte_mbuf *pkt_seg;
+ uint16_t gso_size;
+ uint8_t ipid_delta;
+ int ret = 1;
+
if (pkt == NULL || pkts_out == NULL || nb_pkts_out < 1)
return -EINVAL;
- pkts_out[0] = pkt;
+ if (gso_ctx.gso_size >= pkt->pkt_len ||
+ (pkt->packet_type & gso_ctx.gso_types) !=
+ pkt->packet_type) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ direct_pool = gso_ctx.direct_pool;
+ indirect_pool = gso_ctx.indirect_pool;
+ gso_size = gso_ctx.gso_size;
+ ipid_delta = gso_ctx.ipid_flag == RTE_GSO_IPID_INCREASE;
+
+ if (is_ipv4_tcp(pkt->packet_type)) {
If (is_ipv4_tcp(pkt->packet_type) && (gso_ctx->gso_types & DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
+ ret = gso_tcp4_segment(pkt, gso_size, ipid_delta,
+ direct_pool, indirect_pool,
+ pkts_out, nb_pkts_out);
+ } else
+ RTE_LOG(WARNING, GSO, "Unsupported packet type\n");
Shouldn't we do pkts_out[0] = pkt; here?
Yes, we need to add it here. Thanks for reminder.
Post by Ananyev, Konstantin
+
+ if (ret > 1) {
+ pkt_seg = pkt;
+ while (pkt_seg) {
+ rte_mbuf_refcnt_update(pkt_seg, -1);
+ pkt_seg = pkt_seg->next;
+ }
+ }
- return 1;
+ return ret;
}
--
2.7.4
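For context, a hedged usage sketch of the API added above. Field and flag names (direct_pool, indirect_pool, gso_types, gso_size, ipid_flag, RTE_GSO_IPID_INCREASE) are taken from this patch revision, in which gso_types is matched against pkt->packet_type; the reviewers are still discussing a DEV_TX_OFFLOAD_*-based scheme, so the final API may differ. MAX_GSO_SEGS, port_id and queue_id are placeholder names, and the mempools are assumed to be created elsewhere:

#include <rte_gso.h>
#include <rte_ethdev.h>

#define MAX_GSO_SEGS 64 /* placeholder: app-chosen bound on output segments */

struct rte_gso_ctx ctx = {
	.direct_pool = direct_pool,     /* assumed pre-created mempools */
	.indirect_pool = indirect_pool,
	.gso_types = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP,
	.gso_size = 1514,               /* max output frame size, CRC excluded */
	.ipid_flag = RTE_GSO_IPID_INCREASE,
};
struct rte_mbuf *gso_segs[MAX_GSO_SEGS];
int nb_segs;

nb_segs = rte_gso_segment(pkt, ctx, gso_segs, MAX_GSO_SEGS);
if (nb_segs >= 1)
	/* the PMD must support multi-segment mbufs on TX */
	rte_eth_tx_burst(port_id, queue_id, gso_segs, nb_segs);
/* on a negative return, pkt is left untouched and still owned by the app */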
Ananyev, Konstantin
2017-09-13 09:38:23 UTC
Permalink
Post by Jiayu Hu
Post by Ananyev, Konstantin
Post by Jiayu Hu
+
+int
+gso_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ uint8_t ipid_delta,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ uint16_t tcp_dl;
+ uint16_t pyld_unit_size;
+ uint16_t hdr_offset;
+ int ret = 1;
+
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ pkt->l2_len);
+ /* Don't process the fragmented packet */
+ if (unlikely((ipv4_hdr->fragment_offset & rte_cpu_to_be_16(
+ IPV4_HDR_DF_MASK)) == 0)) {
It is not a check for fragmented packet - it is a check that fragmentation is allowed for that packet.
Should be IPV4_HDR_DF_MASK - 1, I think.
DF bit doesn't indicate is packet fragmented or not.
It forbids to fragment packet any further.
To check is packet already fragmented or not, you have to check MF bit and frag_offset.
Both have to be zero for un-fragmented packets.
Post by Jiayu Hu
IMO, IPV4_HDR_DF_MASK whose value is (1 << 14) is used to get DF bit. It's a
little-endian value. But ipv4_hdr->fragment_offset is big-endian order.
So the value of DF bit should be "ipv4_hdr->fragment_offset & rte_cpu_to_be_16(
IPV4_HDR_DF_MASK)". If this value is 0, the input packet is fragmented.
Post by Ananyev, Konstantin
Post by Jiayu Hu
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ tcp_dl = rte_be_to_cpu_16(ipv4_hdr->total_length) - pkt->l3_len -
+ pkt->l4_len;
Why not use pkt->pkt_len - pkt->l2_len -pkt_l3_len - pkt_l4_len?
Yes, we can use pkt->pkt_len - pkt->l2_len -pkt_l3_len - pkt_l4_len here.
Post by Ananyev, Konstantin
Post by Jiayu Hu
+ /* Don't process the packet without data */
+ if (unlikely(tcp_dl == 0)) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
Hmm, why do we need to count CRC_LEN here?
Yes, we shouldn't count ETHER_CRC_LEN here. Its length should be
included in gso_size.
Why?
What is the point to account crc len into this computation?
Why not just assume that gso_size is already a max_frame_size - crc_len
As I remember, when we RX packet crc bytes will be already stripped,
when user populates the packet, he doesn't care about crc bytes too.

Konstantin
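For reference, a minimal sketch of the fragmentation check described above, assuming the IPV4_HDR_MF_FLAG and IPV4_HDR_OFFSET_MASK definitions from librte_net/rte_ip.h; this restates the reviewer's point and is not the patch's code:

#include <rte_byteorder.h>
#include <rte_ip.h>

/* A packet is an IP fragment iff the MF bit is set or the fragment
 * offset is non-zero; the DF bit says nothing about whether the packet
 * is already a fragment. */
static inline int
is_ipv4_fragment(const struct ipv4_hdr *hdr)
{
	uint16_t frag_off = rte_be_to_cpu_16(hdr->fragment_offset);

	return (frag_off & (IPV4_HDR_MF_FLAG | IPV4_HDR_OFFSET_MASK)) != 0;
}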
Hu, Jiayu
2017-09-13 10:23:21 UTC
Permalink
Hi Konstantin,
-----Original Message-----
From: Ananyev, Konstantin
Sent: Wednesday, September 13, 2017 5:38 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Post by Jiayu Hu
Post by Ananyev, Konstantin
Post by Jiayu Hu
+
+int
+gso_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ uint8_t ipid_delta,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ uint16_t tcp_dl;
+ uint16_t pyld_unit_size;
+ uint16_t hdr_offset;
+ int ret = 1;
+
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ pkt->l2_len);
+ /* Don't process the fragmented packet */
+ if (unlikely((ipv4_hdr->fragment_offset & rte_cpu_to_be_16(
+ IPV4_HDR_DF_MASK)) == 0)) {
Post by Jiayu Hu
Post by Ananyev, Konstantin
It is not a check for fragmented packet - it is a check that fragmentation
is allowed for that packet.
Post by Jiayu Hu
Post by Ananyev, Konstantin
Should be IPV4_HDR_DF_MASK - 1, I think.
DF bit doesn't indicate is packet fragmented or not.
It forbids to fragment packet any further.
To check is packet already fragmented or not, you have to check MF bit and frag_offset.
Both have to be zero for un-fragmented packets.
Yes, you are right. I checked the RFC and I misunderstood the meaning of DF bit.
When DF bit is set to 1, the packet isn't IP fragmented. When DF bit is 0, the packet
may or may not be fragmented. So it can't indicate if the packet is an IP fragment.
Only both MF and offset are 0, the packet is not fragmented.
Post by Jiayu Hu
IMO, IPV4_HDR_DF_MASK whose value is (1 << 14) is used to get DF bit. It's a
Post by Jiayu Hu
little-endian value. But ipv4_hdr->fragment_offset is big-endian order.
So the value of DF bit should be "ipv4_hdr->fragment_offset & rte_cpu_to_be_16(
Post by Jiayu Hu
IPV4_HDR_DF_MASK)". If this value is 0, the input packet is fragmented.
Post by Ananyev, Konstantin
Post by Jiayu Hu
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ tcp_dl = rte_be_to_cpu_16(ipv4_hdr->total_length) - pkt->l3_len -
+ pkt->l4_len;
Why not use pkt->pkt_len - pkt->l2_len -pkt_l3_len - pkt_l4_len?
Yes, we can use pkt->pkt_len - pkt->l2_len -pkt_l3_len - pkt_l4_len here.
Post by Ananyev, Konstantin
Post by Jiayu Hu
+ /* Don't process the packet without data */
+ if (unlikely(tcp_dl == 0)) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
Hmm, why do we need to count CRC_LEN here?
Yes, we shouldn't count ETHER_CRC_LEN here. Its length should be
included in gso_size.
Why?
What is the point to account crc len into this computation?
Why not just assume that gso_size is already a max_frame_size - crc_len
As I remember, when we RX packet crc bytes will be already stripped,
when user populates the packet, he doesn't care about crc bytes too.
Sorry, maybe I didn't make it clear. I don't mean that applications must count
CRC when set gso_segsz. It's related specific scenarios to decide if count CRC
in gso_segsz or not, IMO. The GSO library shouldn't be aware of CRC, and just
uses gso_segsz to split packets.

Thanks,
Jiayu
Konstantin
Kavanagh, Mark B
2017-09-13 14:52:11 UTC
Permalink
From: Ananyev, Konstantin
Sent: Wednesday, September 13, 2017 10:38 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
[...]
+ hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
Hmm, why do we need to count CRC_LEN here?
Yes, we shouldn't count ETHER_CRC_LEN here. Its length should be
included in gso_size.
Why?
What is the point to account crc len into this computation?
Why not just assume that gso_size is already a max_frame_size - crc_len
As I remember, when we RX packet crc bytes will be already stripped,
when user populates the packet, he doesn't care about crc bytes too.
Hi Konstantin,

When packet is tx'd, the 4B for CRC are added back into the packet; if the payload is already at max capacity, then the actual segment size will be 4B larger than expected (e.g. 1522B, as opposed to 1518B).
To prevent that from happening, we account for the CRC len in this calculation.

If I've missed anything, please do let me know!

Thanks,
Mark
Konstantin
Ananyev, Konstantin
2017-09-13 15:13:18 UTC
Permalink
Hi Mark,
-----Original Message-----
From: Kavanagh, Mark B
Sent: Wednesday, September 13, 2017 3:52 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
From: Ananyev, Konstantin
Sent: Wednesday, September 13, 2017 10:38 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
[...]
+ hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
Hmm, why do we need to count CRC_LEN here?
Yes, we shouldn't count ETHER_CRC_LEN here. Its length should be
included in gso_size.
Why?
What is the point to account crc len into this computation?
Why not just assume that gso_size is already a max_frame_size - crc_len
As I remember, when we RX packet crc bytes will be already stripped,
when user populates the packet, he doesn't care about crc bytes too.
Hi Konstantin,
When packet is tx'd, the 4B for CRC are added back into the packet; if the payload is already at max capacity, then the actual segment size
will be 4B larger than expected (e.g. 1522B, as opposed to 1518B).
To prevent that from happening, we account for the CRC len in this calculation.
Ok, and what prevents you to set gso_ctx.gso_size = 1514; /*ether frame size without crc bytes */
?
Konstantin
If I've missed anything, please do let me know!
Thanks,
Mark
Konstantin
Hu, Jiayu
2017-09-14 00:59:58 UTC
Permalink
Hi Konstantin,
-----Original Message-----
From: Ananyev, Konstantin
Sent: Wednesday, September 13, 2017 11:13 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Mark,
-----Original Message-----
From: Kavanagh, Mark B
Sent: Wednesday, September 13, 2017 3:52 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
From: Ananyev, Konstantin
Sent: Wednesday, September 13, 2017 10:38 AM
Tan, Jianfeng
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
[...]
+ hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
Hmm, why do we need to count CRC_LEN here?
Yes, we shouldn't count ETHER_CRC_LEN here. Its length should be
included in gso_size.
Why?
What is the point to account crc len into this computation?
Why not just assume that gso_size is already a max_frame_size - crc_len
As I remember, when we RX packet crc bytes will be already stripped,
when user populates the packet, he doesn't care about crc bytes too.
Hi Konstantin,
When packet is tx'd, the 4B for CRC are added back into the packet; if the
payload is already at max capacity, then the actual segment size
will be 4B larger than expected (e.g. 1522B, as opposed to 1518B).
To prevent that from happening, we account for the CRC len in this
calculation.
Ok, and what prevents you to set gso_ctx.gso_size = 1514; /*ether frame size without crc bytes */ ?
Exactly, applications can set 1514 to gso_segsz instead of 1518, if the lower layer
will add CRC to the packet.

Jiayu
Konstantin
If I've missed anything, please do let me know!
Thanks,
Mark
Konstantin
Kavanagh, Mark B
2017-09-14 08:35:29 UTC
Permalink
From: Hu, Jiayu
Sent: Thursday, September 14, 2017 2:00 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Konstantin,
-----Original Message-----
From: Ananyev, Konstantin
Sent: Wednesday, September 13, 2017 11:13 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Mark,
-----Original Message-----
From: Kavanagh, Mark B
Sent: Wednesday, September 13, 2017 3:52 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
From: Ananyev, Konstantin
Sent: Wednesday, September 13, 2017 10:38 AM
Tan, Jianfeng
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
[...]
+ hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
Hmm, why do we need to count CRC_LEN here?
Yes, we shouldn't count ETHER_CRC_LEN here. Its length should be
included in gso_size.
Why?
What is the point to account crc len into this computation?
Why not just assume that gso_size is already a max_frame_size - crc_len
As I remember, when we RX packet crc bytes will be already stripped,
when user populates the packet, he doesn't care about crc bytes too.
Hi Konstantin,
When packet is tx'd, the 4B for CRC are added back into the packet; if the
payload is already at max capacity, then the actual segment size
will be 4B larger than expected (e.g. 1522B, as opposed to 1518B).
To prevent that from happening, we account for the CRC len in this
calculation.
Ok, and what prevents you to set gso_ctx.gso_size = 1514; /*ether frame size without crc bytes */ ?
Hey Konstantin,

If the user sets the gso_size to 1514, the resultant output segments' size should be 1514, and not 1518. Consequently, the payload capacity of each segment would be reduced accordingly.
The user only cares about the output segment size (i.e. gso_ctx.gso_size); we need to ensure that the size of the segments that are produced is consistent with that. As a result, we need to ensure that any packet overhead is accounted for in the segment size, before we can determine how much space remains for data.

Hope this makes sense.

Thanks,
Mark
Exactly, applications can set 1514 to gso_segsz instead of 1518, if the lower layer
will add CRC to the packet.
Jiayu
Konstantin
If I've missed anything, please do let me know!
Thanks,
Mark
Konstantin
Ananyev, Konstantin
2017-09-14 08:39:42 UTC
Permalink
-----Original Message-----
From: Kavanagh, Mark B
Sent: Thursday, September 14, 2017 9:35 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
From: Hu, Jiayu
Sent: Thursday, September 14, 2017 2:00 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Konstantin,
-----Original Message-----
From: Ananyev, Konstantin
Sent: Wednesday, September 13, 2017 11:13 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Mark,
-----Original Message-----
From: Kavanagh, Mark B
Sent: Wednesday, September 13, 2017 3:52 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
From: Ananyev, Konstantin
Sent: Wednesday, September 13, 2017 10:38 AM
Tan, Jianfeng
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
[...]
+ hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
Hmm, why do we need to count CRC_LEN here?
Yes, we shouldn't count ETHER_CRC_LEN here. Its length should be
included in gso_size.
Why?
What is the point to account crc len into this computation?
Why not just assume that gso_size is already a max_frame_size - crc_len
As I remember, when we RX packet crc bytes will be already stripped,
when user populates the packet, he doesn't care about crc bytes too.
Hi Konstantin,
When packet is tx'd, the 4B for CRC are added back into the packet; if the
payload is already at max capacity, then the actual segment size
will be 4B larger than expected (e.g. 1522B, as opposed to 1518B).
To prevent that from happening, we account for the CRC len in this
calculation.
Ok, and what prevents you to set gso_ctx.gso_size = 1514; /*ether frame size without crc bytes */ ?
Hey Konstantin,
If the user sets the gso_size to 1514, the resultant output segments' size should be 1514, and not 1518.
Yes and then NIC HW will add CRC bytes for you.
You are not filling CRC bytes in HW, and when providing to the HW size to send - it is a payload size
(CRC bytes are not accounted).
Konstantin

Consequently, the payload capacity
of each segment would be reduced accordingly.
The user only cares about the output segment size (i.e. gso_ctx.gso_size); we need to ensure that the size of the segments that are
produced is consistent with that. As a result, we need to ensure that any packet overhead is accounted for in the segment size, before we
can determine how much space remains for data.
Hope this makes sense.
Thanks,
Mark
Exactly, applications can set 1514 to gso_segsz instead of 1518, if the lower layer
will add CRC to the packet.
Jiayu
Konstantin
If I've missed anything, please do let me know!
Thanks,
Mark
Konstantin
Kavanagh, Mark B
2017-09-14 09:00:36 UTC
Permalink
From: Ananyev, Konstantin
Sent: Thursday, September 14, 2017 9:40 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
-----Original Message-----
From: Kavanagh, Mark B
Sent: Thursday, September 14, 2017 9:35 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
From: Hu, Jiayu
Sent: Thursday, September 14, 2017 2:00 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Konstantin,
-----Original Message-----
From: Ananyev, Konstantin
Sent: Wednesday, September 13, 2017 11:13 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Mark,
-----Original Message-----
From: Kavanagh, Mark B
Sent: Wednesday, September 13, 2017 3:52 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
From: Ananyev, Konstantin
Sent: Wednesday, September 13, 2017 10:38 AM
Tan, Jianfeng
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
[...]
+ hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
Hmm, why do we need to count CRC_LEN here?
Yes, we shouldn't count ETHER_CRC_LEN here. Its length should be
included in gso_size.
Why?
What is the point to account crc len into this computation?
Why not just assume that gso_size is already a max_frame_size - crc_len
As I remember, when we RX packet crc bytes will be already stripped,
when user populates the packet, he doesn't care about crc bytes too.
Hi Konstantin,
When packet is tx'd, the 4B for CRC are added back into the packet; if the
payload is already at max capacity, then the actual segment size
will be 4B larger than expected (e.g. 1522B, as opposed to 1518B).
To prevent that from happening, we account for the CRC len in this
calculation.
Ok, and what prevents you to set gso_ctx.gso_size = 1514; /*ether frame size without crc bytes */ ?
Hey Konstantin,
If the user sets the gso_size to 1514, the resultant output segments' size
should be 1514, and not 1518.
Just to clarify - I meant here that the final output segment, including CRC len, should be 1514. I think this is where we're crossing wires ;)
Yes and then NIC HW will add CRC bytes for you.
You are not filling CRC bytes in HW, and when providing to the HW size to send
- it is a payload size
(CRC bytes are not accounted).
Konstantin
Yes, exactly - in that case though, the gso_size specified by the user is not the actual final output segment size, but (segment size - 4B), right?

We can set that expectation in documentation, but from an application's/user's perspective, do you think that this might be confusing/misleading?

Thanks again,
Mark
Consequently, the payload capacity
of each segment would be reduced accordingly.
The user only cares about the output segment size (i.e. gso_ctx.gso_size);
we need to ensure that the size of the segments that are
produced is consistent with that. As a result, we need to ensure that any
packet overhead is accounted for in the segment size, before we
can determine how much space remains for data.
Hope this makes sense.
Thanks,
Mark
Exactly, applications can set 1514 to gso_segsz instead of 1518, if the lower layer
will add CRC to the packet.
Jiayu
Konstantin
If I've missed anything, please do let me know!
Thanks,
Mark
Konstantin
Ananyev, Konstantin
2017-09-14 09:10:56 UTC
Permalink
-----Original Message-----
From: Kavanagh, Mark B
Sent: Thursday, September 14, 2017 10:01 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
From: Ananyev, Konstantin
Sent: Thursday, September 14, 2017 9:40 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
-----Original Message-----
From: Kavanagh, Mark B
Sent: Thursday, September 14, 2017 9:35 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
From: Hu, Jiayu
Sent: Thursday, September 14, 2017 2:00 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Konstantin,
-----Original Message-----
From: Ananyev, Konstantin
Sent: Wednesday, September 13, 2017 11:13 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Mark,
-----Original Message-----
From: Kavanagh, Mark B
Sent: Wednesday, September 13, 2017 3:52 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
From: Ananyev, Konstantin
Sent: Wednesday, September 13, 2017 10:38 AM
Tan, Jianfeng
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
[...]
+ hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
Hmm, why do we need to count CRC_LEN here?
Yes, we shouldn't count ETHER_CRC_LEN here. Its length should be
included in gso_size.
Why?
What is the point to account crc len into this computation?
Why not just assume that gso_size is already a max_frame_size - crc_len
As I remember, when we RX packet crc bytes will be already stripped,
when user populates the packet, he doesn't care about crc bytes too.
Hi Konstantin,
When packet is tx'd, the 4B for CRC are added back into the packet; if the
payload is already at max capacity, then the actual segment size
will be 4B larger than expected (e.g. 1522B, as opposed to 1518B).
To prevent that from happening, we account for the CRC len in this
calculation.
Ok, and what prevents you to set gso_ctx.gso_size = 1514; /*ether frame size without crc bytes */ ?
Hey Konstantin,
If the user sets the gso_size to 1514, the resultant output segments' size
should be 1514, and not 1518.
Just to clarify - I meant here that the final output segment, including CRC len, should be 1514. I think this is where we're crossing wires ;)
Yes and then NIC HW will add CRC bytes for you.
You are not filling CRC bytes in HW, and when providing to the HW size to send
- it is a payload size
(CRC bytes are not accounted).
Konstantin
Yes, exactly - in that case though, the gso_size specified by the user is not the actual final output segment size, but (segment size - 4B),
right?
CRC bytes will be added by HW, it is totally transparent for the user.
We can set that expectation in documentation, but from an application's/user's perspective, do you think that this might be
confusing/misleading?
I think it would be much more confusing to make user account for CRC bytes.
Let say when in DPDK you form a packet and send it out via rte_eth_tx_burst()
you specify only your payload size, not payload size plus crc bytes that HW will add for you.
Konstantin
Thanks again,
Mark
Consequently, the payload capacity
of each segment would be reduced accordingly.
The user only cares about the output segment size (i.e. gso_ctx.gso_size);
we need to ensure that the size of the segments that are
produced is consistent with that. As a result, we need to ensure that any
packet overhead is accounted for in the segment size, before we
can determine how much space remains for data.
Hope this makes sense.
Thanks,
Mark
Exactly, applications can set 1514 to gso_segsz instead of 1518, if the lower layer
will add CRC to the packet.
Jiayu
Konstantin
If I've missed anything, please do let me know!
Thanks,
Mark
Konstantin
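To make the convention settled above concrete, a small worked example, assuming an untagged Ethernet/IPv4/TCP packet with option-free headers:

/* gso_size excludes the 4B CRC, exactly like the frame length handed to
 * rte_eth_tx_burst(): the NIC appends the CRC transparently on the wire. */
uint16_t gso_size = 1514;                        /* 1518B wire frame - 4B CRC */
uint16_t hdr_offset = 14 + 20 + 20;              /* l2_len + l3_len + l4_len */
uint16_t pyld_unit_size = gso_size - hdr_offset; /* 1460B, the classic MSS */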
Kavanagh, Mark B
2017-09-14 09:35:43 UTC
Permalink
From: Ananyev, Konstantin
Sent: Thursday, September 14, 2017 10:11 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
-----Original Message-----
From: Kavanagh, Mark B
Sent: Thursday, September 14, 2017 10:01 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
From: Ananyev, Konstantin
Sent: Thursday, September 14, 2017 9:40 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
-----Original Message-----
From: Kavanagh, Mark B
Sent: Thursday, September 14, 2017 9:35 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
From: Hu, Jiayu
Sent: Thursday, September 14, 2017 2:00 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Konstantin,
-----Original Message-----
From: Ananyev, Konstantin
Sent: Wednesday, September 13, 2017 11:13 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Mark,
-----Original Message-----
From: Kavanagh, Mark B
Sent: Wednesday, September 13, 2017 3:52 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
From: Ananyev, Konstantin
Sent: Wednesday, September 13, 2017 10:38 AM
Tan, Jianfeng
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
[...]
+ hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
Hmm, why do we need to count CRC_LEN here?
Yes, we shouldn't count ETHER_CRC_LEN here. Its length should be
included in gso_size.
Why?
What is the point to account crc len into this computation?
Why not just assume that gso_size is already a max_frame_size - crc_len
As I remember, when we RX packet crc bytes will be already stripped,
when user populates the packet, he doesn't care about crc bytes too.
Hi Konstantin,
When packet is tx'd, the 4B for CRC are added back into the packet; if the
payload is already at max capacity, then the actual segment size
will be 4B larger than expected (e.g. 1522B, as opposed to 1518B).
To prevent that from happening, we account for the CRC len in this
calculation.
Ok, and what prevents you to set gso_ctx.gso_size = 1514; /*ether frame size without crc bytes */ ?
Hey Konstantin,
If the user sets the gso_size to 1514, the resultant output segments' size
should be 1514, and not 1518.
Just to clarify - I meant here that the final output segment, including CRC
len, should be 1514. I think this is where we're crossing wires ;)
Yes and then NIC HW will add CRC bytes for you.
You are not filling CRC bytes in HW, and when providing to the HW size to send - it is a payload size
(CRC bytes are not accounted).
Konstantin
Yes, exactly - in that case though, the gso_size specified by the user is
not the actual final output segment size, but (segment size - 4B),
right?
CRC bytes will be add by HW, it is totally transparent for user.
Yes - I completely agree/understand.
We can set that expectation in documentation, but from an
application's/user's perspective, do you think that this might be
confusing/misleading?
I think it would be much more confusing to make user account for CRC bytes.
Let say when in DPDK you form a packet and send it out via rte_eth_tx_burst()
you specify only your payload size, not payload size plus crc bytes that HW will add for you.
Konstantin
I guess I've just been looking at it from a different perspective (i.e. the user wants to decide the final total packet size); using the example of rte_eth_tx_burst above, I see where you're coming from though.
Thanks for clarifying,
Mark
Thanks again,
Mark
Consequently, the payload capacity
of each segment would be reduced accordingly.
The user only cares about the output segment size (i.e.
gso_ctx.gso_size);
we need to ensure that the size of the segments that are
produced is consistent with that. As a result, we need to ensure that any
packet overhead is accounted for in the segment size, before we
can determine how much space remains for data.
Hope this makes sense.
Thanks,
Mark
Exactly, applications can set 1514 to gso_segsz instead of 1518, if the lower layer
will add CRC to the packet.
Jiayu
Konstantin
If I've missed anything, please do let me know!
Thanks,
Mark
Konstantin
Ananyev, Konstantin
2017-09-12 14:17:27 UTC
Permalink
-----Original Message-----
From: Ananyev, Konstantin
Sent: Tuesday, September 12, 2017 12:18 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Jiayu,
-----Original Message-----
From: Hu, Jiayu
Sent: Tuesday, September 12, 2017 3:43 AM
Subject: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
This patch adds GSO support for TCP/IPv4 packets. Supported packets
may include a single VLAN tag. TCP/IPv4 GSO assumes that all input
packets have correct checksums, and doesn't update checksums for output
packets (the responsibility for this lies with the application).
Probably it shouldn't say that checksums have to be valid, right?
As you don't update checksum(s) inside the lib - it probably doesn't matter.
Additionally, TCP/IPv4 GSO doesn't process IP fragmented packets.
TCP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one indirect
MBUF, to organize an output packet. Note that we refer to these two
chained MBUFs as a two-segment MBUF. The direct MBUF stores the packet
header, while the indirect mbuf simply points to a location within the
original packet's payload. Consequently, use of the GSO library requires
multi-segment MBUF support in the TX functions of the NIC driver.
If a packet is GSOed, TCP/IPv4 GSO reduces its MBUF refcnt by 1. As a
result, when all of its GSOed segments are freed, the packet is freed
automatically.
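The two-segment layout just described can be sketched as follows. This is illustrative only: the pool names are placeholders, and the pkt_len/data_len/data_off bookkeeping that gso_do_segment() below performs is omitted:

/* Direct mbuf: holds a private copy of the packet headers. */
struct rte_mbuf *hdr_seg = rte_pktmbuf_alloc(direct_pool);
/* Indirect mbuf: owns no data of its own, points into the original packet. */
struct rte_mbuf *pyld_seg = rte_pktmbuf_alloc(indirect_pool);

rte_pktmbuf_attach(pyld_seg, pkt); /* share pkt's data, bump its refcnt */
hdr_seg->next = pyld_seg;          /* chain: [header copy] -> [payload slice] */
hdr_seg->nb_segs = 2;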
---
lib/librte_eal/common/include/rte_log.h | 1 +
lib/librte_gso/Makefile | 2 +
lib/librte_gso/gso_common.c | 202 ++++++++++++++++++++++++++++++++
lib/librte_gso/gso_common.h | 113 ++++++++++++++++++
lib/librte_gso/gso_tcp4.c | 83 +++++++++++++
lib/librte_gso/gso_tcp4.h | 76 ++++++++++++
lib/librte_gso/rte_gso.c | 41 ++++++-
7 files changed, 515 insertions(+), 3 deletions(-)
create mode 100644 lib/librte_gso/gso_common.c
create mode 100644 lib/librte_gso/gso_common.h
create mode 100644 lib/librte_gso/gso_tcp4.c
create mode 100644 lib/librte_gso/gso_tcp4.h
diff --git a/lib/librte_eal/common/include/rte_log.h b/lib/librte_eal/common/include/rte_log.h
index ec8dba7..2fa1199 100644
--- a/lib/librte_eal/common/include/rte_log.h
+++ b/lib/librte_eal/common/include/rte_log.h
@@ -87,6 +87,7 @@ extern struct rte_logs rte_logs;
#define RTE_LOGTYPE_CRYPTODEV 17 /**< Log related to cryptodev. */
#define RTE_LOGTYPE_EFD 18 /**< Log related to EFD. */
#define RTE_LOGTYPE_EVENTDEV 19 /**< Log related to eventdev. */
+#define RTE_LOGTYPE_GSO 20 /**< Log related to GSO. */
/* these log types can be used in an application */
#define RTE_LOGTYPE_USER1 24 /**< User-defined log type 1. */
diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile
index aeaacbc..2be64d1 100644
--- a/lib/librte_gso/Makefile
+++ b/lib/librte_gso/Makefile
@@ -42,6 +42,8 @@ LIBABIVER := 1
#source files
SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_common.c
+SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tcp4.c
# install this header file
SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h
diff --git a/lib/librte_gso/gso_common.c b/lib/librte_gso/gso_common.c
new file mode 100644
index 0000000..7c32e03
--- /dev/null
+++ b/lib/librte_gso/gso_common.c
@@ -0,0 +1,202 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdbool.h>
+#include <errno.h>
+
+#include <rte_memcpy.h>
+#include <rte_mempool.h>
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+
+#include "gso_common.h"
+
+static inline void
+hdr_segment_init(struct rte_mbuf *hdr_segment, struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset)
+{
+ /* Copy MBUF metadata */
+ hdr_segment->nb_segs = 1;
+ hdr_segment->port = pkt->port;
+ hdr_segment->ol_flags = pkt->ol_flags;
+ hdr_segment->packet_type = pkt->packet_type;
+ hdr_segment->pkt_len = pkt_hdr_offset;
+ hdr_segment->data_len = pkt_hdr_offset;
+ hdr_segment->tx_offload = pkt->tx_offload;
+
+ /* Copy the packet header */
+ rte_memcpy(rte_pktmbuf_mtod(hdr_segment, char *),
+ rte_pktmbuf_mtod(pkt, char *),
+ pkt_hdr_offset);
+}
+
+static inline void
+free_gso_segment(struct rte_mbuf **pkts, uint16_t nb_pkts)
+{
+ uint16_t i;
+
+ for (i = 0; i < nb_pkts; i++)
+ rte_pktmbuf_free(pkts[i]);
+}
+
+int
+gso_do_segment(struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset,
+ uint16_t pyld_unit_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ struct rte_mbuf *pkt_in;
+ struct rte_mbuf *hdr_segment, *pyld_segment, *prev_segment;
+ uint16_t pkt_in_data_pos, segment_bytes_remaining;
+ uint16_t pyld_len, nb_segs;
+ bool more_in_pkt, more_out_segs;
+
+ pkt_in = pkt;
+ nb_segs = 0;
+ more_in_pkt = 1;
+ pkt_in_data_pos = pkt_hdr_offset;
+
+ while (more_in_pkt) {
+ if (unlikely(nb_segs >= nb_pkts_out)) {
+ free_gso_segment(pkts_out, nb_segs);
+ return -EINVAL;
+ }
+
+ /* Allocate a direct MBUF */
+ hdr_segment = rte_pktmbuf_alloc(direct_pool);
+ if (unlikely(hdr_segment == NULL)) {
+ free_gso_segment(pkts_out, nb_segs);
+ return -ENOMEM;
+ }
+ /* Fill the packet header */
+ hdr_segment_init(hdr_segment, pkt, pkt_hdr_offset);
+
+ prev_segment = hdr_segment;
+ segment_bytes_remaining = pyld_unit_size;
+ more_out_segs = 1;
+
+ while (more_out_segs && more_in_pkt) {
+ /* Allocate an indirect MBUF */
+ pyld_segment = rte_pktmbuf_alloc(indirect_pool);
+ if (unlikely(pyld_segment == NULL)) {
+ rte_pktmbuf_free(hdr_segment);
+ free_gso_segment(pkts_out, nb_segs);
+ return -ENOMEM;
+ }
+ /* Attach to current MBUF segment of pkt */
+ rte_pktmbuf_attach(pyld_segment, pkt_in);
+
+ prev_segment->next = pyld_segment;
+ prev_segment = pyld_segment;
+
+ pyld_len = segment_bytes_remaining;
+ if (pyld_len + pkt_in_data_pos > pkt_in->data_len)
+ pyld_len = pkt_in->data_len - pkt_in_data_pos;
+
+ pyld_segment->data_off = pkt_in_data_pos +
+ pkt_in->data_off;
+ pyld_segment->data_len = pyld_len;
+
+ /* Update header segment */
+ hdr_segment->pkt_len += pyld_len;
+ hdr_segment->nb_segs++;
+
+ pkt_in_data_pos += pyld_len;
+ segment_bytes_remaining -= pyld_len;
+
+ /* Finish processing a MBUF segment of pkt */
+ if (pkt_in_data_pos == pkt_in->data_len) {
+ pkt_in = pkt_in->next;
+ pkt_in_data_pos = 0;
+ if (pkt_in == NULL)
+ more_in_pkt = 0;
+ }
+
+ /* Finish generating a GSO segment */
+ if (segment_bytes_remaining == 0)
+ more_out_segs = 0;
+ }
+ pkts_out[nb_segs++] = hdr_segment;
+ }
+ return nb_segs;
+}
+
+static inline void
+update_inner_tcp4_header(struct rte_mbuf *pkt, uint8_t ipid_delta,
+ struct rte_mbuf **segs, uint16_t nb_segs)
+{
+ struct tcp_hdr *tcp_hdr;
+ struct ipv4_hdr *ipv4_hdr;
+ struct rte_mbuf *seg;
+ uint32_t sent_seq;
+ uint16_t inner_l2_offset;
+ uint16_t id, i;
+
+ inner_l2_offset = pkt->outer_l2_len + pkt->outer_l3_len + pkt->l2_len;
Shouldn't it be: pkt->l2_len here?
Or probably even better to pass l2_len as an input parameter.
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ inner_l2_offset);
+ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + pkt->l3_len);
+ id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
+ sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
+
+ for (i = 0; i < nb_segs; i++) {
+ seg = segs[i];
+ /* Update the inner IPv4 header */
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(seg, char *) +
+ inner_l2_offset);
+ ipv4_hdr->total_length = rte_cpu_to_be_16(seg->pkt_len -
+ inner_l2_offset);
+ ipv4_hdr->packet_id = rte_cpu_to_be_16(id);
+ id += ipid_delta;
+
+ /* Update the inner TCP header */
+ tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + seg->l3_len);
+ tcp_hdr->sent_seq = rte_cpu_to_be_32(sent_seq);
+ if (likely(i < nb_segs - 1))
+ tcp_hdr->tcp_flags &= (~(TCP_HDR_PSH_MASK |
+ TCP_HDR_FIN_MASK));
+ sent_seq += (seg->pkt_len - seg->data_len);
+ }
+}
+
+void
+gso_update_pkt_headers(struct rte_mbuf *pkt, uint8_t ipid_delta,
+ struct rte_mbuf **segs, uint16_t nb_segs)
+{
+ if (is_ipv4_tcp(pkt->packet_type))
+ update_inner_tcp4_header(pkt, ipid_delta, segs, nb_segs);
+}
diff --git a/lib/librte_gso/gso_common.h b/lib/librte_gso/gso_common.h
new file mode 100644
index 0000000..3c76520
--- /dev/null
+++ b/lib/librte_gso/gso_common.h
@@ -0,0 +1,113 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_COMMON_H_
+#define _GSO_COMMON_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+#define IPV4_HDR_DF_SHIFT 14
We have that already defined in librte_net/rte_ip.h
+#define IPV4_HDR_DF_MASK (1 << IPV4_HDR_DF_SHIFT)
+
+#define TCP_HDR_PSH_MASK ((uint8_t)0x08)
+#define TCP_HDR_FIN_MASK ((uint8_t)0x01)
+
+#define ETHER_TCP_PKT (RTE_PTYPE_L2_ETHER | RTE_PTYPE_L4_TCP)
+#define ETHER_VLAN_TCP_PKT (RTE_PTYPE_L2_ETHER_VLAN | RTE_PTYPE_L4_TCP)
+static inline uint8_t is_ipv4_tcp(uint32_t ptype)
+{
+	switch (ptype & (~RTE_PTYPE_L3_MASK)) {
return RTE_ETH_IS_IPV4_HDR(ptype) && (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP;
?
+	case ETHER_VLAN_TCP_PKT:
+	case ETHER_TCP_PKT:
+		return RTE_ETH_IS_IPV4_HDR(ptype);
+	default:
+		return 0;
+	}
+}
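For reference, the simplification suggested above written out as a complete helper; this is a sketch of the reviewer's proposal (the _alt name is ours), not code from the patch:

#include <rte_mbuf_ptype.h>

/* Accept any IPv4 ptype whose L4 part is TCP, instead of switching
 * on the Ethernet/VLAN L2 combinations. */
static inline uint8_t is_ipv4_tcp_alt(uint32_t ptype)
{
	return RTE_ETH_IS_IPV4_HDR(ptype) &&
		(ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP;
}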
+
+/**
+ * Internal function which updates relevant packet headers, following
+ * segmentation. This is required to update, for example, the IPv4
+ * 'total_length' field, to reflect the reduced length of the now-
+ * segmented packet.
+ *
+ * @param pkt
+ *  The original packet.
+ * @param ipid_delta
+ *  The increment unit of IP ids.
+ * @param segs
+ *  Pointer array used for storing mbuf addresses for GSO segments.
+ * @param nb_segs
+ *  The number of GSO segments placed in segs.
+ */
+void gso_update_pkt_headers(struct rte_mbuf *pkt, uint8_t ipid_delta,
+ struct rte_mbuf **segs, uint16_t nb_segs);
+
+/**
+ * Internal function which divides the input packet into small segments.
+ * Each of the newly-created segments is organized as a two-segment MBUF,
+ * where the first segment is a standard mbuf, which stores a copy of
+ * the packet header, and the second is an indirect mbuf which points to
+ * a section of data in the input packet.
+ *
+ * @param pkt
+ *  Packet to segment.
+ * @param pkt_hdr_offset
+ *  Packet header offset, measured in bytes.
+ * @param pyld_unit_size
+ *  The max payload length of a GSO segment.
+ * @param direct_pool
+ *  MBUF pool used for allocating direct buffers for output segments.
+ * @param indirect_pool
+ *  MBUF pool used for allocating indirect buffers for output segments.
+ * @param pkts_out
+ *  Pointer array used to keep the mbuf addresses of output segments. If
+ *  the memory space in pkts_out is insufficient, gso_do_segment() fails
+ *  and returns -EINVAL.
+ * @param nb_pkts_out
+ *  The max number of items that pkts_out can keep.
+ *
+ * @return
+ *  - The number of segments created in the event of success.
+ *  - Return -ENOMEM if it runs out of memory in MBUF pools.
+ *  - Return -EINVAL for invalid parameters.
+ */
+int gso_do_segment(struct rte_mbuf *pkt,
+ uint16_t pkt_hdr_offset,
+ uint16_t pyld_unit_size,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+#endif
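To make the two-segment layout in the gso_do_segment() comment concrete, a minimal sketch of how one output segment could be assembled; the helper name, slice arithmetic and error handling are illustrative, not the patch's internals:

#include <rte_mbuf.h>
#include <rte_memcpy.h>

static struct rte_mbuf *
make_one_segment(struct rte_mbuf *pkt, uint16_t hdr_len,
		uint16_t pyld_off, uint16_t pyld_len,
		struct rte_mempool *direct_pool,
		struct rte_mempool *indirect_pool)
{
	struct rte_mbuf *hdr = rte_pktmbuf_alloc(direct_pool);
	struct rte_mbuf *pyld = rte_pktmbuf_alloc(indirect_pool);

	if (hdr == NULL || pyld == NULL)
		return NULL; /* a real version frees and reports -ENOMEM */

	/* Copy the packet header into the direct mbuf (assumes hdr_len
	 * fits in the direct mbuf's tailroom). */
	rte_memcpy(rte_pktmbuf_append(hdr, hdr_len),
		rte_pktmbuf_mtod(pkt, void *), hdr_len);

	/* Attach the indirect mbuf to the input packet's buffer; this
	 * increments the input packet's reference count. */
	rte_pktmbuf_attach(pyld, pkt);
	rte_pktmbuf_adj(pyld, pyld_off);	/* skip to the payload slice */
	rte_pktmbuf_trim(pyld, pyld->data_len - pyld_len);

	/* Chain header and payload into one two-segment packet. */
	hdr->next = pyld;
	hdr->nb_segs = 2;
	hdr->pkt_len = hdr->data_len + pyld->data_len;
	return hdr;
}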
diff --git a/lib/librte_gso/gso_tcp4.c b/lib/librte_gso/gso_tcp4.c
new file mode 100644
index 0000000..8d4bfb2
--- /dev/null
+++ b/lib/librte_gso/gso_tcp4.c
@@ -0,0 +1,83 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <rte_ether.h>
+#include <rte_ip.h>
+
+#include "gso_common.h"
+#include "gso_tcp4.h"
+
+int
+gso_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+ uint8_t ipid_delta,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ uint16_t tcp_dl;
+ uint16_t pyld_unit_size;
+ uint16_t hdr_offset;
+ int ret = 1;
+
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ pkt->l2_len);
+ /* Don't process the fragmented packet */
+ if (unlikely((ipv4_hdr->fragment_offset & rte_cpu_to_be_16(
+ IPV4_HDR_DF_MASK)) == 0)) {
This is not a check for a fragmented packet - it is a check that fragmentation is allowed for that packet.
The mask should be IPV4_HDR_DF_MASK - 1, I think.
+ pkts_out[0] = pkt;
+ return ret;
+ }
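For comparison, a sketch of the check the comment above appears to suggest; the mask arithmetic follows from IPV4_HDR_DF_SHIFT == 14 and is our reading of the intended fix, not code from the patch:

/* (IPV4_HDR_DF_MASK - 1) == 0x3fff covers the MF bit plus the 13-bit
 * fragment offset, so a non-zero result means the packet is already
 * an IP fragment and is handed back unsegmented. */
if (unlikely((rte_be_to_cpu_16(ipv4_hdr->fragment_offset) &
		(IPV4_HDR_DF_MASK - 1)) != 0)) {
	pkts_out[0] = pkt;
	return 1;
}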
+
+ tcp_dl = rte_be_to_cpu_16(ipv4_hdr->total_length) - pkt->l3_len -
+ pkt->l4_len;
Why not use pkt->pkt_len - pkt->l2_len - pkt->l3_len - pkt->l4_len?
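i.e., presumably something along these lines:

tcp_dl = pkt->pkt_len - pkt->l2_len - pkt->l3_len - pkt->l4_len;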
+ /* Don't process the packet without data */
+ if (unlikely(tcp_dl == 0)) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ hdr_offset = pkt->l2_len + pkt->l3_len + pkt->l4_len;
+ pyld_unit_size = gso_size - hdr_offset - ETHER_CRC_LEN;
Hmm, why do we need to count CRC_LEN here?
+
+ /* Segment the payload */
+ ret = gso_do_segment(pkt, hdr_offset, pyld_unit_size, direct_pool,
+ indirect_pool, pkts_out, nb_pkts_out);
+ if (ret > 1)
+ gso_update_pkt_headers(pkt, ipid_delta, pkts_out, ret);
+
+ return ret;
+}
diff --git a/lib/librte_gso/gso_tcp4.h b/lib/librte_gso/gso_tcp4.h
new file mode 100644
index 0000000..9c07984
--- /dev/null
+++ b/lib/librte_gso/gso_tcp4.h
@@ -0,0 +1,76 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _GSO_TCP4_H_
+#define _GSO_TCP4_H_
+
+#include <stdint.h>
+#include <rte_mbuf.h>
+
+/**
+ * Segment an IPv4/TCP packet. This function assumes the input packet has
+ * correct checksums, and it doesn't update checksums for GSO segments.
+ * Furthermore, it doesn't process IP fragment packets.
+ *
+ * @param pkt
+ *  The packet mbuf to segment.
+ * @param gso_size
+ *  The max length of a GSO segment, measured in bytes.
+ * @param ipid_delta
+ *  The increment unit of IP ids.
+ * @param direct_pool
+ *  MBUF pool used for allocating direct buffers for output segments.
+ * @param indirect_pool
+ *  MBUF pool used for allocating indirect buffers for output segments.
+ * @param pkts_out
+ *  Pointer array used to store the MBUF addresses of output GSO
+ *  segments, when gso_tcp4_segment() succeeds. If the memory space in
+ *  pkts_out is insufficient, gso_tcp4_segment() fails and returns
+ *  -EINVAL.
+ * @param nb_pkts_out
+ *  The max number of items that 'pkts_out' can keep.
+ *
+ * @return
+ *  - The number of GSO segments filled in pkts_out on success.
+ *  - Return -ENOMEM if it runs out of memory in MBUF pools.
+ *  - Return -EINVAL for invalid parameters.
+ */
+int gso_tcp4_segment(struct rte_mbuf *pkt,
+ uint16_t gso_size,
+		uint8_t ipid_delta,
+ struct rte_mempool *direct_pool,
+ struct rte_mempool *indirect_pool,
+ struct rte_mbuf **pkts_out,
+ uint16_t nb_pkts_out);
+
+#endif
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index dda50ee..95f6ea6 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -33,18 +33,53 @@
#include <errno.h>
+#include <rte_log.h>
+
#include "rte_gso.h"
+#include "gso_common.h"
+#include "gso_tcp4.h"
int
rte_gso_segment(struct rte_mbuf *pkt,
- struct rte_gso_ctx gso_ctx __rte_unused,
+ struct rte_gso_ctx gso_ctx,
struct rte_mbuf **pkts_out,
uint16_t nb_pkts_out)
{
+ struct rte_mempool *direct_pool, *indirect_pool;
+ struct rte_mbuf *pkt_seg;
+ uint16_t gso_size;
+ uint8_t ipid_delta;
+ int ret = 1;
+
if (pkt == NULL || pkts_out == NULL || nb_pkts_out < 1)
return -EINVAL;
- pkts_out[0] = pkt;
+ if (gso_ctx.gso_size >= pkt->pkt_len ||
+ (pkt->packet_type & gso_ctx.gso_types) !=
+ pkt->packet_type) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ direct_pool = gso_ctx.direct_pool;
+ indirect_pool = gso_ctx.indirect_pool;
+ gso_size = gso_ctx.gso_size;
+ ipid_delta = gso_ctx.ipid_flag == RTE_GSO_IPID_INCREASE;
+
+ if (is_ipv4_tcp(pkt->packet_type)) {
If (is_ipv4_tcp(pkt->packet_type) && (gso_ctx->gso_types & DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
Sorry, actually it probably should be:
If (pkt->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_IPV4) == PKT_TX_IPV4 &&
(gso_ctx->gso_types & DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
Konstantin
+ ret = gso_tcp4_segment(pkt, gso_size, ipid_delta,
+ direct_pool, indirect_pool,
+ pkts_out, nb_pkts_out);
+ } else
+ RTE_LOG(WARNING, GSO, "Unsupported packet type\n");
Shouldn't we do pkt_out[0] = pkt; here?
+
+ if (ret > 1) {
+ pkt_seg = pkt;
+ while (pkt_seg) {
+ rte_mbuf_refcnt_update(pkt_seg, -1);
+ pkt_seg = pkt_seg->next;
+ }
+ }
- return 1;
+ return ret;
}
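To make the reference-count handling above concrete, a hedged caller-side sketch; MAX_GSO_SEGS, the port/queue handling and the pre-built ctx are illustrative, not part of the patch:

#include <rte_ethdev.h>
#include <rte_gso.h>

#define MAX_GSO_SEGS 64	/* illustrative bound */

static uint16_t
gso_and_tx(struct rte_mbuf *pkt, struct rte_gso_ctx ctx,
		uint8_t port, uint16_t queue)
{
	struct rte_mbuf *segs[MAX_GSO_SEGS];
	int nb = rte_gso_segment(pkt, ctx, segs, MAX_GSO_SEGS);

	if (nb < 0) {
		/* -EINVAL or -ENOMEM: drop the packet in this sketch. */
		rte_pktmbuf_free(pkt);
		return 0;
	}
	/* When nb > 1, rte_gso_segment() has already dropped the input
	 * packet's own references; the packet is freed automatically
	 * once every indirect segment pointing into it is freed. */
	return rte_eth_tx_burst(port, queue, segs, (uint16_t)nb);
}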
--
2.7.4
Jiayu Hu
2017-09-13 10:44:07 UTC
Hi Konstantin,
Post by Ananyev, Konstantin
-----Original Message-----
From: Ananyev, Konstantin
Sent: Tuesday, September 12, 2017 12:18 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Post by Jiayu Hu
result, when all of its GSOed segments are freed, the packet is freed
automatically.
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index dda50ee..95f6ea6 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -33,18 +33,53 @@
#include <errno.h>
+#include <rte_log.h>
+
#include "rte_gso.h"
+#include "gso_common.h"
+#include "gso_tcp4.h"
int
rte_gso_segment(struct rte_mbuf *pkt,
- struct rte_gso_ctx gso_ctx __rte_unused,
+ struct rte_gso_ctx gso_ctx,
struct rte_mbuf **pkts_out,
uint16_t nb_pkts_out)
{
+ struct rte_mempool *direct_pool, *indirect_pool;
+ struct rte_mbuf *pkt_seg;
+ uint16_t gso_size;
+ uint8_t ipid_delta;
+ int ret = 1;
+
if (pkt == NULL || pkts_out == NULL || nb_pkts_out < 1)
return -EINVAL;
- pkts_out[0] = pkt;
+ if (gso_ctx.gso_size >= pkt->pkt_len ||
+ (pkt->packet_type & gso_ctx.gso_types) !=
+ pkt->packet_type) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ direct_pool = gso_ctx.direct_pool;
+ indirect_pool = gso_ctx.indirect_pool;
+ gso_size = gso_ctx.gso_size;
+ ipid_delta = gso_ctx.ipid_flag == RTE_GSO_IPID_INCREASE;
+
+ if (is_ipv4_tcp(pkt->packet_type)) {
If (is_ipv4_tcp(pkt->packet_type) && (gso_ctx->gso_types & DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
If (pkt->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_IPV4) == PKT_TX_IPV4 &&
(gso_ctx->gso_types & DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
I don't quite understand why the GSO library should be aware of whether the TSO
flag is set or not. Applications can query the device's TSO capability before
they call the GSO library. Do I misunderstand anything?
Additionally, don't we need to check whether the packet is a TCP/IPv4 packet here?
Thanks,
Jiayu
Post by Ananyev, Konstantin
Konstantin
Post by Jiayu Hu
+ ret = gso_tcp4_segment(pkt, gso_size, ipid_delta,
+ direct_pool, indirect_pool,
+ pkts_out, nb_pkts_out);
+ } else
+ RTE_LOG(WARNING, GSO, "Unsupported packet type\n");
Shouldn't we do pkt_out[0] = pkt; here?
Post by Jiayu Hu
+
+ if (ret > 1) {
+ pkt_seg = pkt;
+ while (pkt_seg) {
+ rte_mbuf_refcnt_update(pkt_seg, -1);
+ pkt_seg = pkt_seg->next;
+ }
+ }
- return 1;
+ return ret;
}
--
2.7.4
Ananyev, Konstantin
2017-09-13 22:10:37 UTC
Hi Jiayu,
Post by Jiayu Hu
Post by Ananyev, Konstantin
-----Original Message-----
From: Ananyev, Konstantin
Sent: Tuesday, September 12, 2017 12:18 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Post by Jiayu Hu
result, when all of its GSOed segments are freed, the packet is freed
automatically.
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index dda50ee..95f6ea6 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -33,18 +33,53 @@
#include <errno.h>
+#include <rte_log.h>
+
#include "rte_gso.h"
+#include "gso_common.h"
+#include "gso_tcp4.h"
int
rte_gso_segment(struct rte_mbuf *pkt,
- struct rte_gso_ctx gso_ctx __rte_unused,
+ struct rte_gso_ctx gso_ctx,
struct rte_mbuf **pkts_out,
uint16_t nb_pkts_out)
{
+ struct rte_mempool *direct_pool, *indirect_pool;
+ struct rte_mbuf *pkt_seg;
+ uint16_t gso_size;
+ uint8_t ipid_delta;
+ int ret = 1;
+
if (pkt == NULL || pkts_out == NULL || nb_pkts_out < 1)
return -EINVAL;
- pkts_out[0] = pkt;
+ if (gso_ctx.gso_size >= pkt->pkt_len ||
+ (pkt->packet_type & gso_ctx.gso_types) !=
+ pkt->packet_type) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ direct_pool = gso_ctx.direct_pool;
+ indirect_pool = gso_ctx.indirect_pool;
+ gso_size = gso_ctx.gso_size;
+ ipid_delta = gso_ctx.ipid_flag == RTE_GSO_IPID_INCREASE;
+
+ if (is_ipv4_tcp(pkt->packet_type)) {
If (is_ipv4_tcp(pkt->packet_type) && (gso_ctx->gso_types & DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
If (pkt->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_IPV4) == PKT_TX_IPV4 &&
(gso_ctx->gso_types & DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
I don't quite understand why the GSO library should be aware if the TSO
flag is set or not. Applications can query device TSO capability before
they call the GSO library. Do I misundertsand anything?
Additionally, we don't need to check if the packet is a TCP/IPv4 packet here?
Well, right now the PMD doesn't rely on ptype to figure out the type of packet and
which TX offloads have to be performed.
Instead it looks at the TX part of ol_flags, and
my thought was that, as what we are doing is actually TSO in SW, it would be good
to use the same API here too.
Also, with that approach, by setting ol_flags properly the user can use the same gso_ctx and still
specify what segmentation to perform on a per-packet basis.
An alternative way is to rely on ptype to decide whether segmentation should be performed on that packet or not.
The only advantage I see here is that if someone would like to add GSO for some new protocol,
he wouldn't need to introduce a new TX flag value for mbuf.ol_flags.
Though he would still need to update the TX_OFFLOAD_* capabilities and probably the packet_type definitions.
So from my perspective the first variant (use the HW TSO API) is more plausible.
I wonder what your and Mark's opinions are here?
Konstantin
Jiayu Hu
2017-09-14 06:07:05 UTC
Hi Konstantin,
Post by Ananyev, Konstantin
Hi Jiayu,
Post by Jiayu Hu
Post by Ananyev, Konstantin
-----Original Message-----
From: Ananyev, Konstantin
Sent: Tuesday, September 12, 2017 12:18 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Post by Jiayu Hu
result, when all of its GSOed segments are freed, the packet is freed
automatically.
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index dda50ee..95f6ea6 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -33,18 +33,53 @@
#include <errno.h>
+#include <rte_log.h>
+
#include "rte_gso.h"
+#include "gso_common.h"
+#include "gso_tcp4.h"
int
rte_gso_segment(struct rte_mbuf *pkt,
- struct rte_gso_ctx gso_ctx __rte_unused,
+ struct rte_gso_ctx gso_ctx,
struct rte_mbuf **pkts_out,
uint16_t nb_pkts_out)
{
+ struct rte_mempool *direct_pool, *indirect_pool;
+ struct rte_mbuf *pkt_seg;
+ uint16_t gso_size;
+ uint8_t ipid_delta;
+ int ret = 1;
+
if (pkt == NULL || pkts_out == NULL || nb_pkts_out < 1)
return -EINVAL;
- pkts_out[0] = pkt;
+ if (gso_ctx.gso_size >= pkt->pkt_len ||
+ (pkt->packet_type & gso_ctx.gso_types) !=
+ pkt->packet_type) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ direct_pool = gso_ctx.direct_pool;
+ indirect_pool = gso_ctx.indirect_pool;
+ gso_size = gso_ctx.gso_size;
+ ipid_delta = gso_ctx.ipid_flag == RTE_GSO_IPID_INCREASE;
+
+ if (is_ipv4_tcp(pkt->packet_type)) {
If (is_ipv4_tcp(pkt->packet_type) && (gso_ctx->gso_types & DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
If (pkt->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_IPV4) == PKT_TX_IPV4 &&
(gso_ctx->gso_types & DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
I don't quite understand why the GSO library should be aware if the TSO
flag is set or not. Applications can query device TSO capability before
they call the GSO library. Do I misundertsand anything?
Additionally, we don't need to check if the packet is a TCP/IPv4 packet here?
Well, right now PMD we doesn't rely on ptype to figure out what type of packet and
what TX offload have to be performed.
Instead it looks at TX part of ol_flags, and
My thought was that as what we doing is actually TSO in SW, it would be good
to use the same API here too.
Also with that approach, by setting ol_flags properly user can use the same gso_ctx and still
specify what segmentation to perform on a per-packet basis.
Alternative way is to rely on ptype to distinguish should segmentation be performed on that package or not.
The only advantage I see here is that if someone would like to add GSO for some new protocol,
he wouldn't need to introduce new TX flag value for mbuf.ol_flags.
Though he still would need to update TX_OFFLOAD_* capabilities and probably packet_type definitions.
So from my perspective first variant (use HW TSO API) is more plausible.
Wonder what is your and Mark opinions here?
In the first choice, you mean:
the GSO library uses gso_ctx->gso_types and mbuf->ol_flags to call a specific GSO
segmentation function (e.g. gso_tcp4_segment(), gso_tunnel_xxx()) for each input packet.
Applications should parse the packet type, and set exactly the right DEV_TX_OFFLOAD_*_TSO
flag in gso_types and ol_flags according to the packet type. That is, the value of gso_types
is on a per-packet basis. gso_ctx->gso_types and mbuf->ol_flags are used at the same time
because DEV_TX_OFFLOAD_*_TSO only tells the tunnelling type and the inner L4 type, so
we need to learn the L3 type from ol_flags. With this design, HW segmentation and SW segmentation
are indeed consistent.
If I understand it correctly, applications need to set 'ol_flags = PKT_TX_IPV4' and
'gso_types = DEV_TX_OFFLOAD_VXLAN_TNL_TSO' for an "ether+ipv4+udp+vxlan+ether+ipv4+
tcp+payload" packet. But PKT_TX_IPV4 only gives the inner L3 type for a tunneled packet.
How about the outer L3 type? Should we always assume the inner and the outer L3 types are the same?
Jiayu
Post by Ananyev, Konstantin
Konstantin
Ananyev, Konstantin
2017-09-14 08:47:00 UTC
Hi Jiayu,
-----Original Message-----
From: Hu, Jiayu
Sent: Thursday, September 14, 2017 7:07 AM
Subject: Re: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Konstantin,
Post by Ananyev, Konstantin
Hi Jiayu,
Post by Jiayu Hu
Post by Ananyev, Konstantin
-----Original Message-----
From: Ananyev, Konstantin
Sent: Tuesday, September 12, 2017 12:18 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Post by Jiayu Hu
result, when all of its GSOed segments are freed, the packet is freed
automatically.
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index dda50ee..95f6ea6 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -33,18 +33,53 @@
#include <errno.h>
+#include <rte_log.h>
+
#include "rte_gso.h"
+#include "gso_common.h"
+#include "gso_tcp4.h"
int
rte_gso_segment(struct rte_mbuf *pkt,
- struct rte_gso_ctx gso_ctx __rte_unused,
+ struct rte_gso_ctx gso_ctx,
struct rte_mbuf **pkts_out,
uint16_t nb_pkts_out)
{
+ struct rte_mempool *direct_pool, *indirect_pool;
+ struct rte_mbuf *pkt_seg;
+ uint16_t gso_size;
+ uint8_t ipid_delta;
+ int ret = 1;
+
if (pkt == NULL || pkts_out == NULL || nb_pkts_out < 1)
return -EINVAL;
- pkts_out[0] = pkt;
+ if (gso_ctx.gso_size >= pkt->pkt_len ||
+ (pkt->packet_type & gso_ctx.gso_types) !=
+ pkt->packet_type) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ direct_pool = gso_ctx.direct_pool;
+ indirect_pool = gso_ctx.indirect_pool;
+ gso_size = gso_ctx.gso_size;
+ ipid_delta = gso_ctx.ipid_flag == RTE_GSO_IPID_INCREASE;
+
+ if (is_ipv4_tcp(pkt->packet_type)) {
If (is_ipv4_tcp(pkt->packet_type) && (gso_ctx->gso_types & DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
If (pkt->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_IPV4) == PKT_TX_IPV4 &&
(gso_ctx->gso_types & DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
I don't quite understand why the GSO library should be aware if the TSO
flag is set or not. Applications can query device TSO capability before
they call the GSO library. Do I misundertsand anything?
Additionally, we don't need to check if the packet is a TCP/IPv4 packet here?
Well, right now PMD we doesn't rely on ptype to figure out what type of packet and
what TX offload have to be performed.
Instead it looks at TX part of ol_flags, and
My thought was that as what we doing is actually TSO in SW, it would be good
to use the same API here too.
Also with that approach, by setting ol_flags properly user can use the same gso_ctx and still
specify what segmentation to perform on a per-packet basis.
Alternative way is to rely on ptype to distinguish should segmentation be performed on that package or not.
The only advantage I see here is that if someone would like to add GSO for some new protocol,
he wouldn't need to introduce new TX flag value for mbuf.ol_flags.
Though he still would need to update TX_OFFLOAD_* capabilities and probably packet_type definitions.
So from my perspective first variant (use HW TSO API) is more plausible.
Wonder what is your and Mark opinions here?
the GSO library uses gso_ctx->gso_types and mbuf->ol_flags to call a specific GSO
segmentation function (e.g. gso_tcp4_segment(), gso_tunnel_xxx()) for each input packet.
Applications should parse the packet type, and set an exactly correct DEV_TX_OFFLOAD_*_TSO
flag to gso_types and ol_flags according to the packet type. That is, the value of gso_types
is on a per-packet basis. Using gso_ctx->gso_types and mbuf->ol_flags at the same time
is because that DEV_TX_OFFLOAD_*_TSO only tells tunnelling type and the inner L4 type, and
we need to know L3 type by ol_flags. With this design, HW segmentation and SW segmentation
are indeed consistent.
If I understand it correctly, applications need to set 'ol_flags = PKT_TX_IPV4' and
'gso_types = DEV_TX_OFFLOAD_VXLAN_TNL_TSO' for a "ether+ipv4+udp+vxlan+ether+ipv4+
tcp+payload" packet. But PKT_TX_IPV4 just present the inner L3 type for tunneled packet.
How about the outer L3 type? Always assume the inner and the outer L3 type are the same?
I think that for that case you'll have to set in ol_flags:
PKT_TX_IPV4 | PKT_TX_OUTER_IPV4 | PKT_TX_TUNNEL_VXLAN | PKT_TX_TCP_SEG
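As a sketch of the corresponding mbuf setup for the ether+ipv4+udp+vxlan+ether+ipv4+tcp example above; the header-length assignments are assumptions based on the usual DPDK tunnel-TSO conventions, not something stated in this thread:

pkt->ol_flags |= PKT_TX_IPV4 | PKT_TX_OUTER_IPV4 |
		PKT_TX_TUNNEL_VXLAN | PKT_TX_TCP_SEG;
pkt->outer_l2_len = sizeof(struct ether_hdr);	/* outer Ethernet */
pkt->outer_l3_len = sizeof(struct ipv4_hdr);	/* outer IPv4 */
pkt->l2_len = sizeof(struct udp_hdr) + sizeof(struct vxlan_hdr) +
		sizeof(struct ether_hdr);	/* UDP + VxLAN + inner Ethernet */
pkt->l3_len = sizeof(struct ipv4_hdr);		/* inner IPv4 */
pkt->l4_len = sizeof(struct tcp_hdr);		/* inner TCP */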
Konstantin
Jiayu
Post by Ananyev, Konstantin
Konstantin
Hu, Jiayu
2017-09-14 09:29:13 UTC
Hi Konstantin,
-----Original Message-----
From: Ananyev, Konstantin
Sent: Thursday, September 14, 2017 4:47 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Jiayu,
-----Original Message-----
From: Hu, Jiayu
Sent: Thursday, September 14, 2017 7:07 AM
Subject: Re: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Konstantin,
Post by Ananyev, Konstantin
Hi Jiayu,
Post by Jiayu Hu
Post by Ananyev, Konstantin
-----Original Message-----
From: Ananyev, Konstantin
Sent: Tuesday, September 12, 2017 12:18 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Post by Jiayu Hu
result, when all of its GSOed segments are freed, the packet is
freed
Post by Ananyev, Konstantin
Post by Jiayu Hu
Post by Ananyev, Konstantin
Post by Jiayu Hu
automatically.
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index dda50ee..95f6ea6 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -33,18 +33,53 @@
#include <errno.h>
+#include <rte_log.h>
+
#include "rte_gso.h"
+#include "gso_common.h"
+#include "gso_tcp4.h"
int
rte_gso_segment(struct rte_mbuf *pkt,
- struct rte_gso_ctx gso_ctx __rte_unused,
+ struct rte_gso_ctx gso_ctx,
struct rte_mbuf **pkts_out,
uint16_t nb_pkts_out)
{
+ struct rte_mempool *direct_pool, *indirect_pool;
+ struct rte_mbuf *pkt_seg;
+ uint16_t gso_size;
+ uint8_t ipid_delta;
+ int ret = 1;
+
if (pkt == NULL || pkts_out == NULL || nb_pkts_out < 1)
return -EINVAL;
- pkts_out[0] = pkt;
+ if (gso_ctx.gso_size >= pkt->pkt_len ||
+ (pkt->packet_type & gso_ctx.gso_types) !=
+ pkt->packet_type) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ direct_pool = gso_ctx.direct_pool;
+ indirect_pool = gso_ctx.indirect_pool;
+ gso_size = gso_ctx.gso_size;
+ ipid_delta = gso_ctx.ipid_flag == RTE_GSO_IPID_INCREASE;
+
+ if (is_ipv4_tcp(pkt->packet_type)) {
If (is_ipv4_tcp(pkt->packet_type) && (gso_ctx->gso_types &
DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
Post by Ananyev, Konstantin
Post by Jiayu Hu
Post by Ananyev, Konstantin
If (pkt->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_IPV4) == PKT_TX_IPV4
&&
Post by Ananyev, Konstantin
Post by Jiayu Hu
Post by Ananyev, Konstantin
(gso_ctx->gso_types & DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
I don't quite understand why the GSO library should be aware if the TSO
flag is set or not. Applications can query device TSO capability before
they call the GSO library. Do I misundertsand anything?
Additionally, we don't need to check if the packet is a TCP/IPv4 packet
here?
Post by Ananyev, Konstantin
Well, right now PMD we doesn't rely on ptype to figure out what type of
packet and
Post by Ananyev, Konstantin
what TX offload have to be performed.
Instead it looks at TX part of ol_flags, and
My thought was that as what we doing is actually TSO in SW, it would be
good
Post by Ananyev, Konstantin
to use the same API here too.
Also with that approach, by setting ol_flags properly user can use the
same gso_ctx and still
Post by Ananyev, Konstantin
specify what segmentation to perform on a per-packet basis.
Alternative way is to rely on ptype to distinguish should segmentation be
performed on that package or not.
Post by Ananyev, Konstantin
The only advantage I see here is that if someone would like to add GSO
for some new protocol,
Post by Ananyev, Konstantin
he wouldn't need to introduce new TX flag value for mbuf.ol_flags.
Though he still would need to update TX_OFFLOAD_* capabilities and
probably packet_type definitions.
Post by Ananyev, Konstantin
So from my perspective first variant (use HW TSO API) is more plausible.
Wonder what is your and Mark opinions here?
the GSO library uses gso_ctx->gso_types and mbuf->ol_flags to call a
specific GSO
segmentation function (e.g. gso_tcp4_segment(), gso_tunnel_xxx()) for
each input packet.
Applications should parse the packet type, and set an exactly correct
DEV_TX_OFFLOAD_*_TSO
flag to gso_types and ol_flags according to the packet type. That is, the
value of gso_types
is on a per-packet basis. Using gso_ctx->gso_types and mbuf->ol_flags at
the same time
is because that DEV_TX_OFFLOAD_*_TSO only tells tunnelling type and the
inner L4 type, and
we need to know L3 type by ol_flags. With this design, HW segmentation
and SW segmentation
are indeed consistent.
If I understand it correctly, applications need to set 'ol_flags =
PKT_TX_IPV4' and
'gso_types = DEV_TX_OFFLOAD_VXLAN_TNL_TSO' for a
"ether+ipv4+udp+vxlan+ether+ipv4+
tcp+payload" packet. But PKT_TX_IPV4 just present the inner L3 type for
tunneled packet.
How about the outer L3 type? Always assume the inner and the outer L3
type are the same?
PKT_TX_IPV4 | PKT_TX_OUTER_IPV4 | PKT_TX_TUNNEL_VXLAN |
PKT_TX_TCP_SEG
OK, so it means PKT_TX_TCP_SEG is also used for tunneled TSO. The
GSO library doesn't need gso_types anymore.
The first choice makes HW and SW segmentation fully consistent.
Applications just need to parse the packet and set the proper ol_flags, and
the GSO library uses ol_flags to decide which segmentation function to use.
I think it's better than the second choice, which depends on ptype to
choose the segmentation function.
Jiayu
Konstantin
Jiayu
Post by Ananyev, Konstantin
Konstantin
Ananyev, Konstantin
2017-09-14 09:35:41 UTC
-----Original Message-----
From: Hu, Jiayu
Sent: Thursday, September 14, 2017 10:29 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Konstantin,
-----Original Message-----
From: Ananyev, Konstantin
Sent: Thursday, September 14, 2017 4:47 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Jiayu,
-----Original Message-----
From: Hu, Jiayu
Sent: Thursday, September 14, 2017 7:07 AM
Subject: Re: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Konstantin,
Post by Ananyev, Konstantin
Hi Jiayu,
Post by Jiayu Hu
Post by Ananyev, Konstantin
-----Original Message-----
From: Ananyev, Konstantin
Sent: Tuesday, September 12, 2017 12:18 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Post by Jiayu Hu
result, when all of its GSOed segments are freed, the packet is
freed
Post by Ananyev, Konstantin
Post by Jiayu Hu
Post by Ananyev, Konstantin
Post by Jiayu Hu
automatically.
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index dda50ee..95f6ea6 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -33,18 +33,53 @@
#include <errno.h>
+#include <rte_log.h>
+
#include "rte_gso.h"
+#include "gso_common.h"
+#include "gso_tcp4.h"
int
rte_gso_segment(struct rte_mbuf *pkt,
- struct rte_gso_ctx gso_ctx __rte_unused,
+ struct rte_gso_ctx gso_ctx,
struct rte_mbuf **pkts_out,
uint16_t nb_pkts_out)
{
+ struct rte_mempool *direct_pool, *indirect_pool;
+ struct rte_mbuf *pkt_seg;
+ uint16_t gso_size;
+ uint8_t ipid_delta;
+ int ret = 1;
+
if (pkt == NULL || pkts_out == NULL || nb_pkts_out < 1)
return -EINVAL;
- pkts_out[0] = pkt;
+ if (gso_ctx.gso_size >= pkt->pkt_len ||
+ (pkt->packet_type & gso_ctx.gso_types) !=
+ pkt->packet_type) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ direct_pool = gso_ctx.direct_pool;
+ indirect_pool = gso_ctx.indirect_pool;
+ gso_size = gso_ctx.gso_size;
+ ipid_delta = gso_ctx.ipid_flag == RTE_GSO_IPID_INCREASE;
+
+ if (is_ipv4_tcp(pkt->packet_type)) {
If (is_ipv4_tcp(pkt->packet_type) && (gso_ctx->gso_types &
DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
Post by Ananyev, Konstantin
Post by Jiayu Hu
Post by Ananyev, Konstantin
If (pkt->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_IPV4) == PKT_TX_IPV4
&&
Post by Ananyev, Konstantin
Post by Jiayu Hu
Post by Ananyev, Konstantin
(gso_ctx->gso_types & DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
I don't quite understand why the GSO library should be aware if the TSO
flag is set or not. Applications can query device TSO capability before
they call the GSO library. Do I misundertsand anything?
Additionally, we don't need to check if the packet is a TCP/IPv4 packet
here?
Post by Ananyev, Konstantin
Well, right now PMD we doesn't rely on ptype to figure out what type of
packet and
Post by Ananyev, Konstantin
what TX offload have to be performed.
Instead it looks at TX part of ol_flags, and
My thought was that as what we doing is actually TSO in SW, it would be
good
Post by Ananyev, Konstantin
to use the same API here too.
Also with that approach, by setting ol_flags properly user can use the
same gso_ctx and still
Post by Ananyev, Konstantin
specify what segmentation to perform on a per-packet basis.
Alternative way is to rely on ptype to distinguish should segmentation be
performed on that package or not.
Post by Ananyev, Konstantin
The only advantage I see here is that if someone would like to add GSO
for some new protocol,
Post by Ananyev, Konstantin
he wouldn't need to introduce new TX flag value for mbuf.ol_flags.
Though he still would need to update TX_OFFLOAD_* capabilities and
probably packet_type definitions.
Post by Ananyev, Konstantin
So from my perspective first variant (use HW TSO API) is more plausible.
Wonder what is your and Mark opinions here?
the GSO library uses gso_ctx->gso_types and mbuf->ol_flags to call a
specific GSO
segmentation function (e.g. gso_tcp4_segment(), gso_tunnel_xxx()) for
each input packet.
Applications should parse the packet type, and set an exactly correct
DEV_TX_OFFLOAD_*_TSO
flag to gso_types and ol_flags according to the packet type. That is, the
value of gso_types
is on a per-packet basis. Using gso_ctx->gso_types and mbuf->ol_flags at
the same time
is because that DEV_TX_OFFLOAD_*_TSO only tells tunnelling type and the
inner L4 type, and
we need to know L3 type by ol_flags. With this design, HW segmentation
and SW segmentation
are indeed consistent.
If I understand it correctly, applications need to set 'ol_flags =
PKT_TX_IPV4' and
'gso_types = DEV_TX_OFFLOAD_VXLAN_TNL_TSO' for a
"ether+ipv4+udp+vxlan+ether+ipv4+
tcp+payload" packet. But PKT_TX_IPV4 just present the inner L3 type for
tunneled packet.
How about the outer L3 type? Always assume the inner and the outer L3
type are the same?
PKT_TX_IPV4 | PKT_TX_OUTER_IPV4 | PKT_TX_TUNNEL_VXLAN |
PKT_TX_TCP_SEG
OK, so it means PKT_TX_TCP_SEG is also used for tunneled TSO. The
GSO library doesn't need gso_types anymore.
You still might need gso_ctx.gso_types to let the user limit which types of segmentation
that particular gso_ctx supports.
An alternative would be to assume that each gso_ctx supports all
currently implemented segmentations.
This is possible too, but probably not very convenient for the user.
Konstantin
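A sketch of a gso_ctx configured this way, under the DEV_TX_OFFLOAD_*_TSO scheme being converged on here; the mempool variables and the gso_size value are illustrative:

struct rte_gso_ctx ctx = {
	.direct_pool = direct_mp,	/* pre-created direct-mbuf pool */
	.indirect_pool = indirect_mp,	/* pre-created indirect-mbuf pool */
	.gso_types = DEV_TX_OFFLOAD_TCP_TSO |
		DEV_TX_OFFLOAD_VXLAN_TNL_TSO,	/* segmentation types allowed */
	.gso_size = 1518,
	.ipid_flag = RTE_GSO_IPID_INCREASE,
};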
The first choice makes HW and SW segmentation are totally the same.
Applications just need to parse the packet and set proper ol_flags, and
the GSO library uses ol_flags to decide which segmentation function to use.
I think it's better than the second choice which depending on ptype to
choose segmentation function.
Jiayu
Konstantin
Jiayu
Post by Ananyev, Konstantin
Konstantin
Hu, Jiayu
2017-09-14 10:01:03 UTC
Hi Konstantin and Mark,
-----Original Message-----
From: Ananyev, Konstantin
Sent: Thursday, September 14, 2017 5:36 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
-----Original Message-----
From: Hu, Jiayu
Sent: Thursday, September 14, 2017 10:29 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Konstantin,
-----Original Message-----
From: Ananyev, Konstantin
Sent: Thursday, September 14, 2017 4:47 PM
Tan,
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Jiayu,
-----Original Message-----
From: Hu, Jiayu
Sent: Thursday, September 14, 2017 7:07 AM
Tan,
Subject: Re: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Konstantin,
Post by Ananyev, Konstantin
Hi Jiayu,
Post by Jiayu Hu
Post by Ananyev, Konstantin
-----Original Message-----
From: Ananyev, Konstantin
Sent: Tuesday, September 12, 2017 12:18 PM
Jianfeng
Post by Ananyev, Konstantin
Post by Jiayu Hu
Post by Ananyev, Konstantin
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Post by Jiayu Hu
result, when all of its GSOed segments are freed, the packet is
freed
Post by Ananyev, Konstantin
Post by Jiayu Hu
Post by Ananyev, Konstantin
Post by Jiayu Hu
automatically.
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index dda50ee..95f6ea6 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -33,18 +33,53 @@
#include <errno.h>
+#include <rte_log.h>
+
#include "rte_gso.h"
+#include "gso_common.h"
+#include "gso_tcp4.h"
int
rte_gso_segment(struct rte_mbuf *pkt,
- struct rte_gso_ctx gso_ctx __rte_unused,
+ struct rte_gso_ctx gso_ctx,
struct rte_mbuf **pkts_out,
uint16_t nb_pkts_out)
{
+ struct rte_mempool *direct_pool, *indirect_pool;
+ struct rte_mbuf *pkt_seg;
+ uint16_t gso_size;
+ uint8_t ipid_delta;
+ int ret = 1;
+
if (pkt == NULL || pkts_out == NULL || nb_pkts_out < 1)
return -EINVAL;
- pkts_out[0] = pkt;
+ if (gso_ctx.gso_size >= pkt->pkt_len ||
+ (pkt->packet_type & gso_ctx.gso_types) !=
+ pkt->packet_type) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ direct_pool = gso_ctx.direct_pool;
+ indirect_pool = gso_ctx.indirect_pool;
+ gso_size = gso_ctx.gso_size;
+ ipid_delta = gso_ctx.ipid_flag == RTE_GSO_IPID_INCREASE;
+
+ if (is_ipv4_tcp(pkt->packet_type)) {
If (is_ipv4_tcp(pkt->packet_type) && (gso_ctx->gso_types &
DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
Post by Ananyev, Konstantin
Post by Jiayu Hu
Post by Ananyev, Konstantin
If (pkt->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_IPV4) ==
PKT_TX_IPV4
&&
Post by Ananyev, Konstantin
Post by Jiayu Hu
Post by Ananyev, Konstantin
(gso_ctx->gso_types & DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
I don't quite understand why the GSO library should be aware if the
TSO
Post by Ananyev, Konstantin
Post by Jiayu Hu
flag is set or not. Applications can query device TSO capability
before
Post by Ananyev, Konstantin
Post by Jiayu Hu
they call the GSO library. Do I misundertsand anything?
Additionally, we don't need to check if the packet is a TCP/IPv4
packet
here?
Post by Ananyev, Konstantin
Well, right now PMD we doesn't rely on ptype to figure out what type
of
packet and
Post by Ananyev, Konstantin
what TX offload have to be performed.
Instead it looks at TX part of ol_flags, and
My thought was that as what we doing is actually TSO in SW, it would
be
good
Post by Ananyev, Konstantin
to use the same API here too.
Also with that approach, by setting ol_flags properly user can use the
same gso_ctx and still
Post by Ananyev, Konstantin
specify what segmentation to perform on a per-packet basis.
Alternative way is to rely on ptype to distinguish should segmentation
be
performed on that package or not.
Post by Ananyev, Konstantin
The only advantage I see here is that if someone would like to add
GSO
for some new protocol,
Post by Ananyev, Konstantin
he wouldn't need to introduce new TX flag value for mbuf.ol_flags.
Though he still would need to update TX_OFFLOAD_* capabilities and
probably packet_type definitions.
Post by Ananyev, Konstantin
So from my perspective first variant (use HW TSO API) is more
plausible.
Post by Ananyev, Konstantin
Wonder what is your and Mark opinions here?
the GSO library uses gso_ctx->gso_types and mbuf->ol_flags to call a
specific GSO
segmentation function (e.g. gso_tcp4_segment(), gso_tunnel_xxx()) for
each input packet.
Applications should parse the packet type, and set an exactly correct
DEV_TX_OFFLOAD_*_TSO
flag to gso_types and ol_flags according to the packet type. That is, the
value of gso_types
is on a per-packet basis. Using gso_ctx->gso_types and mbuf->ol_flags
at
the same time
is because that DEV_TX_OFFLOAD_*_TSO only tells tunnelling type and
the
inner L4 type, and
we need to know L3 type by ol_flags. With this design, HW
segmentation
and SW segmentation
are indeed consistent.
If I understand it correctly, applications need to set 'ol_flags =
PKT_TX_IPV4' and
'gso_types = DEV_TX_OFFLOAD_VXLAN_TNL_TSO' for a
"ether+ipv4+udp+vxlan+ether+ipv4+
tcp+payload" packet. But PKT_TX_IPV4 just present the inner L3 type for
tunneled packet.
How about the outer L3 type? Always assume the inner and the outer L3
type are the same?
PKT_TX_IPV4 | PKT_TX_OUTER_IPV4 | PKT_TX_TUNNEL_VXLAN |
PKT_TX_TCP_SEG
OK, so it means PKT_TX_TCP_SEG is also used for tunneled TSO. The
GSO library doesn't need gso_types anymore.
You still might need gso_ctx.gso_types to let user limit what types of segmentation
that particular gso_ctx supports.
An alternative would be to assume that each gso_ctx supports all
currently implemented segmentations.
This is possible too, but probably not very convenient to the user.
Hmm, makes sense.
One thing to confirm: should the value of gso_types be DEV_TX_OFFLOAD_*_TSO,
or new macros?
Jiayu
Konstantin
The first choice makes HW and SW segmentation are totally the same.
Applications just need to parse the packet and set proper ol_flags, and
the GSO library uses ol_flags to decide which segmentation function to use.
I think it's better than the second choice which depending on ptype to
choose segmentation function.
Jiayu
Konstantin
Jiayu
Post by Ananyev, Konstantin
Konstantin
Kavanagh, Mark B
2017-09-14 15:42:01 UTC
From: Hu, Jiayu
Sent: Thursday, September 14, 2017 11:01 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Konstantin and Mark,
-----Original Message-----
From: Ananyev, Konstantin
Sent: Thursday, September 14, 2017 5:36 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
-----Original Message-----
From: Hu, Jiayu
Sent: Thursday, September 14, 2017 10:29 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Konstantin,
-----Original Message-----
From: Ananyev, Konstantin
Sent: Thursday, September 14, 2017 4:47 PM
Tan,
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Jiayu,
-----Original Message-----
From: Hu, Jiayu
Sent: Thursday, September 14, 2017 7:07 AM
Tan,
Subject: Re: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Konstantin,
Post by Ananyev, Konstantin
Hi Jiayu,
Post by Jiayu Hu
Post by Ananyev, Konstantin
-----Original Message-----
From: Ananyev, Konstantin
Sent: Tuesday, September 12, 2017 12:18 PM
Jianfeng
Post by Ananyev, Konstantin
Post by Jiayu Hu
Post by Ananyev, Konstantin
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Post by Jiayu Hu
result, when all of its GSOed segments are freed, the packet
is
freed
Post by Ananyev, Konstantin
Post by Jiayu Hu
Post by Ananyev, Konstantin
Post by Jiayu Hu
automatically.
diff --git a/lib/librte_gso/rte_gso.c
b/lib/librte_gso/rte_gso.c
Post by Ananyev, Konstantin
Post by Jiayu Hu
Post by Ananyev, Konstantin
Post by Jiayu Hu
index dda50ee..95f6ea6 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -33,18 +33,53 @@
#include <errno.h>
+#include <rte_log.h>
+
#include "rte_gso.h"
+#include "gso_common.h"
+#include "gso_tcp4.h"
int
rte_gso_segment(struct rte_mbuf *pkt,
- struct rte_gso_ctx gso_ctx __rte_unused,
+ struct rte_gso_ctx gso_ctx,
struct rte_mbuf **pkts_out,
uint16_t nb_pkts_out)
{
+ struct rte_mempool *direct_pool, *indirect_pool;
+ struct rte_mbuf *pkt_seg;
+ uint16_t gso_size;
+ uint8_t ipid_delta;
+ int ret = 1;
+
if (pkt == NULL || pkts_out == NULL || nb_pkts_out < 1)
return -EINVAL;
- pkts_out[0] = pkt;
+ if (gso_ctx.gso_size >= pkt->pkt_len ||
+ (pkt->packet_type & gso_ctx.gso_types) !=
+ pkt->packet_type) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ direct_pool = gso_ctx.direct_pool;
+ indirect_pool = gso_ctx.indirect_pool;
+ gso_size = gso_ctx.gso_size;
+ ipid_delta = gso_ctx.ipid_flag == RTE_GSO_IPID_INCREASE;
+
+ if (is_ipv4_tcp(pkt->packet_type)) {
If (is_ipv4_tcp(pkt->packet_type) && (gso_ctx->gso_types &
DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
Post by Ananyev, Konstantin
Post by Jiayu Hu
Post by Ananyev, Konstantin
If (pkt->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_IPV4) ==
PKT_TX_IPV4
&&
Post by Ananyev, Konstantin
Post by Jiayu Hu
Post by Ananyev, Konstantin
(gso_ctx->gso_types & DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
I don't quite understand why the GSO library should be aware if
the
TSO
Post by Ananyev, Konstantin
Post by Jiayu Hu
flag is set or not. Applications can query device TSO capability
before
Post by Ananyev, Konstantin
Post by Jiayu Hu
they call the GSO library. Do I misundertsand anything?
Additionally, we don't need to check if the packet is a TCP/IPv4
packet
here?
Post by Ananyev, Konstantin
Well, right now PMD we doesn't rely on ptype to figure out what
type
of
packet and
Post by Ananyev, Konstantin
what TX offload have to be performed.
Instead it looks at TX part of ol_flags, and
My thought was that as what we doing is actually TSO in SW, it would
be
good
Post by Ananyev, Konstantin
to use the same API here too.
Also with that approach, by setting ol_flags properly user can use
the
same gso_ctx and still
Post by Ananyev, Konstantin
specify what segmentation to perform on a per-packet basis.
Alternative way is to rely on ptype to distinguish should
segmentation
be
performed on that package or not.
Post by Ananyev, Konstantin
The only advantage I see here is that if someone would like to add
GSO
for some new protocol,
Post by Ananyev, Konstantin
he wouldn't need to introduce new TX flag value for mbuf.ol_flags.
Though he still would need to update TX_OFFLOAD_* capabilities and
probably packet_type definitions.
Post by Ananyev, Konstantin
So from my perspective first variant (use HW TSO API) is more
plausible.
Post by Ananyev, Konstantin
Wonder what is your and Mark opinions here?
the GSO library uses gso_ctx->gso_types and mbuf->ol_flags to call a
specific GSO
segmentation function (e.g. gso_tcp4_segment(), gso_tunnel_xxx()) for
each input packet.
Applications should parse the packet type, and set an exactly correct
DEV_TX_OFFLOAD_*_TSO
flag to gso_types and ol_flags according to the packet type. That is,
the
value of gso_types
is on a per-packet basis. Using gso_ctx->gso_types and mbuf->ol_flags
at
the same time
is because that DEV_TX_OFFLOAD_*_TSO only tells tunnelling type and
the
inner L4 type, and
we need to know L3 type by ol_flags. With this design, HW
segmentation
and SW segmentation
are indeed consistent.
If I understand it correctly, applications need to set 'ol_flags =
PKT_TX_IPV4' and
'gso_types = DEV_TX_OFFLOAD_VXLAN_TNL_TSO' for a
"ether+ipv4+udp+vxlan+ether+ipv4+
tcp+payload" packet. But PKT_TX_IPV4 just present the inner L3 type
for
tunneled packet.
How about the outer L3 type? Always assume the inner and the outer L3
type are the same?
PKT_TX_IPV4 | PKT_TX_OUTER_IPV4 | PKT_TX_TUNNEL_VXLAN |
PKT_TX_TCP_SEG
OK, so it means PKT_TX_TCP_SEG is also used for tunneled TSO. The
GSO library doesn't need gso_types anymore.
You still might need gso_ctx.gso_types to let user limit what types of segmentation
that particular gso_ctx supports.
An alternative would be to assume that each gso_ctx supports all
currently implemented segmentations.
This is possible too, but probably not very convenient to the user.
Hmm, make sense.
One thing to confirm: the value of gso_types should be DEV_TX_OFFLOAD_*_TSO,
or new macros?
Hi Jiayu, Konstantin,
I think that the existing macros are fine, as they provide a consistent view of segmentation capabilities to the application/user.
I was initially concerned that they might be too coarse-grained (i.e. only IPv4 is currently supported, and not IPv6), but as per Konstantin's previous example, the DEV_TX_OFFLOAD_*_TSO macros can be used in concert with the packet type to determine whether a packet should be fragmented or not.
Thanks,
Mark
Jiayu
Konstantin
The first choice makes HW and SW segmentation are totally the same.
Applications just need to parse the packet and set proper ol_flags, and
the GSO library uses ol_flags to decide which segmentation function to
use.
I think it's better than the second choice which depending on ptype to
choose segmentation function.
Jiayu
Konstantin
Jiayu
Post by Ananyev, Konstantin
Konstantin
Ananyev, Konstantin
2017-09-14 18:38:42 UTC
-----Original Message-----
From: Kavanagh, Mark B
Sent: Thursday, September 14, 2017 4:42 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
From: Hu, Jiayu
Sent: Thursday, September 14, 2017 11:01 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Konstantin and Mark,
-----Original Message-----
From: Ananyev, Konstantin
Sent: Thursday, September 14, 2017 5:36 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
-----Original Message-----
From: Hu, Jiayu
Sent: Thursday, September 14, 2017 10:29 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Konstantin,
-----Original Message-----
From: Ananyev, Konstantin
Sent: Thursday, September 14, 2017 4:47 PM
Tan,
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Jiayu,
-----Original Message-----
From: Hu, Jiayu
Sent: Thursday, September 14, 2017 7:07 AM
Tan,
Subject: Re: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Konstantin,
Post by Ananyev, Konstantin
Hi Jiayu,
Post by Jiayu Hu
Post by Ananyev, Konstantin
-----Original Message-----
From: Ananyev, Konstantin
Sent: Tuesday, September 12, 2017 12:18 PM
Jianfeng
Post by Ananyev, Konstantin
Post by Jiayu Hu
Post by Ananyev, Konstantin
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Post by Jiayu Hu
result, when all of its GSOed segments are freed, the packet
is
freed
Post by Ananyev, Konstantin
Post by Jiayu Hu
Post by Ananyev, Konstantin
Post by Jiayu Hu
automatically.
diff --git a/lib/librte_gso/rte_gso.c
b/lib/librte_gso/rte_gso.c
Post by Ananyev, Konstantin
Post by Jiayu Hu
Post by Ananyev, Konstantin
Post by Jiayu Hu
index dda50ee..95f6ea6 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -33,18 +33,53 @@
#include <errno.h>
+#include <rte_log.h>
+
#include "rte_gso.h"
+#include "gso_common.h"
+#include "gso_tcp4.h"
int
rte_gso_segment(struct rte_mbuf *pkt,
- struct rte_gso_ctx gso_ctx __rte_unused,
+ struct rte_gso_ctx gso_ctx,
struct rte_mbuf **pkts_out,
uint16_t nb_pkts_out)
{
+ struct rte_mempool *direct_pool, *indirect_pool;
+ struct rte_mbuf *pkt_seg;
+ uint16_t gso_size;
+ uint8_t ipid_delta;
+ int ret = 1;
+
if (pkt == NULL || pkts_out == NULL || nb_pkts_out < 1)
return -EINVAL;
- pkts_out[0] = pkt;
+ if (gso_ctx.gso_size >= pkt->pkt_len ||
+ (pkt->packet_type & gso_ctx.gso_types) !=
+ pkt->packet_type) {
+ pkts_out[0] = pkt;
+ return ret;
+ }
+
+ direct_pool = gso_ctx.direct_pool;
+ indirect_pool = gso_ctx.indirect_pool;
+ gso_size = gso_ctx.gso_size;
+ ipid_delta = gso_ctx.ipid_flag == RTE_GSO_IPID_INCREASE;
+
+ if (is_ipv4_tcp(pkt->packet_type)) {
If (is_ipv4_tcp(pkt->packet_type) && (gso_ctx->gso_types &
DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
Post by Ananyev, Konstantin
Post by Jiayu Hu
Post by Ananyev, Konstantin
If (pkt->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_IPV4) ==
PKT_TX_IPV4
&&
Post by Ananyev, Konstantin
Post by Jiayu Hu
Post by Ananyev, Konstantin
(gso_ctx->gso_types & DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
I don't quite understand why the GSO library should be aware if
the
TSO
Post by Ananyev, Konstantin
Post by Jiayu Hu
flag is set or not. Applications can query device TSO capability
before
Post by Ananyev, Konstantin
Post by Jiayu Hu
they call the GSO library. Do I misundertsand anything?
Additionally, we don't need to check if the packet is a TCP/IPv4
packet
here?
Post by Ananyev, Konstantin
Well, right now PMD we doesn't rely on ptype to figure out what
type
of
packet and
Post by Ananyev, Konstantin
what TX offload have to be performed.
Instead it looks at TX part of ol_flags, and
My thought was that as what we doing is actually TSO in SW, it would
be
good
Post by Ananyev, Konstantin
to use the same API here too.
Also with that approach, by setting ol_flags properly user can use
the
same gso_ctx and still
Post by Ananyev, Konstantin
specify what segmentation to perform on a per-packet basis.
Alternative way is to rely on ptype to distinguish should
segmentation
be
performed on that package or not.
Post by Ananyev, Konstantin
The only advantage I see here is that if someone would like to add
GSO
for some new protocol,
Post by Ananyev, Konstantin
he wouldn't need to introduce new TX flag value for mbuf.ol_flags.
Though he still would need to update TX_OFFLOAD_* capabilities and
probably packet_type definitions.
Post by Ananyev, Konstantin
So from my perspective first variant (use HW TSO API) is more
plausible.
Post by Ananyev, Konstantin
Wonder what is your and Mark opinions here?
the GSO library uses gso_ctx->gso_types and mbuf->ol_flags to call a
specific GSO
segmentation function (e.g. gso_tcp4_segment(), gso_tunnel_xxx()) for
each input packet.
Applications should parse the packet type, and set an exactly correct
DEV_TX_OFFLOAD_*_TSO
flag to gso_types and ol_flags according to the packet type. That is,
the
value of gso_types
is on a per-packet basis. Using gso_ctx->gso_types and mbuf->ol_flags
at
the same time
is because that DEV_TX_OFFLOAD_*_TSO only tells tunnelling type and
the
inner L4 type, and
we need to know L3 type by ol_flags. With this design, HW
segmentation
and SW segmentation
are indeed consistent.
If I understand it correctly, applications need to set 'ol_flags =
PKT_TX_IPV4' and
'gso_types = DEV_TX_OFFLOAD_VXLAN_TNL_TSO' for a
"ether+ipv4+udp+vxlan+ether+ipv4+
tcp+payload" packet. But PKT_TX_IPV4 just present the inner L3 type
for
tunneled packet.
How about the outer L3 type? Always assume the inner and the outer L3
type are the same?
PKT_TX_IPV4 | PKT_TX_OUTER_IPV4 | PKT_TX_TUNNEL_VXLAN |
PKT_TX_TCP_SEG
OK, so it means PKT_TX_TCP_SEG is also used for tunneled TSO. The
GSO library doesn't need gso_types anymore.
You still might need gso_ctx.gso_types to let user limit what types of segmentation
that particular gso_ctx supports.
An alternative would be to assume that each gso_ctx supports all
currently implemented segmentations.
This is possible too, but probably not very convenient to the user.
Hmm, make sense.
One thing to confirm: the value of gso_types should be DEV_TX_OFFLOAD_*_TSO,
or new macros?
Hi Jiayu, Konstantin,
I think that the existing macros are fine, as they provide a consistent view of segmentation capabilities to the application/user.
+1
I also think it is better to re-use DEV_TX_OFFLOAD_*_TSO.
I was initially concerned that they might be too coarse-grained (i.e. only IPv4 is currently supported, and not IPv6), but as per Konstantin's
previous example, the DEV_TX_OFFLOAD_*_TSO macros can be used in concert with the packet type to determine whether a packet should
be fragmented or not.
Thanks,
Mark
Jiayu
Konstantin
The first choice makes HW and SW segmentation are totally the same.
Applications just need to parse the packet and set proper ol_flags, and
the GSO library uses ol_flags to decide which segmentation function to
use.
I think it's better than the second choice which depending on ptype to
choose segmentation function.
Jiayu
Konstantin
Jiayu
Post by Ananyev, Konstantin
Konstantin
Hu, Jiayu
2017-09-15 07:54:40 UTC
Hi Konstantin,
-----Original Message-----
From: Ananyev, Konstantin
Sent: Friday, September 15, 2017 2:39 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support

Post by Jiayu Hu
As a result, when all of its GSOed segments are freed, the packet is freed
automatically.

diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index dda50ee..95f6ea6 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -33,18 +33,53 @@
 #include <errno.h>
+#include <rte_log.h>
+
 #include "rte_gso.h"
+#include "gso_common.h"
+#include "gso_tcp4.h"

 int
 rte_gso_segment(struct rte_mbuf *pkt,
-		struct rte_gso_ctx gso_ctx __rte_unused,
+		struct rte_gso_ctx gso_ctx,
 		struct rte_mbuf **pkts_out,
 		uint16_t nb_pkts_out)
 {
+	struct rte_mempool *direct_pool, *indirect_pool;
+	struct rte_mbuf *pkt_seg;
+	uint16_t gso_size;
+	uint8_t ipid_delta;
+	int ret = 1;
+
 	if (pkt == NULL || pkts_out == NULL || nb_pkts_out < 1)
 		return -EINVAL;
-	pkts_out[0] = pkt;
+	if (gso_ctx.gso_size >= pkt->pkt_len ||
+			(pkt->packet_type & gso_ctx.gso_types) !=
+			pkt->packet_type) {
+		pkts_out[0] = pkt;
+		return ret;
+	}
+
+	direct_pool = gso_ctx.direct_pool;
+	indirect_pool = gso_ctx.indirect_pool;
+	gso_size = gso_ctx.gso_size;
+	ipid_delta = gso_ctx.ipid_flag == RTE_GSO_IPID_INCREASE;
+
+	if (is_ipv4_tcp(pkt->packet_type)) {

Post by Ananyev, Konstantin
If (is_ipv4_tcp(pkt->packet_type) && (gso_ctx->gso_types &
DEV_TX_OFFLOAD_TCP_TSO) != 0) {...
If (pkt->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_IPV4) == PKT_TX_IPV4 &&
(gso_ctx->gso_types & DEV_TX_OFFLOAD_TCP_TSO) != 0) {...

Post by Jiayu Hu
I don't quite understand why the GSO library should be aware if the TSO
flag is set or not. Applications can query device TSO capability before
they call the GSO library. Do I misunderstand anything? Additionally, we
don't need to check if the packet is a TCP/IPv4 packet here?

Post by Ananyev, Konstantin
Well, right now the PMD doesn't rely on ptype to figure out what type of
packet it is and what TX offloads have to be performed; instead it looks
at the TX part of ol_flags. My thought was that, as what we are doing is
actually TSO in SW, it would be good to use the same API here too. Also,
with that approach, by setting ol_flags properly the user can use the same
gso_ctx and still specify what segmentation to perform on a per-packet
basis. The alternative way is to rely on ptype to distinguish whether
segmentation should be performed on that packet or not. The only advantage
I see there is that if someone would like to add GSO for some new
protocol, he wouldn't need to introduce a new TX flag value for
mbuf.ol_flags, though he still would need to update the TX_OFFLOAD_*
capabilities and probably the packet_type definitions. So from my
perspective the first variant (use the HW TSO API) is more plausible.
Wonder what your and Mark's opinions are here?

Post by Jiayu Hu
The GSO library uses gso_ctx->gso_types and mbuf->ol_flags to call a
specific GSO segmentation function (e.g. gso_tcp4_segment(),
gso_tunnel_xxx()) for each input packet. Applications should parse the
packet type, and set an exactly correct DEV_TX_OFFLOAD_*_TSO flag in
gso_types and ol_flags according to the packet type. That is, the value of
gso_types is on a per-packet basis. We use gso_ctx->gso_types and
mbuf->ol_flags at the same time because DEV_TX_OFFLOAD_*_TSO only tells
the tunnelling type and the inner L4 type, and we need to know the L3 type
from ol_flags. With this design, HW segmentation and SW segmentation are
indeed consistent.
If I understand it correctly, applications need to set 'ol_flags =
PKT_TX_IPV4' and 'gso_types = DEV_TX_OFFLOAD_VXLAN_TNL_TSO' for an
"ether+ipv4+udp+vxlan+ether+ipv4+tcp+payload" packet. But PKT_TX_IPV4 just
presents the inner L3 type for a tunneled packet. How about the outer L3
type? Always assume the inner and the outer L3 types are the same?

Post by Ananyev, Konstantin
PKT_TX_IPV4 | PKT_TX_OUTER_IPV4 | PKT_TX_TUNNEL_VXLAN | PKT_TX_TCP_SEG

Post by Jiayu Hu
OK, so it means PKT_TX_TCP_SEG is also used for tunneled TSO. The GSO
library doesn't need gso_types anymore.

Post by Ananyev, Konstantin
You still might need gso_ctx.gso_types to let the user limit what types of
segmentation that particular gso_ctx supports. An alternative would be to
assume that each gso_ctx supports all currently implemented segmentations.
This is possible too, but probably not very convenient to the user.

Post by Jiayu Hu
Hmm, makes sense. One thing to confirm: should the value of gso_types be
DEV_TX_OFFLOAD_*_TSO, or new macros?

Post by Kavanagh, Mark B
Hi Jiayu, Konstantin,
I think that the existing macros are fine, as they provide a consistent
view of segmentation capabilities to the application/user. I was initially
concerned that they might be too coarse-grained (i.e. only IPv4 is
currently supported, and not IPv6), but as per Konstantin's previous
example, the DEV_TX_OFFLOAD_*_TSO macros can be used in concert with the
packet type to determine whether a packet should be fragmented or not.

Post by Ananyev, Konstantin
+1
I also think it is better to re-use DEV_TX_OFFLOAD_*_TSO.
There might be an 'issue' if we use PKT_TX_TCP_SEG to tell the GSO library
whether to segment a packet or not. Consider the scenario where an
application only wants to do GSO and doesn't want to use TSO: the
application sets 'mbuf->ol_flags = PKT_TX_TCP_SEG' but doesn't set
mbuf->tso_segsz. The GSO library then segments the packet, and all output
GSO segments have the same ol_flags as the input packet (in the current
GSO library design). The output GSO segments are then passed to
rte_eth_tx_prepare(). If the NIC is i40e, its TX prepare function,
i40e_prep_pkts(), checks whether mbuf->tso_segsz is in the range
I40E_MIN_TSO_MSS to I40E_MAX_TSO_MSS when PKT_TX_TCP_SEG is set. So an
error happens in this scenario, since tso_segsz is 0.

In fact, it may confuse the PMD driver when PKT_TX_TCP_SEG is set but we
don't want to do TSO. One solution is for the GSO library to remove the
PKT_TX_TCP_SEG flag from all GSO segments after it finishes segmenting.
I wonder what your and Mark's opinions are.
Thanks,
Jiayu
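[Editorial note: a minimal sketch of the workaround Jiayu proposes above —
clearing the TSO request flag on the output segments after software
segmentation, so that a PMD's tx_prepare (e.g. i40e_prep_pkts) will not
validate tso_segsz on them. The helper name is hypothetical, not part of
the patch set.]

#include <rte_mbuf.h>

/* Hypothetical post-processing step, not the library's actual code:
 * after rte_gso_segment() succeeds, drop PKT_TX_TCP_SEG from every
 * output segment so the PMD no longer treats them as TSO packets.
 */
static void
gso_clear_tso_flag(struct rte_mbuf **segs, uint16_t nb_segs)
{
	uint16_t i;

	for (i = 0; i < nb_segs; i++)
		segs[i]->ol_flags &= ~PKT_TX_TCP_SEG;
}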
Ananyev, Konstantin
2017-09-15 08:15:53 UTC
Permalink
Hi Jiayu,
-----Original Message-----
From: Hu, Jiayu
Sent: Friday, September 15, 2017 8:55 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Post by Jiayu Hu
In fact, it may confuse the PMD driver when PKT_TX_TCP_SEG is set but we
don't want to do TSO. One solution is for the GSO library to remove the
PKT_TX_TCP_SEG flag from all GSO segments after it finishes segmenting.
Yes, that was my thought too: after successful segmentation we probably
need to clean up the related ol_flags.
Konstantin
Ananyev, Konstantin
2017-09-15 08:17:04 UTC
Permalink
-----Original Message-----
From: Ananyev, Konstantin
Sent: Friday, September 15, 2017 9:16 AM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Post by Jiayu Hu
One solution is for the GSO library to remove the PKT_TX_TCP_SEG flag from
all GSO segments after it finishes segmenting.
Post by Ananyev, Konstantin
Yes, that was my thought too: after successful segmentation we probably
need to clean up the related ol_flags.
In fact, we just don't need to set these flags in our newly created segments.
Konstantin
Hu, Jiayu
2017-09-15 08:38:20 UTC
Permalink
-----Original Message-----
From: Ananyev, Konstantin
Sent: Friday, September 15, 2017 4:17 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Post by Ananyev, Konstantin
Yes, that was my thought too: after successful segmentation we probably
need to clean up the related ol_flags. In fact, we just don't need to set
these flags in our newly created segments.
+1. PKT_TX_TCP_SEG is not needed, but others, like PKT_TX_IPV4, should be
kept, since they may also be used by other HW offloads, like checksum.

Thanks,
Jiayu
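[Editorial note: to make the flag split concrete, a hedged illustration —
flag names come from rte_mbuf.h, but the exact post-segmentation handling
is what this thread is deciding — of the ol_flags for a VxLAN-tunneled
TCP/IPv4 packet, before and after SW segmentation.]

#include <rte_mbuf.h>

/* Illustrative only: the input mbuf requests SW segmentation plus
 * checksum offloads; per the discussion above, an output segment keeps
 * the checksum-related flags but drops the segmentation request.
 */
static void
example_flags(struct rte_mbuf *in, struct rte_mbuf *seg)
{
	in->ol_flags = PKT_TX_OUTER_IPV4 | PKT_TX_TUNNEL_VXLAN |
			PKT_TX_IPV4 | PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM |
			PKT_TX_TCP_SEG;

	/* after rte_gso_segment(): */
	seg->ol_flags = in->ol_flags & ~PKT_TX_TCP_SEG;
}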
Kavanagh, Mark B
2017-09-14 08:51:56 UTC
Permalink
From: Hu, Jiayu
Sent: Thursday, September 14, 2017 7:07 AM
Subject: Re: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Post by Jiayu Hu
If I understand it correctly, applications need to set 'ol_flags =
PKT_TX_IPV4' and 'gso_types = DEV_TX_OFFLOAD_VXLAN_TNL_TSO' for an
"ether+ipv4+udp+vxlan+ether+ipv4+tcp+payload" packet. But PKT_TX_IPV4 just
presents the inner L3 type for a tunneled packet. How about the outer L3
type? Always assume the inner and the outer L3 types are the same?
Hi Jiayu,

If I'm not mistaken, I think what Konstantin is suggesting is as follows:

- The DEV_TX_OFFLOAD_*_TSO flags are currently used to describe a NIC's TSO capabilities; the GSO capabilities may also be described using the same macros, to provide a consistent view of segmentation capabilities across the HW and SW implementations.

- As part of segmentation, it's still a case of checking the packet type, but then setting the appropriate ol_flags in the mbuf, which the GSO library can use to segment the packet.

Thanks,
Mark
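[Editorial note: a minimal sketch of that flow, added here for
illustration; it assumes the application classifies the packet in SW with
rte_net_get_ptype(), and it follows the by-value gso_ctx prototype quoted
earlier in the thread — it is not code from the patch set.]

#include <rte_ethdev.h>
#include <rte_gso.h>
#include <rte_mbuf.h>
#include <rte_net.h>

/* Illustrative helper: parse the packet type, request segmentation via
 * the same ol_flags the HW TSO path uses, then let the GSO library pick
 * the matching segmentation function.
 */
static int
sw_segment(struct rte_mbuf *pkt, struct rte_gso_ctx gso_ctx,
		struct rte_mbuf **segs, uint16_t nb_segs)
{
	pkt->packet_type = rte_net_get_ptype(pkt, NULL, RTE_PTYPE_ALL_MASK);

	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type) &&
			(pkt->packet_type & RTE_PTYPE_L4_MASK) ==
			RTE_PTYPE_L4_TCP)
		pkt->ol_flags |= PKT_TX_IPV4 | PKT_TX_TCP_SEG;

	return rte_gso_segment(pkt, gso_ctx, segs, nb_segs);
}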
Hu, Jiayu
2017-09-14 09:45:49 UTC
Permalink
Hi Mark,
-----Original Message-----
From: Kavanagh, Mark B
Sent: Thursday, September 14, 2017 4:52 PM
Subject: RE: [PATCH v3 2/5] gso: add TCP/IPv4 GSO support
Hi Jiayu,
- The DEV_TX_OFFLOAD_*_TSO flags are currently used to describe a NIC's
TSO capabilities; the GSO capabilities may also be described using the same
macros, to provide a consistent view of segmentation capabilities across the
HW and SW implementations.
Yes, the DEV_TX_OFFLOAD_*_TSO values stored in gso_types are used by
applications to tell the GSO library what GSO types are required. The GSO
library uses ol_flags to decide which segmentation function to use.

Thanks,
Jiayu
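[Editorial note: for illustration, a sketch of the division of labour
Jiayu describes — gso_types advertises which segmentations a context may
perform, re-using the DEV_TX_OFFLOAD_*_TSO macros, while per-packet
ol_flags select the actual function. Field names follow the rte_gso_ctx
usage quoted earlier; the helper itself is hypothetical.]

#include <rte_ethdev.h>
#include <rte_gso.h>
#include <rte_mempool.h>

/* Hypothetical setup code: one gso_ctx limited to the segmentation
 * types this application actually needs.
 */
static void
init_gso_ctx(struct rte_gso_ctx *ctx, struct rte_mempool *direct,
		struct rte_mempool *indirect, uint16_t segsz)
{
	ctx->direct_pool = direct;
	ctx->indirect_pool = indirect;
	ctx->gso_types = DEV_TX_OFFLOAD_TCP_TSO |
			DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
			DEV_TX_OFFLOAD_GRE_TNL_TSO;
	ctx->gso_size = segsz;
	ctx->ipid_flag = RTE_GSO_IPID_INCREASE;
}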
Jiayu Hu
2017-09-12 02:43:30 UTC
Permalink
From: Mark Kavanagh <***@intel.com>

This patch adds GSO support for GRE-tunneled packets. Supported GRE
packets must contain an outer IPv4 header, and inner TCP/IPv4 headers.
They may also contain a single VLAN tag. GRE GSO assumes that all input
packets have correct checksums and doesn't update checksums for output
packets. Additionally, it doesn't process IP fragmented packets.

As with VxLAN GSO, GRE GSO uses a two-segment MBUF to organize each
output packet, which requires multi-segment mbuf support in the TX
functions of the NIC driver. Also, if a packet is GSOed, GRE GSO reduces
its MBUF refcnt by 1. As a result, when all of its GSOed segments are
freed, the packet is freed automatically.

Signed-off-by: Mark Kavanagh <***@intel.com>
Signed-off-by: Jiayu Hu <***@intel.com>
---
lib/librte_gso/gso_common.c | 22 ++++++++++++++++++++++
lib/librte_gso/gso_common.h | 19 +++++++++++++++++++
lib/librte_gso/rte_gso.c | 3 ++-
3 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/lib/librte_gso/gso_common.c b/lib/librte_gso/gso_common.c
index c6779d0..bd56924 100644
--- a/lib/librte_gso/gso_common.c
+++ b/lib/librte_gso/gso_common.c
@@ -37,6 +37,7 @@
#include <rte_memcpy.h>
#include <rte_mempool.h>
#include <rte_ether.h>
+#include <rte_gre.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
@@ -237,12 +238,33 @@ update_ipv4_vxlan_tcp4_header(struct rte_mbuf *pkt, uint8_t ipid_delta,
update_inner_tcp4_header(pkt, ipid_delta, segs, nb_segs);
}

+static inline void
+update_ipv4_gre_tcp4_header(struct rte_mbuf *pkt, uint8_t ipid_delta,
+ struct rte_mbuf **segs, uint16_t nb_segs)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ uint16_t i, id;
+
+ ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) +
+ pkt->outer_l2_len);
+ id = rte_be_to_cpu_16(ipv4_hdr->packet_id);
+ for (i = 0; i < nb_segs; i++) {
+ update_outer_ipv4_header(segs[i], id);
+ id += ipid_delta;
+ }
+
+ /* Update inner TCP/IPv4 headers */
+ update_inner_tcp4_header(pkt, ipid_delta, segs, nb_segs);
+}
+
void
gso_update_pkt_headers(struct rte_mbuf *pkt, uint8_t ipid_delta,
struct rte_mbuf **segs, uint16_t nb_segs)
{
if (is_ipv4_vxlan_ipv4_tcp(pkt->packet_type))
update_ipv4_vxlan_tcp4_header(pkt, ipid_delta, segs, nb_segs);
+ else if (is_ipv4_gre_ipv4_tcp(pkt->packet_type))
+ update_ipv4_gre_tcp4_header(pkt, ipid_delta, segs, nb_segs);
else if (is_ipv4_tcp(pkt->packet_type))
update_inner_tcp4_header(pkt, ipid_delta, segs, nb_segs);
}
diff --git a/lib/librte_gso/gso_common.h b/lib/librte_gso/gso_common.h
index 2377a1d..f6d3238 100644
--- a/lib/librte_gso/gso_common.h
+++ b/lib/librte_gso/gso_common.h
@@ -89,6 +89,25 @@ static inline uint8_t is_ipv4_vxlan_ipv4_tcp(uint32_t ptype)
}
}

+#define ETHER_GRE_TCP (RTE_PTYPE_L2_ETHER | RTE_PTYPE_TUNNEL_GRE | \
+ RTE_PTYPE_INNER_L4_TCP)
+#define ETHER_VLAN_GRE_TCP (RTE_PTYPE_L2_ETHER_VLAN | RTE_PTYPE_TUNNEL_GRE | \
+ RTE_PTYPE_INNER_L4_TCP)
+static inline uint8_t is_ipv4_gre_ipv4_tcp(uint32_t ptype)
+{
+ uint32_t type;
+
+ type = ptype & (~(RTE_PTYPE_L3_MASK | RTE_PTYPE_INNER_L3_MASK));
+ switch (type) {
+ case ETHER_GRE_TCP:
+ case ETHER_VLAN_GRE_TCP:
+ return (RTE_ETH_IS_IPV4_HDR(ptype) > 0) ?
+ IS_INNER_IPV4_HDR(ptype & RTE_PTYPE_INNER_L3_MASK) : 0;
+ default:
+ return 0;
+ }
+}
+
/**
* Internal function which updates relevant packet headers, following
* segmentation. This is required to update, for example, the IPv4
diff --git a/lib/librte_gso/rte_gso.c b/lib/librte_gso/rte_gso.c
index 226c75a..e0925ae 100644
--- a/lib/librte_gso/rte_gso.c
+++ b/lib/librte_gso/rte_gso.c
@@ -67,7 +67,8 @@ rte_gso_segment(struct rte_mbuf *pkt,
gso_size = gso_ctx.gso_size;
ipid_delta = gso_ctx.ipid_flag == RTE_GSO_IPID_INCREASE;

- if (is_ipv4_vxlan_ipv4_tcp(pkt->packet_type)) {
+ if (is_ipv4_vxlan_ipv4_tcp(pkt->packet_type) ||
+ is_ipv4_gre_ipv4_tcp(pkt->packet_type)) {
ret = gso_tunnel_tcp4_segment(pkt, gso_size, ipid_delta,
direct_pool, indirect_pool,
pkts_out, nb_pkts_out);
--
2.7.4
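[Editorial note: a sketch of the mbuf accounting the commit message above
implies; the helper and its error handling are simplified assumptions, not
the library's internals.]

#include <rte_mbuf.h>

/* Illustrative sketch: each output GSO packet is a two-segment chain —
 * a direct mbuf holding new headers plus an indirect mbuf attached to
 * the input packet's payload. rte_pktmbuf_attach() bumps the input's
 * refcnt; the library then drops its own reference by 1, so freeing all
 * output segments frees the input packet automatically.
 */
static struct rte_mbuf *
make_gso_segment(struct rte_mbuf *in, struct rte_mempool *direct_pool,
		struct rte_mempool *indirect_pool)
{
	struct rte_mbuf *hdr = rte_pktmbuf_alloc(direct_pool);
	struct rte_mbuf *payload = rte_pktmbuf_alloc(indirect_pool);

	if (hdr == NULL || payload == NULL)
		return NULL; /* cleanup elided in this sketch */

	rte_pktmbuf_attach(payload, in);	/* refcnt(in)++ */
	hdr->next = payload;
	hdr->nb_segs = 2;
	return hdr;
}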
Jiayu Hu
2017-09-12 02:43:31 UTC
Permalink
This patch adds GSO support to the csum forwarding engine. Oversized
packets transmitted over a GSO-enabled port will undergo segmentation
(with the exception of packet-types unsupported by the GSO library).
GSO support is disabled by default.

GSO support may be toggled on a per-port basis, using the command:

"set port <port_id> gso on|off"

The maximum packet length (including the packet header and payload) for
GSO segments may be set with the command:

"set gso segsz <length>"

Show GSO configuration for a given port with the command:

"show port <port_id> gso"

Signed-off-by: Jiayu Hu <***@intel.com>
Signed-off-by: Mark Kavanagh <***@intel.com>
---
app/test-pmd/cmdline.c | 178 ++++++++++++++++++++++++++++++++++++++++++++++++
app/test-pmd/config.c | 24 +++++++
app/test-pmd/csumonly.c | 102 +++++++++++++++++++++++++--
app/test-pmd/testpmd.c | 16 +++++
app/test-pmd/testpmd.h | 10 +++
5 files changed, 326 insertions(+), 4 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index cd8c358..03b98a3 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -431,6 +431,17 @@ static void cmd_help_long_parsed(void *parsed_result,
" Set max flow number and max packet number per-flow"
" for GRO.\n\n"

+ "set port (port_id) gso (on|off)"
+ " Enable or disable Generic Segmentation Offload in"
+ " csum forwarding engine.\n\n"
+
+ "set gso segsz (length)\n"
+ " Set max packet length for output GSO segments,"
+ " including packet header and payload.\n\n"
+
+ "show port (port_id) gso\n"
+ " Show GSO configuration.\n\n"
+
"set fwd (%s)\n"
" Set packet forwarding mode.\n\n"

@@ -3963,6 +3974,170 @@ cmdline_parse_inst_t cmd_gro_set = {
},
};

+/* *** ENABLE/DISABLE GSO *** */
+struct cmd_gso_enable_result {
+ cmdline_fixed_string_t cmd_set;
+ cmdline_fixed_string_t cmd_port;
+ cmdline_fixed_string_t cmd_keyword;
+ cmdline_fixed_string_t cmd_mode;
+ uint8_t cmd_pid;
+};
+
+static void
+cmd_gso_enable_parsed(void *parsed_result,
+ __attribute__((unused)) struct cmdline *cl,
+ __attribute__((unused)) void *data)
+{
+ struct cmd_gso_enable_result *res;
+
+ res = parsed_result;
+ if (!strcmp(res->cmd_keyword, "gso"))
+ setup_gso(res->cmd_mode, res->cmd_pid);
+}
+
+cmdline_parse_token_string_t cmd_gso_enable_set =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_enable_result,
+ cmd_set, "set");
+cmdline_parse_token_string_t cmd_gso_enable_port =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_enable_result,
+ cmd_port, "port");
+cmdline_parse_token_string_t cmd_gso_enable_keyword =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_enable_result,
+ cmd_keyword, "gso");
+cmdline_parse_token_string_t cmd_gso_enable_mode =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_enable_result,
+ cmd_mode, "on#off");
+cmdline_parse_token_num_t cmd_gso_enable_pid =
+ TOKEN_NUM_INITIALIZER(struct cmd_gso_enable_result,
+ cmd_pid, UINT8);
+
+cmdline_parse_inst_t cmd_gso_enable = {
+ .f = cmd_gso_enable_parsed,
+ .data = NULL,
+ .help_str = "set port <port_id> gso on|off",
+ .tokens = {
+ (void *)&cmd_gso_enable_set,
+ (void *)&cmd_gso_enable_port,
+ (void *)&cmd_gso_enable_pid,
+ (void *)&cmd_gso_enable_keyword,
+ (void *)&cmd_gso_enable_mode,
+ NULL,
+ },
+};
+
+/* *** SET MAX PACKET LENGTH FOR GSO SEGMENTS *** */
+struct cmd_gso_size_result {
+ cmdline_fixed_string_t cmd_set;
+ cmdline_fixed_string_t cmd_keyword;
+ cmdline_fixed_string_t cmd_segsz;
+ uint16_t cmd_size;
+};
+
+static void
+cmd_gso_size_parsed(void *parsed_result,
+ __attribute__((unused)) struct cmdline *cl,
+ __attribute__((unused)) void *data)
+{
+ struct cmd_gso_size_result *res = parsed_result;
+
+ if (test_done == 0) {
+ printf("Before set GSO segsz, please stop fowarding first\n");
+ return;
+ }
+
+ if (!strcmp(res->cmd_keyword, "gso") &&
+ !strcmp(res->cmd_segsz, "segsz")) {
+ if (res->cmd_size == 0) {
+ printf("gso_size should be larger than 0."
+ " Please input a legal value\n");
+ } else
+ gso_max_segment_size = res->cmd_size;
+ }
+}
+
+cmdline_parse_token_string_t cmd_gso_size_set =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_size_result,
+ cmd_set, "set");
+cmdline_parse_token_string_t cmd_gso_size_keyword =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_size_result,
+ cmd_keyword, "gso");
+cmdline_parse_token_string_t cmd_gso_size_segsz =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_size_result,
+ cmd_segsz, "segsz");
+cmdline_parse_token_num_t cmd_gso_size_size =
+ TOKEN_NUM_INITIALIZER(struct cmd_gso_size_result,
+ cmd_size, UINT16);
+
+cmdline_parse_inst_t cmd_gso_size = {
+ .f = cmd_gso_size_parsed,
+ .data = NULL,
+ .help_str = "set gso segsz <length>",
+ .tokens = {
+ (void *)&cmd_gso_size_set,
+ (void *)&cmd_gso_size_keyword,
+ (void *)&cmd_gso_size_segsz,
+ (void *)&cmd_gso_size_size,
+ NULL,
+ },
+};
+
+/* *** SHOW GSO CONFIGURATION *** */
+struct cmd_gso_show_result {
+ cmdline_fixed_string_t cmd_show;
+ cmdline_fixed_string_t cmd_port;
+ cmdline_fixed_string_t cmd_keyword;
+ uint8_t cmd_pid;
+};
+
+static void
+cmd_gso_show_parsed(void *parsed_result,
+ __attribute__((unused)) struct cmdline *cl,
+ __attribute__((unused)) void *data)
+{
+ struct cmd_gso_show_result *res = parsed_result;
+
+ if (!rte_eth_dev_is_valid_port(res->cmd_pid)) {
+ printf("invalid port id %u\n", res->cmd_pid);
+ return;
+ }
+
+ if (!strcmp(res->cmd_keyword, "gso")) {
+ if (gso_ports[res->cmd_pid].enable) {
+ printf("Max GSO segment size: %uB\n"
+ "Support GSO protocols: TCP/IPv4,"
+ " VxlAN and GRE\n",
+ gso_max_segment_size);
+ } else
+ printf("Port %u doesn't enable GSO\n", res->cmd_pid);
+ }
+}
+
+cmdline_parse_token_string_t cmd_gso_show_show =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_show_result,
+ cmd_show, "show");
+cmdline_parse_token_string_t cmd_gso_show_port =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_show_result,
+ cmd_port, "port");
+cmdline_parse_token_string_t cmd_gso_show_keyword =
+ TOKEN_STRING_INITIALIZER(struct cmd_gso_show_result,
+ cmd_keyword, "gso");
+cmdline_parse_token_num_t cmd_gso_show_pid =
+ TOKEN_NUM_INITIALIZER(struct cmd_gso_show_result,
+ cmd_pid, UINT8);
+
+cmdline_parse_inst_t cmd_gso_show = {
+ .f = cmd_gso_show_parsed,
+ .data = NULL,
+ .help_str = "show port <port_id> gso",
+ .tokens = {
+ (void *)&cmd_gso_show_show,
+ (void *)&cmd_gso_show_port,
+ (void *)&cmd_gso_show_pid,
+ (void *)&cmd_gso_show_keyword,
+ NULL,
+ },
+};
+
/* *** ENABLE/DISABLE FLUSH ON RX STREAMS *** */
struct cmd_set_flush_rx {
cmdline_fixed_string_t set;
@@ -14251,6 +14426,9 @@ cmdline_parse_ctx_t main_ctx[] = {
(cmdline_parse_inst_t *)&cmd_tunnel_tso_show,
(cmdline_parse_inst_t *)&cmd_enable_gro,
(cmdline_parse_inst_t *)&cmd_gro_set,
+ (cmdline_parse_inst_t *)&cmd_gso_enable,
+ (cmdline_parse_inst_t *)&cmd_gso_size,
+ (cmdline_parse_inst_t *)&cmd_gso_show,
(cmdline_parse_inst_t *)&cmd_link_flow_control_set,
(cmdline_parse_inst_t *)&cmd_link_flow_control_set_rx,
(cmdline_parse_inst_t *)&cmd_link_flow_control_set_tx,
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 3ae3e1c..3434346 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -2454,6 +2454,30 @@ setup_gro(const char *mode, uint8_t port_id)
}
}

+void
+setup_gso(const char *mode, uint8_t port_id)
+{
+ if (!rte_eth_dev_is_valid_port(port_id)) {
+ printf("invalid port id %u\n", port_id);
+ return;
+ }
+ if (strcmp(mode, "on") == 0) {
+ if (test_done == 0) {
+ printf("before enable GSO,"
+ " please stop forwarding first\n");
+ return;
+ }
+ gso_ports[port_id].enable = 1;
+ } else if (strcmp(mode, "off") == 0) {
+ if (test_done == 0) {
+ printf("before disable GSO,"
+ " please stop forwarding first\n");
+ return;
+ }
+ gso_ports[port_id].enable = 0;
+ }
+}
+
char*
list_pkt_forwarding_modes(void)
{
diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
index 90c8119..8e9a8a1 100644
--- a/app/test-pmd/csumonly.c
+++ b/app/test-pmd/csumonly.c
@@ -66,10 +66,12 @@
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_sctp.h>
+#include <rte_net.h>
#include <rte_prefetch.h>
#include <rte_string_fns.h>
#include <rte_flow.h>
#include <rte_gro.h>
+#include <rte_gso.h>
#include "testpmd.h"

#define IP_DEFTTL 64 /* from RFC 1340. */
@@ -103,6 +105,7 @@ struct testpmd_offload_info {
uint16_t tso_segsz;
uint16_t tunnel_tso_segsz;
uint32_t pkt_len;
+ uint32_t packet_type;
};

/* simplified GRE header */
@@ -129,10 +132,25 @@ parse_ipv4(struct ipv4_hdr *ipv4_hdr, struct testpmd_offload_info *info)
info->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
info->l4_proto = ipv4_hdr->next_proto_id;

+ if (info->is_tunnel)
+ info->packet_type |= RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN;
+ else
+ info->packet_type |= RTE_PTYPE_L3_IPV4_EXT_UNKNOWN;
+
/* only fill l4_len for TCP, it's useful for TSO */
if (info->l4_proto == IPPROTO_TCP) {
tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + info->l3_len);
info->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
+ if (info->is_tunnel)
+ info->packet_type |= RTE_PTYPE_INNER_L4_TCP;
+ else
+ info->packet_type |= RTE_PTYPE_L4_TCP;
+ } else if (info->l4_proto == IPPROTO_UDP) {
+ if (info->is_tunnel)
+ info->packet_type |= RTE_PTYPE_INNER_L4_UDP;
+ else
+ info->packet_type |= RTE_PTYPE_L4_UDP;
+ info->l4_len = 0;
} else
info->l4_len = 0;
}
@@ -146,10 +164,25 @@ parse_ipv6(struct ipv6_hdr *ipv6_hdr, struct testpmd_offload_info *info)
info->l3_len = sizeof(struct ipv6_hdr);
info->l4_proto = ipv6_hdr->proto;

+ if (info->is_tunnel)
+ info->packet_type |= RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN;
+ else
+ info->packet_type |= RTE_PTYPE_L3_IPV6_EXT_UNKNOWN;
+
/* only fill l4_len for TCP, it's useful for TSO */
if (info->l4_proto == IPPROTO_TCP) {
tcp_hdr = (struct tcp_hdr *)((char *)ipv6_hdr + info->l3_len);
info->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
+ if (info->is_tunnel)
+ info->packet_type |= RTE_PTYPE_INNER_L4_TCP;
+ else
+ info->packet_type |= RTE_PTYPE_L4_TCP;
+ } else if (info->l4_proto == IPPROTO_UDP) {
+ if (info->is_tunnel)
+ info->packet_type |= RTE_PTYPE_INNER_L4_UDP;
+ else
+ info->packet_type |= RTE_PTYPE_L4_UDP;
+ info->l4_len = 0;
} else
info->l4_len = 0;
}
@@ -164,16 +197,26 @@ parse_ethernet(struct ether_hdr *eth_hdr, struct testpmd_offload_info *info)
{
struct ipv4_hdr *ipv4_hdr;
struct ipv6_hdr *ipv6_hdr;
+ uint32_t l2_type;

info->l2_len = sizeof(struct ether_hdr);
info->ethertype = eth_hdr->ether_type;
+ if (info->is_tunnel)
+ l2_type = RTE_PTYPE_INNER_L2_ETHER;
+ else
+ l2_type = RTE_PTYPE_L2_ETHER;

if (info->ethertype == _htons(ETHER_TYPE_VLAN)) {
struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);

info->l2_len += sizeof(struct vlan_hdr);
info->ethertype = vlan_hdr->eth_proto;
+ if (info->is_tunnel)
+ l2_type = RTE_PTYPE_INNER_L2_ETHER_VLAN;
+ else
+ l2_type = RTE_PTYPE_L2_ETHER_VLAN;
}
+ info->packet_type |= l2_type;

switch (info->ethertype) {
case _htons(ETHER_TYPE_IPv4):
@@ -212,6 +255,7 @@ parse_vxlan(struct udp_hdr *udp_hdr,
info->outer_l2_len = info->l2_len;
info->outer_l3_len = info->l3_len;
info->outer_l4_proto = info->l4_proto;
+ info->packet_type |= RTE_PTYPE_TUNNEL_VXLAN;

eth_hdr = (struct ether_hdr *)((char *)udp_hdr +
sizeof(struct udp_hdr) +
@@ -245,6 +289,7 @@ parse_gre(struct simple_gre_hdr *gre_hdr, struct testpmd_offload_info *info)
info->outer_l2_len = info->l2_len;
info->outer_l3_len = info->l3_len;
info->outer_l4_proto = info->l4_proto;
+ info->packet_type |= RTE_PTYPE_TUNNEL_GRE;

ipv4_hdr = (struct ipv4_hdr *)((char *)gre_hdr + gre_len);

@@ -258,6 +303,7 @@ parse_gre(struct simple_gre_hdr *gre_hdr, struct testpmd_offload_info *info)
info->outer_l2_len = info->l2_len;
info->outer_l3_len = info->l3_len;
info->outer_l4_proto = info->l4_proto;
+ info->packet_type |= RTE_PTYPE_TUNNEL_GRE;

ipv6_hdr = (struct i