Discussion:
[dpdk-dev] [PATCH v3 01/68] eal: move get_virtual_area out of linuxapp eal_memory.c
Anatoly Burakov
2018-04-03 23:21:13 UTC
Move get_virtual_area out of the linuxapp EAL memory code and make it
common to the EAL, so that other code can reserve virtual areas
as well.

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3: replace uint64_t with size_t for size variables

lib/librte_eal/common/eal_common_memory.c | 101 ++++++++++++++++++++++
lib/librte_eal/common/eal_private.h | 33 +++++++
lib/librte_eal/linuxapp/eal/eal_memory.c | 137 ++++++------------------------
3 files changed, 161 insertions(+), 110 deletions(-)
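
As a usage illustration (not part of the patch): a minimal sketch of how a
caller might reserve, and immediately release, a virtual address range with
the new helper. The wrapper name and sizes are illustrative; the flags are
the EAL_VIRTUAL_AREA_* values added to eal_private.h below.

    static int
    reserve_va_sketch(void)
    {
        size_t len = RTE_PGSIZE_1G;
        void *addr;

        /* ask for a 2M-aligned area, shrink the request if the full size
         * cannot be mapped, and unmap it again right away */
        addr = eal_get_virtual_area(NULL, &len, RTE_PGSIZE_2M,
                EAL_VIRTUAL_AREA_ALLOW_SHRINK |
                EAL_VIRTUAL_AREA_UNMAP, 0);
        if (addr == NULL)
            return -1; /* rte_errno holds the mmap() error */
        /* addr/len now describe the reserved (currently unmapped) range */
        return 0;
    }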

diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index 852f3bb..5b8ced4 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -2,10 +2,12 @@
* Copyright(c) 2010-2014 Intel Corporation
*/

+#include <errno.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdarg.h>
+#include <string.h>
#include <unistd.h>
#include <inttypes.h>
#include <sys/mman.h>
@@ -14,12 +16,111 @@
#include <rte_memory.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
+#include <rte_errno.h>
#include <rte_log.h>

#include "eal_private.h"
#include "eal_internal_cfg.h"

/*
+ * Try to mmap *size bytes of anonymous memory. If successful, return the
+ * pointer to the mmap'd area and keep *size unmodified. Otherwise, if
+ * shrinking is allowed, retry with a smaller zone: decrease *size by
+ * page_sz until it reaches 0, and return NULL in that case. Note: this
+ * function returns an address which is a multiple of the requested page size.
+ */
+
+static uint64_t baseaddr_offset;
+static uint64_t system_page_sz;
+
+void *
+eal_get_virtual_area(void *requested_addr, size_t *size,
+ size_t page_sz, int flags, int mmap_flags)
+{
+ bool addr_is_hint, allow_shrink, unmap, no_align;
+ uint64_t map_sz;
+ void *mapped_addr, *aligned_addr;
+
+ if (system_page_sz == 0)
+ system_page_sz = sysconf(_SC_PAGESIZE);
+
+ mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS;
+
+ RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);
+
+ addr_is_hint = (flags & EAL_VIRTUAL_AREA_ADDR_IS_HINT) > 0;
+ allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0;
+ unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0;
+
+ if (requested_addr == NULL && internal_config.base_virtaddr != 0) {
+ requested_addr = (void *) (internal_config.base_virtaddr +
+ (size_t)baseaddr_offset);
+ requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz);
+ addr_is_hint = true;
+ }
+
+ /* if requested address is not aligned by page size, or if requested
+ * address is NULL, add page size to requested length as we may get an
+ * address that's aligned by system page size, which can be smaller than
+ * our requested page size. additionally, we shouldn't try to align if
+ * system page size is the same as requested page size.
+ */
+ no_align = (requested_addr != NULL &&
+ ((uintptr_t)requested_addr & (page_sz - 1)) == 0) ||
+ page_sz == system_page_sz;
+
+ do {
+ map_sz = no_align ? *size : *size + page_sz;
+
+ mapped_addr = mmap(requested_addr, map_sz, PROT_READ,
+ mmap_flags, -1, 0);
+ if (mapped_addr == MAP_FAILED && allow_shrink)
+ *size -= page_sz;
+ } while (allow_shrink && mapped_addr == MAP_FAILED && *size > 0);
+
+ /* align resulting address - if map failed, we will ignore the value
+ * anyway, so no need to add additional checks.
+ */
+ aligned_addr = no_align ? mapped_addr :
+ RTE_PTR_ALIGN(mapped_addr, page_sz);
+
+ if (*size == 0) {
+ RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n",
+ strerror(errno));
+ rte_errno = errno;
+ return NULL;
+ } else if (mapped_addr == MAP_FAILED) {
+ RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
+ strerror(errno));
+ /* pass errno up the call chain */
+ rte_errno = errno;
+ return NULL;
+ } else if (requested_addr != NULL && !addr_is_hint &&
+ aligned_addr != requested_addr) {
+ RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n",
+ requested_addr, aligned_addr);
+ munmap(mapped_addr, map_sz);
+ rte_errno = EADDRNOTAVAIL;
+ return NULL;
+ } else if (requested_addr != NULL && addr_is_hint &&
+ aligned_addr != requested_addr) {
+ RTE_LOG(WARNING, EAL, "WARNING! Base virtual address hint (%p != %p) not respected!\n",
+ requested_addr, aligned_addr);
+ RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory into secondary processes\n");
+ }
+
+ if (unmap)
+ munmap(mapped_addr, map_sz);
+
+ RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n",
+ aligned_addr, *size);
+
+ baseaddr_offset += *size;
+
+ return aligned_addr;
+}
+
+/*
* Return a pointer to a read-only table of struct rte_physmem_desc
* elements, containing the layout of all addressable physical
* memory. The last element of the table contains a NULL address.
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index 0b28770..3fed436 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -127,6 +127,39 @@ int rte_eal_alarm_init(void);
int rte_eal_check_module(const char *module_name);

/**
+ * Get virtual area of specified size from the OS.
+ *
+ * This function is private to the EAL.
+ *
+ * @param requested_addr
+ * Address where to request address space.
+ * @param size
+ * Size of requested area.
+ * @param page_sz
+ * Page size on which to align requested virtual area.
+ * @param flags
+ * EAL_VIRTUAL_AREA_* flags.
+ * @param mmap_flags
+ * Extra flags passed directly to mmap().
+ *
+ * @return
+ * Virtual area address if successful.
+ * NULL if unsuccessful.
+ */
+
+#define EAL_VIRTUAL_AREA_ADDR_IS_HINT (1 << 0)
+/**< don't fail if cannot get exact requested address. */
+#define EAL_VIRTUAL_AREA_ALLOW_SHRINK (1 << 1)
+/**< try getting smaller sized (decrement by page size) virtual areas if cannot
+ * get area of requested size.
+ */
+#define EAL_VIRTUAL_AREA_UNMAP (1 << 2)
+/**< immediately unmap reserved virtual area. */
+void *
+eal_get_virtual_area(void *requested_addr, size_t *size,
+ size_t page_sz, int flags, int mmap_flags);
+
+/**
* Get cpu core_id.
*
* This function is private to the EAL.
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index ecf375b..5642cc8 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -28,6 +28,7 @@
#include <numaif.h>
#endif

+#include <rte_errno.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_launch.h>
@@ -57,8 +58,6 @@
* zone as well as a physical contiguous zone.
*/

-static uint64_t baseaddr_offset;
-
static bool phys_addrs_available = true;

#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
@@ -221,82 +220,6 @@ aslr_enabled(void)
}
}

-/*
- * Try to mmap *size bytes in /dev/zero. If it is successful, return the
- * pointer to the mmap'd area and keep *size unmodified. Else, retry
- * with a smaller zone: decrease *size by hugepage_sz until it reaches
- * 0. In this case, return NULL. Note: this function returns an address
- * which is a multiple of hugepage size.
- */
-static void *
-get_virtual_area(size_t *size, size_t hugepage_sz)
-{
- void *addr;
- void *addr_hint;
- int fd;
- long aligned_addr;
-
- if (internal_config.base_virtaddr != 0) {
- int page_size = sysconf(_SC_PAGE_SIZE);
- addr_hint = (void *) (uintptr_t)
- (internal_config.base_virtaddr + baseaddr_offset);
- addr_hint = RTE_PTR_ALIGN_FLOOR(addr_hint, page_size);
- } else {
- addr_hint = NULL;
- }
-
- RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);
-
-
- fd = open("/dev/zero", O_RDONLY);
- if (fd < 0){
- RTE_LOG(ERR, EAL, "Cannot open /dev/zero\n");
- return NULL;
- }
- do {
- addr = mmap(addr_hint, (*size) + hugepage_sz, PROT_READ,
-#ifdef RTE_ARCH_PPC_64
- MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-#else
- MAP_PRIVATE,
-#endif
- fd, 0);
- if (addr == MAP_FAILED) {
- *size -= hugepage_sz;
- } else if (addr_hint != NULL && addr != addr_hint) {
- RTE_LOG(WARNING, EAL, "WARNING! Base virtual address "
- "hint (%p != %p) not respected!\n",
- addr_hint, addr);
- RTE_LOG(WARNING, EAL, " This may cause issues with "
- "mapping memory into secondary processes\n");
- }
- } while (addr == MAP_FAILED && *size > 0);
-
- if (addr == MAP_FAILED) {
- close(fd);
- RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
- strerror(errno));
- return NULL;
- }
-
- munmap(addr, (*size) + hugepage_sz);
- close(fd);
-
- /* align addr to a huge page size boundary */
- aligned_addr = (long)addr;
- aligned_addr += (hugepage_sz - 1);
- aligned_addr &= (~(hugepage_sz - 1));
- addr = (void *)(aligned_addr);
-
- RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n",
- addr, *size);
-
- /* increment offset */
- baseaddr_offset += *size;
-
- return addr;
-}
-
static sigjmp_buf huge_jmpenv;

static void huge_sigbus_handler(int signo __rte_unused)
@@ -445,7 +368,16 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
/* get the biggest virtual memory area up to
* vma_len. If it fails, vma_addr is NULL, so
* let the kernel provide the address. */
- vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz);
+ vma_addr = eal_get_virtual_area(NULL, &vma_len,
+ hpi->hugepage_sz,
+ EAL_VIRTUAL_AREA_ALLOW_SHRINK |
+ EAL_VIRTUAL_AREA_UNMAP,
+#ifdef RTE_ARCH_PPC_64
+ MAP_HUGETLB
+#else
+ 0
+#endif
+ );
if (vma_addr == NULL)
vma_len = hugepage_sz;
}
@@ -1343,7 +1275,7 @@ rte_eal_hugepage_attach(void)
unsigned i, s = 0; /* s used to track the segment number */
unsigned max_seg = RTE_MAX_MEMSEG;
off_t size = 0;
- int fd, fd_zero = -1, fd_hugepage = -1;
+ int fd, fd_hugepage = -1;

if (aslr_enabled() > 0) {
RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization "
@@ -1354,11 +1286,6 @@ rte_eal_hugepage_attach(void)

test_phys_addrs_available();

- fd_zero = open("/dev/zero", O_RDONLY);
- if (fd_zero < 0) {
- RTE_LOG(ERR, EAL, "Could not open /dev/zero\n");
- goto error;
- }
fd_hugepage = open(eal_hugepage_info_path(), O_RDONLY);
if (fd_hugepage < 0) {
RTE_LOG(ERR, EAL, "Could not open %s\n", eal_hugepage_info_path());
@@ -1368,6 +1295,8 @@ rte_eal_hugepage_attach(void)
/* map all segments into memory to make sure we get the addrs */
for (s = 0; s < RTE_MAX_MEMSEG; ++s) {
void *base_addr;
+ size_t mmap_sz;
+ int mmap_flags = 0;

/*
* the first memory segment with len==0 is the one that
@@ -1376,35 +1305,26 @@ rte_eal_hugepage_attach(void)
if (mcfg->memseg[s].len == 0)
break;

- /*
- * fdzero is mmapped to get a contiguous block of virtual
- * addresses of the appropriate memseg size.
- * use mmap to get identical addresses as the primary process.
+ /* get identical addresses as the primary process.
*/
- base_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len,
- PROT_READ,
#ifdef RTE_ARCH_PPC_64
- MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-#else
- MAP_PRIVATE,
+ mmap_flags |= MAP_HUGETLB;
#endif
- fd_zero, 0);
- if (base_addr == MAP_FAILED ||
- base_addr != mcfg->memseg[s].addr) {
+ mmap_sz = mcfg->memseg[s].len;
+ base_addr = eal_get_virtual_area(mcfg->memseg[s].addr,
+ &mmap_sz, mcfg->memseg[s].hugepage_sz, 0,
+ mmap_flags);
+ if (base_addr == NULL) {
max_seg = s;
- if (base_addr != MAP_FAILED) {
- /* errno is stale, don't use */
- RTE_LOG(ERR, EAL, "Could not mmap %llu bytes "
- "in /dev/zero at [%p], got [%p] - "
- "please use '--base-virtaddr' option\n",
+ if (rte_errno == EADDRNOTAVAIL) {
+ RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n",
(unsigned long long)mcfg->memseg[s].len,
- mcfg->memseg[s].addr, base_addr);
- munmap(base_addr, mcfg->memseg[s].len);
+ mcfg->memseg[s].addr);
} else {
- RTE_LOG(ERR, EAL, "Could not mmap %llu bytes "
- "in /dev/zero at [%p]: '%s'\n",
+ RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p]: '%s'\n",
(unsigned long long)mcfg->memseg[s].len,
- mcfg->memseg[s].addr, strerror(errno));
+ mcfg->memseg[s].addr,
+ rte_strerror(rte_errno));
}
if (aslr_enabled() > 0) {
RTE_LOG(ERR, EAL, "It is recommended to "
@@ -1469,7 +1389,6 @@ rte_eal_hugepage_attach(void)
}
/* unmap the hugepage config file, since we are done using it */
munmap(hp, size);
- close(fd_zero);
close(fd_hugepage);
return 0;

@@ -1478,8 +1397,6 @@ rte_eal_hugepage_attach(void)
munmap(mcfg->memseg[i].addr, mcfg->memseg[i].len);
if (hp != NULL && hp != MAP_FAILED)
munmap(hp, size);
- if (fd_zero >= 0)
- close(fd_zero);
if (fd_hugepage >= 0)
close(fd_hugepage);
return -1;
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:24 UTC
Switch rte_eth_dma_zone_reserve() to the new IOVA-contiguous reservation
API. This fixes the following drivers in one go:

grep -Rl rte_eth_dma_zone_reserve drivers/

drivers/net/avf/avf_rxtx.c
drivers/net/thunderx/nicvf_ethdev.c
drivers/net/e1000/igb_rxtx.c
drivers/net/e1000/em_rxtx.c
drivers/net/fm10k/fm10k_ethdev.c
drivers/net/vmxnet3/vmxnet3_rxtx.c
drivers/net/liquidio/lio_rxtx.c
drivers/net/i40e/i40e_rxtx.c
drivers/net/sfc/sfc.c
drivers/net/ixgbe/ixgbe_rxtx.c
drivers/net/nfp/nfp_net.c

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3: moved this patch earlier in the patchset

lib/librte_ether/rte_ethdev.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
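
For context, a hedged sketch of the call path this touches: drivers reserve
their HW descriptor rings through rte_eth_dma_zone_reserve(), so switching it
to the contiguous variant keeps those rings IOVA-contiguous. The ring name,
size and identifiers below stand in for the driver's own state.

    const struct rte_memzone *mz;

    /* typical driver-side call; the memory backing the ring must be
     * IOVA-contiguous for hardware DMA */
    mz = rte_eth_dma_zone_reserve(dev, "rx_ring", queue_id,
            ring_size, RTE_CACHE_LINE_SIZE, socket_id);
    if (mz == NULL)
        return -ENOMEM;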

diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 2c74f7e..10cfa20 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -3403,7 +3403,8 @@ rte_eth_dma_zone_reserve(const struct rte_eth_dev *dev, const char *ring_name,
if (mz)
return mz;

- return rte_memzone_reserve_aligned(z_name, size, socket_id, 0, align);
+ return rte_memzone_reserve_aligned_contig(z_name, size, socket_id, 0,
+ align);
}

int
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:21 UTC
Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/rte_malloc.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index f11a822..2cda48e 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -30,7 +30,7 @@ void rte_free(void *addr)
{
if (addr == NULL) return;
if (malloc_heap_free(malloc_elem_from_data(addr)) < 0)
- rte_panic("Fatal error: Invalid memory\n");
+ RTE_LOG(ERR, EAL, "Error: Invalid memory\n");
}

/*
@@ -134,8 +134,10 @@ rte_realloc(void *ptr, size_t size, unsigned align)
return rte_malloc(NULL, size, align);

struct malloc_elem *elem = malloc_elem_from_data(ptr);
- if (elem == NULL)
- rte_panic("Fatal error: memory corruption detected\n");
+ if (elem == NULL) {
+ RTE_LOG(ERR, EAL, "Error: memory corruption detected\n");
+ return NULL;
+ }

size = RTE_CACHE_LINE_ROUNDUP(size), align = RTE_CACHE_LINE_ROUNDUP(align);
/* check alignment matches first, and if ok, see if we can resize block */
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:18 UTC
We need this function to join newly allocated segments with the heap.

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/malloc_elem.c | 6 +++---
lib/librte_eal/common/malloc_elem.h | 3 +++
2 files changed, 6 insertions(+), 3 deletions(-)
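
As a hedged sketch of the intended use (not part of this patch): when a new
memory segment is later added to a heap, the now-public function lets it merge
with any free neighbour already in the list. Only the malloc_elem_* calls are
existing functions; the wrapper itself is illustrative.

    static void
    heap_add_memory_sketch(struct malloc_heap *heap, struct rte_memseg *ms)
    {
        struct malloc_elem *elem = ms->addr;

        malloc_elem_init(elem, heap, ms, ms->len - MALLOC_ELEM_OVERHEAD);
        malloc_elem_insert(elem);
        /* join with adjacent free elements, then put on a free list */
        elem = malloc_elem_join_adjacent_free(elem);
        malloc_elem_free_list_insert(elem);
    }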

diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index e02ed88..2291ee1 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -325,8 +325,8 @@ join_elem(struct malloc_elem *elem1, struct malloc_elem *elem2)
elem1->next = next;
}

-static struct malloc_elem *
-elem_join_adjacent_free(struct malloc_elem *elem)
+struct malloc_elem *
+malloc_elem_join_adjacent_free(struct malloc_elem *elem)
{
/*
* check if next element exists, is adjacent and is free, if so join
@@ -388,7 +388,7 @@ malloc_elem_free(struct malloc_elem *elem)
ptr = RTE_PTR_ADD(elem, sizeof(*elem));
data_len = elem->size - MALLOC_ELEM_OVERHEAD;

- elem = elem_join_adjacent_free(elem);
+ elem = malloc_elem_join_adjacent_free(elem);

malloc_elem_free_list_insert(elem);

diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 40e8eb5..99921d2 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -141,6 +141,9 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size,
int
malloc_elem_free(struct malloc_elem *elem);

+struct malloc_elem *
+malloc_elem_join_adjacent_free(struct malloc_elem *elem);
+
/*
* attempt to resize a malloc_elem by expanding into any free space
* immediately after it in memory.
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:19 UTC
Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/malloc_elem.c | 12 ++++++------
lib/librte_eal/common/malloc_elem.h | 3 +++
2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 2291ee1..008f5a3 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -245,8 +245,8 @@ malloc_elem_free_list_insert(struct malloc_elem *elem)
/*
* Remove the specified element from its heap's free list.
*/
-static void
-elem_free_list_remove(struct malloc_elem *elem)
+void
+malloc_elem_free_list_remove(struct malloc_elem *elem)
{
LIST_REMOVE(elem, free_list);
}
@@ -266,7 +266,7 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align,
const size_t trailer_size = elem->size - old_elem_size - size -
MALLOC_ELEM_OVERHEAD;

- elem_free_list_remove(elem);
+ malloc_elem_free_list_remove(elem);

if (trailer_size > MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
/* split it, too much free space after elem */
@@ -340,7 +340,7 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem)
erase = RTE_PTR_SUB(elem->next, MALLOC_ELEM_TRAILER_LEN);

/* remove from free list, join to this one */
- elem_free_list_remove(elem->next);
+ malloc_elem_free_list_remove(elem->next);
join_elem(elem, elem->next);

/* erase header and trailer */
@@ -360,7 +360,7 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem)
erase = RTE_PTR_SUB(elem, MALLOC_ELEM_TRAILER_LEN);

/* remove from free list, join to this one */
- elem_free_list_remove(elem->prev);
+ malloc_elem_free_list_remove(elem->prev);

new_elem = elem->prev;
join_elem(new_elem, elem);
@@ -423,7 +423,7 @@ malloc_elem_resize(struct malloc_elem *elem, size_t size)
/* we now know the element fits, so remove from free list,
* join the two
*/
- elem_free_list_remove(elem->next);
+ malloc_elem_free_list_remove(elem->next);
join_elem(elem, elem->next);

if (elem->size - new_size >= MIN_DATA_SIZE + MALLOC_ELEM_OVERHEAD) {
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 99921d2..46e2383 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -151,6 +151,9 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem);
int
malloc_elem_resize(struct malloc_elem *elem, size_t size);

+void
+malloc_elem_free_list_remove(struct malloc_elem *elem);
+
/*
* dump contents of malloc elem to a file.
*/
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:16 UTC
The malloc heap is now a doubly linked list, so it is possible to
iterate over each malloc element regardless of its state.

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3: mark function as experimental

lib/librte_eal/common/include/rte_malloc.h | 10 ++++++++++
lib/librte_eal/common/malloc_elem.c | 24 ++++++++++++++++++++++++
lib/librte_eal/common/malloc_elem.h | 6 ++++++
lib/librte_eal/common/malloc_heap.c | 22 ++++++++++++++++++++++
lib/librte_eal/common/malloc_heap.h | 3 +++
lib/librte_eal/common/rte_malloc.c | 17 +++++++++++++++++
lib/librte_eal/rte_eal_version.map | 1 +
7 files changed, 83 insertions(+)
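
A minimal usage sketch for the new experimental API (assuming an application
that has already called rte_eal_init()):

    #include <stdio.h>
    #include <rte_malloc.h>

    static void
    dump_all_heaps(void)
    {
        /* walks every socket's heap and prints each element
         * (FREE/BUSY/PAD) to the given stream */
        rte_malloc_dump_heaps(stdout);
    }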

diff --git a/lib/librte_eal/common/include/rte_malloc.h b/lib/librte_eal/common/include/rte_malloc.h
index f02a8ba..a9fb7e4 100644
--- a/lib/librte_eal/common/include/rte_malloc.h
+++ b/lib/librte_eal/common/include/rte_malloc.h
@@ -13,6 +13,7 @@

#include <stdio.h>
#include <stddef.h>
+#include <rte_compat.h>
#include <rte_memory.h>

#ifdef __cplusplus
@@ -278,6 +279,15 @@ void
rte_malloc_dump_stats(FILE *f, const char *type);

/**
+ * Dump contents of all malloc heaps to a file.
+ *
+ * @param f
+ * A pointer to a file for output
+ */
+void __rte_experimental
+rte_malloc_dump_heaps(FILE *f);
+
+/**
* Set the maximum amount of allocated memory for this type.
*
* This is not yet implemented
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index eb41200..e02ed88 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2010-2014 Intel Corporation
*/
+#include <inttypes.h>
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
@@ -434,3 +435,26 @@ malloc_elem_resize(struct malloc_elem *elem, size_t size)
}
return 0;
}
+
+static inline const char *
+elem_state_to_str(enum elem_state state)
+{
+ switch (state) {
+ case ELEM_PAD:
+ return "PAD";
+ case ELEM_BUSY:
+ return "BUSY";
+ case ELEM_FREE:
+ return "FREE";
+ }
+ return "ERROR";
+}
+
+void
+malloc_elem_dump(const struct malloc_elem *elem, FILE *f)
+{
+ fprintf(f, "Malloc element at %p (%s)\n", elem,
+ elem_state_to_str(elem->state));
+ fprintf(f, " len: 0x%zx pad: 0x%" PRIx32 "\n", elem->size, elem->pad);
+ fprintf(f, " prev: %p next: %p\n", elem->prev, elem->next);
+}
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 238e451..40e8eb5 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -149,6 +149,12 @@ int
malloc_elem_resize(struct malloc_elem *elem, size_t size);

/*
+ * dump contents of malloc elem to a file.
+ */
+void
+malloc_elem_dump(const struct malloc_elem *elem, FILE *f);
+
+/*
* Given an element size, compute its freelist index.
*/
size_t
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 9c95166..44538d7 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -217,6 +217,28 @@ malloc_heap_get_stats(struct malloc_heap *heap,
return 0;
}

+/*
+ * Function to retrieve data for heap on given socket
+ */
+void
+malloc_heap_dump(struct malloc_heap *heap, FILE *f)
+{
+ struct malloc_elem *elem;
+
+ rte_spinlock_lock(&heap->lock);
+
+ fprintf(f, "Heap size: 0x%zx\n", heap->total_size);
+ fprintf(f, "Heap alloc count: %u\n", heap->alloc_count);
+
+ elem = heap->first;
+ while (elem) {
+ malloc_elem_dump(elem, f);
+ elem = elem->next;
+ }
+
+ rte_spinlock_unlock(&heap->lock);
+}
+
int
rte_eal_malloc_heap_init(void)
{
diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h
index ab0005c..bb28422 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -37,6 +37,9 @@ int
malloc_heap_get_stats(struct malloc_heap *heap,
struct rte_malloc_socket_stats *socket_stats);

+void
+malloc_heap_dump(struct malloc_heap *heap, FILE *f);
+
int
rte_eal_malloc_heap_init(void);

diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index 970813e..f11a822 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -182,6 +182,23 @@ rte_malloc_get_socket_stats(int socket,
}

/*
+ * Function to dump contents of all heaps
+ */
+void __rte_experimental
+rte_malloc_dump_heaps(FILE *f)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ unsigned int idx;
+
+ for (idx = 0; idx < rte_socket_count(); idx++) {
+ unsigned int socket = rte_socket_id_by_idx(idx);
+ fprintf(f, "Heap on socket %i:\n", socket);
+ malloc_heap_dump(&mcfg->malloc_heaps[socket], f);
+ }
+
+}
+
+/*
* Print stats on memory type. If type is NULL, info on all types is printed
*/
void
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index dd38783..d9fc458 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -222,6 +222,7 @@ EXPERIMENTAL {
rte_eal_hotplug_remove;
rte_eal_mbuf_user_pool_ops;
rte_log_register_type_and_pick_level;
+ rte_malloc_dump_heaps;
rte_mp_action_register;
rte_mp_action_unregister;
rte_mp_reply;
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:14 UTC
Down the line, we will need to do everything from the heap, as any
alloc or free may trigger allocating or freeing OS memory, which would
involve growing or shrinking the heap.

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/malloc_elem.c | 16 ++--------------
lib/librte_eal/common/malloc_heap.c | 38 +++++++++++++++++++++++++++++++++++++
lib/librte_eal/common/malloc_heap.h | 6 ++++++
lib/librte_eal/common/rte_malloc.c | 4 ++--
4 files changed, 48 insertions(+), 16 deletions(-)
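
In short, element-level operations stop taking the heap lock themselves, and
heap-level wrappers take it instead. A condensed sketch of that pattern
(mirroring the malloc_heap_free() added below; the wrapper name here is
illustrative):

    static int
    heap_locked_free_sketch(struct malloc_elem *elem)
    {
        /* keep the heap pointer: freeing may merge elem into a neighbour */
        struct malloc_heap *heap = elem->heap;
        int ret;

        rte_spinlock_lock(&heap->lock);
        ret = malloc_elem_free(elem); /* element code no longer locks */
        rte_spinlock_unlock(&heap->lock);

        return ret;
    }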

diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 0cadc8a..ea041e2 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -243,10 +243,6 @@ join_elem(struct malloc_elem *elem1, struct malloc_elem *elem2)
int
malloc_elem_free(struct malloc_elem *elem)
{
- if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
- return -1;
-
- rte_spinlock_lock(&(elem->heap->lock));
size_t sz = elem->size - sizeof(*elem) - MALLOC_ELEM_TRAILER_LEN;
uint8_t *ptr = (uint8_t *)&elem[1];
struct malloc_elem *next = RTE_PTR_ADD(elem, elem->size);
@@ -274,8 +270,6 @@ malloc_elem_free(struct malloc_elem *elem)

memset(ptr, 0, sz);

- rte_spinlock_unlock(&(elem->heap->lock));
-
return 0;
}

@@ -292,11 +286,10 @@ malloc_elem_resize(struct malloc_elem *elem, size_t size)
return 0;

struct malloc_elem *next = RTE_PTR_ADD(elem, elem->size);
- rte_spinlock_lock(&elem->heap->lock);
if (next ->state != ELEM_FREE)
- goto err_return;
+ return -1;
if (elem->size + next->size < new_size)
- goto err_return;
+ return -1;

/* we now know the element fits, so remove from free list,
* join the two
@@ -311,10 +304,5 @@ malloc_elem_resize(struct malloc_elem *elem, size_t size)
split_elem(elem, split_pt);
malloc_elem_free_list_insert(split_pt);
}
- rte_spinlock_unlock(&elem->heap->lock);
return 0;
-
-err_return:
- rte_spinlock_unlock(&elem->heap->lock);
- return -1;
}
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 7aafc88..7d8d70a 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -145,6 +145,44 @@ malloc_heap_alloc(struct malloc_heap *heap,
return elem == NULL ? NULL : (void *)(&elem[1]);
}

+int
+malloc_heap_free(struct malloc_elem *elem)
+{
+ struct malloc_heap *heap;
+ int ret;
+
+ if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
+ return -1;
+
+ /* elem may be merged with previous element, so keep heap address */
+ heap = elem->heap;
+
+ rte_spinlock_lock(&(heap->lock));
+
+ ret = malloc_elem_free(elem);
+
+ rte_spinlock_unlock(&(heap->lock));
+
+ return ret;
+}
+
+int
+malloc_heap_resize(struct malloc_elem *elem, size_t size)
+{
+ int ret;
+
+ if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
+ return -1;
+
+ rte_spinlock_lock(&(elem->heap->lock));
+
+ ret = malloc_elem_resize(elem, size);
+
+ rte_spinlock_unlock(&(elem->heap->lock));
+
+ return ret;
+}
+
/*
* Function to retrieve data for heap on given socket
*/
diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h
index e0defa7..ab0005c 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -28,6 +28,12 @@ malloc_heap_alloc(struct malloc_heap *heap, const char *type, size_t size,
unsigned flags, size_t align, size_t bound);

int
+malloc_heap_free(struct malloc_elem *elem);
+
+int
+malloc_heap_resize(struct malloc_elem *elem, size_t size);
+
+int
malloc_heap_get_stats(struct malloc_heap *heap,
struct rte_malloc_socket_stats *socket_stats);

diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index e0e0d0b..970813e 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -29,7 +29,7 @@
void rte_free(void *addr)
{
if (addr == NULL) return;
- if (malloc_elem_free(malloc_elem_from_data(addr)) < 0)
+ if (malloc_heap_free(malloc_elem_from_data(addr)) < 0)
rte_panic("Fatal error: Invalid memory\n");
}

@@ -140,7 +140,7 @@ rte_realloc(void *ptr, size_t size, unsigned align)
size = RTE_CACHE_LINE_ROUNDUP(size), align = RTE_CACHE_LINE_ROUNDUP(align);
/* check alignment matches first, and if ok, see if we can resize block */
if (RTE_PTR_ALIGN(ptr,align) == ptr &&
- malloc_elem_resize(elem, size) == 0)
+ malloc_heap_resize(elem, size) == 0)
return ptr;

/* either alignment is off, or we have no room to expand,
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:15 UTC
As we are preparing for dynamic memory allocation, we need to be
able to handle holes in our malloc heap, hence we're switching to
a doubly linked list and preparing the infrastructure to support it.

Since our heap is now aware of where its first and last elements are,
there is no longer any need to have a dummy element at the end of
each heap, so get rid of that as well. Instead, let insert/remove/
join/split operations handle end-of-list conditions automatically.

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/include/rte_malloc_heap.h | 6 +
lib/librte_eal/common/malloc_elem.c | 200 +++++++++++++++++++-----
lib/librte_eal/common/malloc_elem.h | 14 +-
lib/librte_eal/common/malloc_heap.c | 8 +-
4 files changed, 179 insertions(+), 49 deletions(-)
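
One consequence worth spelling out: once holes are allowed, being a list
neighbour no longer implies being a physical neighbour. A condensed sketch of
the adjacency check this patch introduces (mirroring next_elem_is_adjacent()
below):

    /* the next list element is physically adjacent only if it starts
     * exactly where this element ends */
    static int
    is_next_adjacent_sketch(const struct malloc_elem *elem)
    {
        return elem->next == RTE_PTR_ADD(elem, elem->size);
    }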

diff --git a/lib/librte_eal/common/include/rte_malloc_heap.h b/lib/librte_eal/common/include/rte_malloc_heap.h
index ba99ed9..d43fa90 100644
--- a/lib/librte_eal/common/include/rte_malloc_heap.h
+++ b/lib/librte_eal/common/include/rte_malloc_heap.h
@@ -13,12 +13,18 @@
/* Number of free lists per heap, grouped by size. */
#define RTE_HEAP_NUM_FREELISTS 13

+/* dummy definition, for pointers */
+struct malloc_elem;
+
/**
* Structure to hold malloc heap
*/
struct malloc_heap {
rte_spinlock_t lock;
LIST_HEAD(, malloc_elem) free_head[RTE_HEAP_NUM_FREELISTS];
+ struct malloc_elem *volatile first;
+ struct malloc_elem *volatile last;
+
unsigned alloc_count;
size_t total_size;
} __rte_cache_aligned;
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index ea041e2..eb41200 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -31,6 +31,7 @@ malloc_elem_init(struct malloc_elem *elem,
elem->heap = heap;
elem->ms = ms;
elem->prev = NULL;
+ elem->next = NULL;
memset(&elem->free_list, 0, sizeof(elem->free_list));
elem->state = ELEM_FREE;
elem->size = size;
@@ -39,15 +40,56 @@ malloc_elem_init(struct malloc_elem *elem,
set_trailer(elem);
}

-/*
- * Initialize a dummy malloc_elem header for the end-of-memseg marker
- */
void
-malloc_elem_mkend(struct malloc_elem *elem, struct malloc_elem *prev)
+malloc_elem_insert(struct malloc_elem *elem)
{
- malloc_elem_init(elem, prev->heap, prev->ms, 0);
- elem->prev = prev;
- elem->state = ELEM_BUSY; /* mark busy so its never merged */
+ struct malloc_elem *prev_elem, *next_elem;
+ struct malloc_heap *heap = elem->heap;
+
+ if (heap->first == NULL && heap->last == NULL) {
+ /* if empty heap */
+ heap->first = elem;
+ heap->last = elem;
+ prev_elem = NULL;
+ next_elem = NULL;
+ } else if (elem < heap->first) {
+ /* if lower than start */
+ prev_elem = NULL;
+ next_elem = heap->first;
+ heap->first = elem;
+ } else if (elem > heap->last) {
+ /* if higher than end */
+ prev_elem = heap->last;
+ next_elem = NULL;
+ heap->last = elem;
+ } else {
+ /* the new memory is somewhere inbetween start and end */
+ uint64_t dist_from_start, dist_from_end;
+
+ dist_from_end = RTE_PTR_DIFF(heap->last, elem);
+ dist_from_start = RTE_PTR_DIFF(elem, heap->first);
+
+ /* check which is closer, and find closest list entries */
+ if (dist_from_start < dist_from_end) {
+ prev_elem = heap->first;
+ while (prev_elem->next < elem)
+ prev_elem = prev_elem->next;
+ next_elem = prev_elem->next;
+ } else {
+ next_elem = heap->last;
+ while (next_elem->prev > elem)
+ next_elem = next_elem->prev;
+ prev_elem = next_elem->prev;
+ }
+ }
+
+ /* insert new element */
+ elem->prev = prev_elem;
+ elem->next = next_elem;
+ if (prev_elem)
+ prev_elem->next = elem;
+ if (next_elem)
+ next_elem->prev = elem;
}

/*
@@ -98,18 +140,58 @@ malloc_elem_can_hold(struct malloc_elem *elem, size_t size, unsigned align,
static void
split_elem(struct malloc_elem *elem, struct malloc_elem *split_pt)
{
- struct malloc_elem *next_elem = RTE_PTR_ADD(elem, elem->size);
+ struct malloc_elem *next_elem = elem->next;
const size_t old_elem_size = (uintptr_t)split_pt - (uintptr_t)elem;
const size_t new_elem_size = elem->size - old_elem_size;

malloc_elem_init(split_pt, elem->heap, elem->ms, new_elem_size);
split_pt->prev = elem;
- next_elem->prev = split_pt;
+ split_pt->next = next_elem;
+ if (next_elem)
+ next_elem->prev = split_pt;
+ else
+ elem->heap->last = split_pt;
+ elem->next = split_pt;
elem->size = old_elem_size;
set_trailer(elem);
}

/*
+ * our malloc heap is a doubly linked list, so doubly remove our element.
+ */
+static void __rte_unused
+remove_elem(struct malloc_elem *elem)
+{
+ struct malloc_elem *next, *prev;
+ next = elem->next;
+ prev = elem->prev;
+
+ if (next)
+ next->prev = prev;
+ else
+ elem->heap->last = prev;
+ if (prev)
+ prev->next = next;
+ else
+ elem->heap->first = next;
+
+ elem->prev = NULL;
+ elem->next = NULL;
+}
+
+static int
+next_elem_is_adjacent(struct malloc_elem *elem)
+{
+ return elem->next == RTE_PTR_ADD(elem, elem->size);
+}
+
+static int
+prev_elem_is_adjacent(struct malloc_elem *elem)
+{
+ return elem == RTE_PTR_ADD(elem->prev, elem->prev->size);
+}
+
+/*
* Given an element size, compute its freelist index.
* We free an element into the freelist containing similarly-sized elements.
* We try to allocate elements starting with the freelist containing
@@ -192,6 +274,9 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align,

split_elem(elem, new_free_elem);
malloc_elem_free_list_insert(new_free_elem);
+
+ if (elem == elem->heap->last)
+ elem->heap->last = new_free_elem;
}

if (old_elem_size < MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
@@ -230,9 +315,62 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align,
static inline void
join_elem(struct malloc_elem *elem1, struct malloc_elem *elem2)
{
- struct malloc_elem *next = RTE_PTR_ADD(elem2, elem2->size);
+ struct malloc_elem *next = elem2->next;
elem1->size += elem2->size;
- next->prev = elem1;
+ if (next)
+ next->prev = elem1;
+ else
+ elem1->heap->last = elem1;
+ elem1->next = next;
+}
+
+static struct malloc_elem *
+elem_join_adjacent_free(struct malloc_elem *elem)
+{
+ /*
+ * check if next element exists, is adjacent and is free, if so join
+ * with it, need to remove from free list.
+ */
+ if (elem->next != NULL && elem->next->state == ELEM_FREE &&
+ next_elem_is_adjacent(elem)) {
+ void *erase;
+
+ /* we will want to erase the trailer and header */
+ erase = RTE_PTR_SUB(elem->next, MALLOC_ELEM_TRAILER_LEN);
+
+ /* remove from free list, join to this one */
+ elem_free_list_remove(elem->next);
+ join_elem(elem, elem->next);
+
+ /* erase header and trailer */
+ memset(erase, 0, MALLOC_ELEM_OVERHEAD);
+ }
+
+ /*
+ * check if prev element exists, is adjacent and is free, if so join
+ * with it, need to remove from free list.
+ */
+ if (elem->prev != NULL && elem->prev->state == ELEM_FREE &&
+ prev_elem_is_adjacent(elem)) {
+ struct malloc_elem *new_elem;
+ void *erase;
+
+ /* we will want to erase trailer and header */
+ erase = RTE_PTR_SUB(elem, MALLOC_ELEM_TRAILER_LEN);
+
+ /* remove from free list, join to this one */
+ elem_free_list_remove(elem->prev);
+
+ new_elem = elem->prev;
+ join_elem(new_elem, elem);
+
+ /* erase header and trailer */
+ memset(erase, 0, MALLOC_ELEM_OVERHEAD);
+
+ elem = new_elem;
+ }
+
+ return elem;
}

/*
@@ -243,32 +381,20 @@ join_elem(struct malloc_elem *elem1, struct malloc_elem *elem2)
int
malloc_elem_free(struct malloc_elem *elem)
{
- size_t sz = elem->size - sizeof(*elem) - MALLOC_ELEM_TRAILER_LEN;
- uint8_t *ptr = (uint8_t *)&elem[1];
- struct malloc_elem *next = RTE_PTR_ADD(elem, elem->size);
- if (next->state == ELEM_FREE){
- /* remove from free list, join to this one */
- elem_free_list_remove(next);
- join_elem(elem, next);
- sz += (sizeof(*elem) + MALLOC_ELEM_TRAILER_LEN);
- }
+ void *ptr;
+ size_t data_len;
+
+ ptr = RTE_PTR_ADD(elem, sizeof(*elem));
+ data_len = elem->size - MALLOC_ELEM_OVERHEAD;
+
+ elem = elem_join_adjacent_free(elem);

- /* check if previous element is free, if so join with it and return,
- * need to re-insert in free list, as that element's size is changing
- */
- if (elem->prev != NULL && elem->prev->state == ELEM_FREE) {
- elem_free_list_remove(elem->prev);
- join_elem(elem->prev, elem);
- sz += (sizeof(*elem) + MALLOC_ELEM_TRAILER_LEN);
- ptr -= (sizeof(*elem) + MALLOC_ELEM_TRAILER_LEN);
- elem = elem->prev;
- }
malloc_elem_free_list_insert(elem);

/* decrease heap's count of allocated elements */
elem->heap->alloc_count--;

- memset(ptr, 0, sz);
+ memset(ptr, 0, data_len);

return 0;
}
@@ -281,21 +407,23 @@ int
malloc_elem_resize(struct malloc_elem *elem, size_t size)
{
const size_t new_size = size + elem->pad + MALLOC_ELEM_OVERHEAD;
+
/* if we request a smaller size, then always return ok */
if (elem->size >= new_size)
return 0;

- struct malloc_elem *next = RTE_PTR_ADD(elem, elem->size);
- if (next ->state != ELEM_FREE)
+ /* check if there is a next element, it's free and adjacent */
+ if (!elem->next || elem->next->state != ELEM_FREE ||
+ !next_elem_is_adjacent(elem))
return -1;
- if (elem->size + next->size < new_size)
+ if (elem->size + elem->next->size < new_size)
return -1;

/* we now know the element fits, so remove from free list,
* join the two
*/
- elem_free_list_remove(next);
- join_elem(elem, next);
+ elem_free_list_remove(elem->next);
+ join_elem(elem, elem->next);

if (elem->size - new_size >= MIN_DATA_SIZE + MALLOC_ELEM_OVERHEAD) {
/* now we have a big block together. Lets cut it down a bit, by splitting */
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index f4c1c7a..238e451 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -18,8 +18,12 @@ enum elem_state {

struct malloc_elem {
struct malloc_heap *heap;
- struct malloc_elem *volatile prev; /* points to prev elem in memseg */
- LIST_ENTRY(malloc_elem) free_list; /* list of free elements in heap */
+ struct malloc_elem *volatile prev;
+ /**< points to prev elem in memseg */
+ struct malloc_elem *volatile next;
+ /**< points to next elem in memseg */
+ LIST_ENTRY(malloc_elem) free_list;
+ /**< list of free elements in heap */
const struct rte_memseg *ms;
volatile enum elem_state state;
uint32_t pad;
@@ -110,12 +114,8 @@ malloc_elem_init(struct malloc_elem *elem,
const struct rte_memseg *ms,
size_t size);

-/*
- * initialise a dummy malloc_elem header for the end-of-memseg marker
- */
void
-malloc_elem_mkend(struct malloc_elem *elem,
- struct malloc_elem *prev_free);
+malloc_elem_insert(struct malloc_elem *elem);

/*
* return true if the current malloc_elem can hold a block of data
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 7d8d70a..9c95166 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -70,15 +70,11 @@ check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
static void
malloc_heap_add_memseg(struct malloc_heap *heap, struct rte_memseg *ms)
{
- /* allocate the memory block headers, one at end, one at start */
struct malloc_elem *start_elem = (struct malloc_elem *)ms->addr;
- struct malloc_elem *end_elem = RTE_PTR_ADD(ms->addr,
- ms->len - MALLOC_ELEM_OVERHEAD);
- end_elem = RTE_PTR_ALIGN_FLOOR(end_elem, RTE_CACHE_LINE_SIZE);
- const size_t elem_size = (uintptr_t)end_elem - (uintptr_t)start_elem;
+ const size_t elem_size = ms->len - MALLOC_ELEM_OVERHEAD;

malloc_elem_init(start_elem, heap, ms, elem_size);
- malloc_elem_mkend(end_elem, start_elem);
+ malloc_elem_insert(start_elem);
malloc_elem_free_list_insert(start_elem);

heap->total_size += elem_size;
--
2.7.4
Stephen Hemminger
2018-04-03 23:32:40 UTC
On Wed, 4 Apr 2018 00:21:15 +0100
Post by Anatoly Burakov
As we are preparing for dynamic memory allocation, we need to be
able to handle holes in our malloc heap, hence we're switching to
a doubly linked list and preparing the infrastructure to support it.
Since our heap is now aware of where its first and last elements are,
there is no longer any need to have a dummy element at the end of
each heap, so get rid of that as well. Instead, let insert/remove/
join/split operations handle end-of-list conditions automatically.
A dummy element at the end of the heap could be helpful for purify/valgrind
style verification that code does not exceed its allocation.
Burakov, Anatoly
2018-04-04 08:05:32 UTC
Post by Stephen Hemminger
On Wed, 4 Apr 2018 00:21:15 +0100
Post by Anatoly Burakov
As we are preparing for dynamic memory allocation, we need to be
able to handle holes in our malloc heap, hence we're switching to
a doubly linked list and preparing the infrastructure to support it.
Since our heap is now aware of where its first and last elements are,
there is no longer any need to have a dummy element at the end of
each heap, so get rid of that as well. Instead, let insert/remove/
join/split operations handle end-of-list conditions automatically.
Dummy element at end of heap could be helpful for purify/valgrind style verification
that code does not exceed allocation.
It would interfere with removing pages from memory :) The dummy element
needs to be stored somewhere - if it's stored in a page, that page
cannot be freed. Moreover, with pages being added/removed
dynamically, the dummy element would have to be moved back and forth, so
"at the end of the heap" is not a fixed location.
--
Thanks,
Anatoly
Anatoly Burakov
2018-04-03 23:21:22 UTC
No major changes, just add some checks in a few key places, and
a new parameter to pass around.

Also, add a dummy function to check a malloc element for physical
contiguity. For now, assume hugepage memory is always
contiguous, while non-hugepage memory will be checked.

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3:
- Moved this patch earlier
- Added physical contiguousness checking function

lib/librte_eal/common/eal_common_memzone.c | 23 +++---
lib/librte_eal/common/malloc_elem.c | 125 ++++++++++++++++++++++++-----
lib/librte_eal/common/malloc_elem.h | 6 +-
lib/librte_eal/common/malloc_heap.c | 11 +--
lib/librte_eal/common/malloc_heap.h | 4 +-
lib/librte_eal/common/rte_malloc.c | 7 +-
6 files changed, 133 insertions(+), 43 deletions(-)
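
The idea behind the contiguity check, condensed into a hedged sketch (the
real implementation is elem_check_phys_contig() below; this version omits the
hugepage and IOVA-as-VA shortcuts and assumes <rte_memory.h> and <stdbool.h>
are included):

    static bool
    pages_iova_contig_sketch(void *start, size_t len, size_t pagesz)
    {
        void *page = RTE_PTR_ALIGN_FLOOR(start, pagesz);
        void *end = RTE_PTR_ALIGN_FLOOR(RTE_PTR_ADD(start, len - 1), pagesz);
        rte_iova_t expected = rte_mem_virt2iova(page) + pagesz;

        /* each page's IOVA must follow the previous one by exactly
         * one page size */
        for (page = RTE_PTR_ADD(page, pagesz); page <= end;
                page = RTE_PTR_ADD(page, pagesz), expected += pagesz) {
            if (rte_mem_virt2iova(page) != expected)
                return false;
        }
        return true;
    }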

diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index 1ab3ade..16a2e7a 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -98,7 +98,8 @@ find_heap_max_free_elem(int *s, unsigned align)

static const struct rte_memzone *
memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
- int socket_id, unsigned flags, unsigned align, unsigned bound)
+ int socket_id, unsigned int flags, unsigned int align,
+ unsigned int bound, bool contig)
{
struct rte_memzone *mz;
struct rte_mem_config *mcfg;
@@ -188,7 +189,7 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,

/* allocate memory on heap */
void *mz_addr = malloc_heap_alloc(&mcfg->malloc_heaps[socket], NULL,
- requested_len, flags, align, bound);
+ requested_len, flags, align, bound, contig);

if ((mz_addr == NULL) && (socket_id == SOCKET_ID_ANY)) {
/* try other heaps */
@@ -197,7 +198,8 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
continue;

mz_addr = malloc_heap_alloc(&mcfg->malloc_heaps[i],
- NULL, requested_len, flags, align, bound);
+ NULL, requested_len, flags, align,
+ bound, contig);
if (mz_addr != NULL)
break;
}
@@ -235,9 +237,9 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
}

static const struct rte_memzone *
-rte_memzone_reserve_thread_safe(const char *name, size_t len,
- int socket_id, unsigned flags, unsigned align,
- unsigned bound)
+rte_memzone_reserve_thread_safe(const char *name, size_t len, int socket_id,
+ unsigned int flags, unsigned int align, unsigned int bound,
+ bool contig)
{
struct rte_mem_config *mcfg;
const struct rte_memzone *mz = NULL;
@@ -248,7 +250,7 @@ rte_memzone_reserve_thread_safe(const char *name, size_t len,
rte_rwlock_write_lock(&mcfg->mlock);

mz = memzone_reserve_aligned_thread_unsafe(
- name, len, socket_id, flags, align, bound);
+ name, len, socket_id, flags, align, bound, contig);

rte_rwlock_write_unlock(&mcfg->mlock);

@@ -265,7 +267,7 @@ rte_memzone_reserve_bounded(const char *name, size_t len, int socket_id,
unsigned flags, unsigned align, unsigned bound)
{
return rte_memzone_reserve_thread_safe(name, len, socket_id, flags,
- align, bound);
+ align, bound, false);
}

/*
@@ -277,7 +279,7 @@ rte_memzone_reserve_aligned(const char *name, size_t len, int socket_id,
unsigned flags, unsigned align)
{
return rte_memzone_reserve_thread_safe(name, len, socket_id, flags,
- align, 0);
+ align, 0, false);
}

/*
@@ -289,7 +291,8 @@ rte_memzone_reserve(const char *name, size_t len, int socket_id,
unsigned flags)
{
return rte_memzone_reserve_thread_safe(name, len, socket_id,
- flags, RTE_CACHE_LINE_SIZE, 0);
+ flags, RTE_CACHE_LINE_SIZE, 0,
+ false);
}

int
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index c18f050..87695b9 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -6,6 +6,7 @@
#include <stddef.h>
#include <stdio.h>
#include <string.h>
+#include <unistd.h>
#include <sys/queue.h>

#include <rte_memory.h>
@@ -94,33 +95,112 @@ malloc_elem_insert(struct malloc_elem *elem)
}

/*
+ * Attempt to find enough physically contiguous memory in this block to store
+ * our data. Assume that element has at least enough space to fit in the data,
+ * so we just check the page addresses.
+ */
+static bool
+elem_check_phys_contig(const struct rte_memseg *ms __rte_unused,
+ void *start, size_t size)
+{
+ rte_iova_t cur, expected;
+ void *start_page, *end_page, *cur_page;
+ size_t pagesz;
+
+ /* for hugepage memory or IOVA as VA, it's always contiguous */
+ if (rte_eal_has_hugepages() || rte_eal_iova_mode() == RTE_IOVA_VA)
+ return true;
+
+ /* otherwise, check if start and end are within the same page */
+ pagesz = getpagesize();
+
+ start_page = RTE_PTR_ALIGN_FLOOR(start, pagesz);
+ end_page = RTE_PTR_ALIGN_FLOOR(RTE_PTR_ADD(start, size - 1), pagesz);
+
+ if (start_page == end_page)
+ return true;
+
+ /* if they are from different pages, check if they are contiguous */
+
+ /* if we can't access physical addresses, assume non-contiguous */
+ if (!rte_eal_using_phys_addrs())
+ return false;
+
+ /* skip first iteration */
+ cur = rte_mem_virt2iova(start_page);
+ expected = cur + pagesz;
+ cur_page = RTE_PTR_ADD(start_page, pagesz);
+
+ while (cur_page <= end_page) {
+ cur = rte_mem_virt2iova(cur_page);
+ if (cur != expected)
+ return false;
+ cur_page = RTE_PTR_ADD(cur_page, pagesz);
+ expected += pagesz;
+ }
+ return true;
+}
+
+/*
* calculate the starting point of where data of the requested size
* and alignment would fit in the current element. If the data doesn't
* fit, return NULL.
*/
static void *
elem_start_pt(struct malloc_elem *elem, size_t size, unsigned align,
- size_t bound)
+ size_t bound, bool contig)
{
- const size_t bmask = ~(bound - 1);
- uintptr_t end_pt = (uintptr_t)elem +
- elem->size - MALLOC_ELEM_TRAILER_LEN;
- uintptr_t new_data_start = RTE_ALIGN_FLOOR((end_pt - size), align);
- uintptr_t new_elem_start;
-
- /* check boundary */
- if ((new_data_start & bmask) != ((end_pt - 1) & bmask)) {
- end_pt = RTE_ALIGN_FLOOR(end_pt, bound);
- new_data_start = RTE_ALIGN_FLOOR((end_pt - size), align);
- end_pt = new_data_start + size;
- if (((end_pt - 1) & bmask) != (new_data_start & bmask))
- return NULL;
- }
+ size_t elem_size = elem->size;
+
+ /*
+ * we're allocating from the end, so adjust the size of element by
+ * alignment size.
+ */
+ while (elem_size >= size) {
+ const size_t bmask = ~(bound - 1);
+ uintptr_t end_pt = (uintptr_t)elem +
+ elem_size - MALLOC_ELEM_TRAILER_LEN;
+ uintptr_t new_data_start = RTE_ALIGN_FLOOR((end_pt - size),
+ align);
+ uintptr_t new_elem_start;
+
+ /* check boundary */
+ if ((new_data_start & bmask) != ((end_pt - 1) & bmask)) {
+ end_pt = RTE_ALIGN_FLOOR(end_pt, bound);
+ new_data_start = RTE_ALIGN_FLOOR((end_pt - size),
+ align);
+ end_pt = new_data_start + size;
+
+ if (((end_pt - 1) & bmask) != (new_data_start & bmask))
+ return NULL;
+ }
+
+ new_elem_start = new_data_start - MALLOC_ELEM_HEADER_LEN;

- new_elem_start = new_data_start - MALLOC_ELEM_HEADER_LEN;
+ /* if the new start point is before the exist start,
+ * it won't fit
+ */
+ if (new_elem_start < (uintptr_t)elem)
+ return NULL;

- /* if the new start point is before the exist start, it won't fit */
- return (new_elem_start < (uintptr_t)elem) ? NULL : (void *)new_elem_start;
+ if (contig) {
+ size_t new_data_size = end_pt - new_data_start;
+
+ /*
+ * if physical contiguousness was requested and we
+ * couldn't fit all data into one physically contiguous
+ * block, try again with lower addresses.
+ */
+ if (!elem_check_phys_contig(elem->ms,
+ (void *)new_data_start,
+ new_data_size)) {
+ elem_size -= align;
+ continue;
+ }
+ }
+ return (void *)new_elem_start;
+ }
+ return NULL;
}

/*
@@ -129,9 +209,9 @@ elem_start_pt(struct malloc_elem *elem, size_t size, unsigned align,
*/
int
malloc_elem_can_hold(struct malloc_elem *elem, size_t size, unsigned align,
- size_t bound)
+ size_t bound, bool contig)
{
- return elem_start_pt(elem, size, align, bound) != NULL;
+ return elem_start_pt(elem, size, align, bound, contig) != NULL;
}

/*
@@ -259,9 +339,10 @@ malloc_elem_free_list_remove(struct malloc_elem *elem)
*/
struct malloc_elem *
malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align,
- size_t bound)
+ size_t bound, bool contig)
{
- struct malloc_elem *new_elem = elem_start_pt(elem, size, align, bound);
+ struct malloc_elem *new_elem = elem_start_pt(elem, size, align, bound,
+ contig);
const size_t old_elem_size = (uintptr_t)new_elem - (uintptr_t)elem;
const size_t trailer_size = elem->size - old_elem_size - size -
MALLOC_ELEM_OVERHEAD;
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 9c1614c..34bd268 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -5,6 +5,8 @@
#ifndef MALLOC_ELEM_H_
#define MALLOC_ELEM_H_

+#include <stdbool.h>
+
#include <rte_memory.h>

/* dummy definition of struct so we can use pointers to it in malloc_elem struct */
@@ -123,7 +125,7 @@ malloc_elem_insert(struct malloc_elem *elem);
*/
int
malloc_elem_can_hold(struct malloc_elem *elem, size_t size,
- unsigned align, size_t bound);
+ unsigned int align, size_t bound, bool contig);

/*
* reserve a block of data in an existing malloc_elem. If the malloc_elem
@@ -131,7 +133,7 @@ malloc_elem_can_hold(struct malloc_elem *elem, size_t size,
*/
struct malloc_elem *
malloc_elem_alloc(struct malloc_elem *elem, size_t size,
- unsigned align, size_t bound);
+ unsigned int align, size_t bound, bool contig);

/*
* free a malloc_elem block by adding it to the free list. If the
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index a2c2e4c..564b61a 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -88,7 +88,7 @@ malloc_heap_add_memseg(struct malloc_heap *heap, struct rte_memseg *ms)
*/
static struct malloc_elem *
find_suitable_element(struct malloc_heap *heap, size_t size,
- unsigned flags, size_t align, size_t bound)
+ unsigned int flags, size_t align, size_t bound, bool contig)
{
size_t idx;
struct malloc_elem *elem, *alt_elem = NULL;
@@ -97,7 +97,8 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
idx < RTE_HEAP_NUM_FREELISTS; idx++) {
for (elem = LIST_FIRST(&heap->free_head[idx]);
!!elem; elem = LIST_NEXT(elem, free_list)) {
- if (malloc_elem_can_hold(elem, size, align, bound)) {
+ if (malloc_elem_can_hold(elem, size, align, bound,
+ contig)) {
if (check_hugepage_sz(flags, elem->ms->hugepage_sz))
return elem;
if (alt_elem == NULL)
@@ -121,7 +122,7 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
void *
malloc_heap_alloc(struct malloc_heap *heap,
const char *type __attribute__((unused)), size_t size, unsigned flags,
- size_t align, size_t bound)
+ size_t align, size_t bound, bool contig)
{
struct malloc_elem *elem;

@@ -130,9 +131,9 @@ malloc_heap_alloc(struct malloc_heap *heap,

rte_spinlock_lock(&heap->lock);

- elem = find_suitable_element(heap, size, flags, align, bound);
+ elem = find_suitable_element(heap, size, flags, align, bound, contig);
if (elem != NULL) {
- elem = malloc_elem_alloc(elem, size, align, bound);
+ elem = malloc_elem_alloc(elem, size, align, bound, contig);
/* increase heap's count of allocated elements */
heap->alloc_count++;
}
diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h
index bb28422..c57b59a 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -5,6 +5,8 @@
#ifndef MALLOC_HEAP_H_
#define MALLOC_HEAP_H_

+#include <stdbool.h>
+
#include <rte_malloc.h>
#include <rte_malloc_heap.h>

@@ -25,7 +27,7 @@ malloc_get_numa_socket(void)

void *
malloc_heap_alloc(struct malloc_heap *heap, const char *type, size_t size,
- unsigned flags, size_t align, size_t bound);
+ unsigned int flags, size_t align, size_t bound, bool contig);

int
malloc_heap_free(struct malloc_elem *elem);
diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index 2cda48e..436818a 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -37,7 +37,8 @@ void rte_free(void *addr)
* Allocate memory on specified heap.
*/
void *
-rte_malloc_socket(const char *type, size_t size, unsigned align, int socket_arg)
+rte_malloc_socket(const char *type, size_t size, unsigned int align,
+ int socket_arg)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
int socket, i;
@@ -60,7 +61,7 @@ rte_malloc_socket(const char *type, size_t size, unsigned align, int socket_arg)
return NULL;

ret = malloc_heap_alloc(&mcfg->malloc_heaps[socket], type,
- size, 0, align == 0 ? 1 : align, 0);
+ size, 0, align == 0 ? 1 : align, 0, false);
if (ret != NULL || socket_arg != SOCKET_ID_ANY)
return ret;

@@ -71,7 +72,7 @@ rte_malloc_socket(const char *type, size_t size, unsigned align, int socket_arg)
continue;

ret = malloc_heap_alloc(&mcfg->malloc_heaps[i], type,
- size, 0, align == 0 ? 1 : align, 0);
+ size, 0, align == 0 ? 1 : align, 0, false);
if (ret != NULL)
return ret;
}
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:23 UTC
This adds a new set of _contig APIs to rte_memzone. For now,
hugepage memory is always contiguous, but we need to prepare the
drivers for the switch.

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3:
- Moved this patch earlier

lib/librte_eal/common/eal_common_memzone.c | 44 ++++++++
lib/librte_eal/common/include/rte_memzone.h | 158 ++++++++++++++++++++++++++++
lib/librte_eal/rte_eal_version.map | 3 +
3 files changed, 205 insertions(+)
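
A hedged usage sketch for the new API, as it might appear in a driver's setup
path (zone name and length are illustrative):

    const struct rte_memzone *mz;

    /* reserve an IOVA-contiguous, cache-line aligned zone on the
     * caller's socket */
    mz = rte_memzone_reserve_aligned_contig("hw_ring_mz", 16384,
            rte_socket_id(), 0, RTE_CACHE_LINE_SIZE);
    if (mz == NULL)
        return -1; /* rte_errno is set, e.g. ENOMEM */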

diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index 16a2e7a..36d2553 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -171,6 +171,12 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
socket_id = SOCKET_ID_ANY;

if (len == 0) {
+ /* len == 0 is only allowed for non-contiguous zones */
+ if (contig) {
+ RTE_LOG(DEBUG, EAL, "Reserving zero-length contiguous memzones is not supported\n");
+ rte_errno = EINVAL;
+ return NULL;
+ }
if (bound != 0)
requested_len = bound;
else {
@@ -272,6 +278,19 @@ rte_memzone_reserve_bounded(const char *name, size_t len, int socket_id,

/*
* Return a pointer to a correctly filled memzone descriptor (with a
+ * specified alignment and boundary). If the allocation cannot be done,
+ * return NULL.
+ */
+__rte_experimental const struct rte_memzone *
+rte_memzone_reserve_bounded_contig(const char *name, size_t len, int socket_id,
+ unsigned int flags, unsigned int align, unsigned int bound)
+{
+ return rte_memzone_reserve_thread_safe(name, len, socket_id, flags,
+ align, bound, true);
+}
+
+/*
+ * Return a pointer to a correctly filled memzone descriptor (with a
* specified alignment). If the allocation cannot be done, return NULL.
*/
const struct rte_memzone *
@@ -283,6 +302,18 @@ rte_memzone_reserve_aligned(const char *name, size_t len, int socket_id,
}

/*
+ * Return a pointer to a correctly filled memzone descriptor (with a
+ * specified alignment). If the allocation cannot be done, return NULL.
+ */
+__rte_experimental const struct rte_memzone *
+rte_memzone_reserve_aligned_contig(const char *name, size_t len, int socket_id,
+ unsigned int flags, unsigned int align)
+{
+ return rte_memzone_reserve_thread_safe(name, len, socket_id, flags,
+ align, 0, true);
+}
+
+/*
* Return a pointer to a correctly filled memzone descriptor. If the
* allocation cannot be done, return NULL.
*/
@@ -295,6 +326,19 @@ rte_memzone_reserve(const char *name, size_t len, int socket_id,
false);
}

+/*
+ * Return a pointer to a correctly filled memzone descriptor. If the
+ * allocation cannot be done, return NULL.
+ */
+__rte_experimental const struct rte_memzone *
+rte_memzone_reserve_contig(const char *name, size_t len, int socket_id,
+ unsigned int flags)
+{
+ return rte_memzone_reserve_thread_safe(name, len, socket_id,
+ flags, RTE_CACHE_LINE_SIZE, 0,
+ true);
+}
+
int
rte_memzone_free(const struct rte_memzone *mz)
{
diff --git a/lib/librte_eal/common/include/rte_memzone.h b/lib/librte_eal/common/include/rte_memzone.h
index 2bfb273..ef3a4dd 100644
--- a/lib/librte_eal/common/include/rte_memzone.h
+++ b/lib/librte_eal/common/include/rte_memzone.h
@@ -23,6 +23,7 @@
*/

#include <stdio.h>
+#include <rte_compat.h>
#include <rte_memory.h>
#include <rte_common.h>

@@ -228,6 +229,163 @@ const struct rte_memzone *rte_memzone_reserve_bounded(const char *name,
unsigned flags, unsigned align, unsigned bound);

/**
+ * Reserve an IOVA-contiguous portion of physical memory.
+ *
+ * This function reserves some IOVA-contiguous memory and returns a pointer to a
+ * correctly filled memzone descriptor. If the allocation cannot be
+ * done, return NULL.
+ *
+ * @param name
+ * The name of the memzone. If it already exists, the function will
+ * fail and return NULL.
+ * @param len
+ * The size of the memory to be reserved.
+ * @param socket_id
+ * The socket identifier in the case of
+ * NUMA. The value can be SOCKET_ID_ANY if there is no NUMA
+ * constraint for the reserved zone.
+ * @param flags
+ * The flags parameter is used to request memzones to be
+ * taken from specifically sized hugepages.
+ * - RTE_MEMZONE_2MB - Reserved from 2MB pages
+ * - RTE_MEMZONE_1GB - Reserved from 1GB pages
+ * - RTE_MEMZONE_16MB - Reserved from 16MB pages
+ * - RTE_MEMZONE_16GB - Reserved from 16GB pages
+ * - RTE_MEMZONE_256KB - Reserved from 256KB pages
+ * - RTE_MEMZONE_256MB - Reserved from 256MB pages
+ * - RTE_MEMZONE_512MB - Reserved from 512MB pages
+ * - RTE_MEMZONE_4GB - Reserved from 4GB pages
+ * - RTE_MEMZONE_SIZE_HINT_ONLY - Allow alternative page size to be used if
+ * the requested page size is unavailable.
+ * If this flag is not set, the function
+ * will return error on an unavailable size
+ * request.
+ * @return
+ * A pointer to a correctly-filled read-only memzone descriptor, or NULL
+ * on error.
+ * On error case, rte_errno will be set appropriately:
+ * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure
+ * - E_RTE_SECONDARY - function was called from a secondary process instance
+ * - ENOSPC - the maximum number of memzones has already been allocated
+ * - EEXIST - a memzone with the same name already exists
+ * - ENOMEM - no appropriate memory area found in which to create memzone
+ * - EINVAL - invalid parameters
+ */
+__rte_experimental const struct rte_memzone *
+rte_memzone_reserve_contig(const char *name,
+ size_t len, int socket_id, unsigned int flags);
+
+/**
+ * Reserve an IOVA-contiguous portion of physical memory with alignment on a
+ * specified boundary.
+ *
+ * This function reserves some IOVA-contiguous memory with alignment on a
+ * specified boundary, and returns a pointer to a correctly filled memzone
+ * descriptor. If the allocation cannot be done or if the alignment
+ * is not a power of 2, returns NULL.
+ *
+ * @param name
+ * The name of the memzone. If it already exists, the function will
+ * fail and return NULL.
+ * @param len
+ * The size of the memory to be reserved.
+ * @param socket_id
+ * The socket identifier in the case of
+ * NUMA. The value can be SOCKET_ID_ANY if there is no NUMA
+ * constraint for the reserved zone.
+ * @param flags
+ * The flags parameter is used to request memzones to be
+ * taken from specifically sized hugepages.
+ * - RTE_MEMZONE_2MB - Reserved from 2MB pages
+ * - RTE_MEMZONE_1GB - Reserved from 1GB pages
+ * - RTE_MEMZONE_16MB - Reserved from 16MB pages
+ * - RTE_MEMZONE_16GB - Reserved from 16GB pages
+ * - RTE_MEMZONE_256KB - Reserved from 256KB pages
+ * - RTE_MEMZONE_256MB - Reserved from 256MB pages
+ * - RTE_MEMZONE_512MB - Reserved from 512MB pages
+ * - RTE_MEMZONE_4GB - Reserved from 4GB pages
+ * - RTE_MEMZONE_SIZE_HINT_ONLY - Allow alternative page size to be used if
+ * the requested page size is unavailable.
+ * If this flag is not set, the function
+ * will return error on an unavailable size
+ * request.
+ * @param align
+ * Alignment for resulting memzone. Must be a power of 2.
+ * @return
+ * A pointer to a correctly-filled read-only memzone descriptor, or NULL
+ * on error.
+ * On error case, rte_errno will be set appropriately:
+ * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure
+ * - E_RTE_SECONDARY - function was called from a secondary process instance
+ * - ENOSPC - the maximum number of memzones has already been allocated
+ * - EEXIST - a memzone with the same name already exists
+ * - ENOMEM - no appropriate memory area found in which to create memzone
+ * - EINVAL - invalid parameters
+ */
+__rte_experimental const struct rte_memzone *
+rte_memzone_reserve_aligned_contig(const char *name,
+ size_t len, int socket_id, unsigned int flags,
+ unsigned int align);
+
+/**
+ * Reserve an IOVA-contiguous portion of physical memory with specified
+ * alignment and boundary.
+ *
+ * This function reserves some IOVA-contiguous memory with specified alignment
+ * and boundary, and returns a pointer to a correctly filled memzone
+ * descriptor. If the allocation cannot be done or if the alignment
+ * or boundary are not a power of 2, returns NULL.
+ * The memory buffer is reserved in a way that it wouldn't cross the
+ * specified boundary. That implies that the requested length should be
+ * less than or equal to the boundary.
+ *
+ * @param name
+ * The name of the memzone. If it already exists, the function will
+ * fail and return NULL.
+ * @param len
+ * The size of the memory to be reserved.
+ * @param socket_id
+ * The socket identifier in the case of
+ * NUMA. The value can be SOCKET_ID_ANY if there is no NUMA
+ * constraint for the reserved zone.
+ * @param flags
+ * The flags parameter is used to request memzones to be
+ * taken from specifically sized hugepages.
+ * - RTE_MEMZONE_2MB - Reserved from 2MB pages
+ * - RTE_MEMZONE_1GB - Reserved from 1GB pages
+ * - RTE_MEMZONE_16MB - Reserved from 16MB pages
+ * - RTE_MEMZONE_16GB - Reserved from 16GB pages
+ * - RTE_MEMZONE_256KB - Reserved from 256KB pages
+ * - RTE_MEMZONE_256MB - Reserved from 256MB pages
+ * - RTE_MEMZONE_512MB - Reserved from 512MB pages
+ * - RTE_MEMZONE_4GB - Reserved from 4GB pages
+ * - RTE_MEMZONE_SIZE_HINT_ONLY - Allow alternative page size to be used if
+ * the requested page size is unavailable.
+ * If this flag is not set, the function
+ * will return error on an unavailable size
+ * request.
+ * @param align
+ * Alignment for resulting memzone. Must be a power of 2.
+ * @param bound
+ * Boundary for resulting memzone. Must be a power of 2 or zero.
+ * Zero value implies no boundary condition.
+ * @return
+ * A pointer to a correctly-filled read-only memzone descriptor, or NULL
+ * on error.
+ * On error case, rte_errno will be set appropriately:
+ * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure
+ * - E_RTE_SECONDARY - function was called from a secondary process instance
+ * - ENOSPC - the maximum number of memzones has already been allocated
+ * - EEXIST - a memzone with the same name already exists
+ * - ENOMEM - no appropriate memory area found in which to create memzone
+ * - EINVAL - invalid parameters
+ */
+__rte_experimental const struct rte_memzone *
+rte_memzone_reserve_bounded_contig(const char *name,
+ size_t len, int socket_id, unsigned int flags,
+ unsigned int align, unsigned int bound);
+
+/**
* Free a memzone.
*
* @param mz
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index d9fc458..25e00de 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -223,6 +223,9 @@ EXPERIMENTAL {
rte_eal_mbuf_user_pool_ops;
rte_log_register_type_and_pick_level;
rte_malloc_dump_heaps;
+ rte_memzone_reserve_contig;
+ rte_memzone_reserve_aligned_contig;
+ rte_memzone_reserve_bounded_contig;
rte_mp_action_register;
rte_mp_action_unregister;
rte_mp_reply;
--
2.7.4
Stephen Hemminger
2018-04-03 23:41:28 UTC
Permalink
On Wed, 4 Apr 2018 00:21:23 +0100
Post by Anatoly Burakov
This adds a new set of _contig APIs to rte_memzone. For now,
hugepage memory is always contiguous, but we need to prepare the
drivers for the switch.
Why not make fragmentation an optional flag to memzone_reserve
rather than a new API? That way fewer drivers need to be changed.

#define RTE_MEMZONE_SPARSE 0x00100000 /* Allow zone to be non-contiguous */
Burakov, Anatoly
2018-04-04 08:01:22 UTC
Permalink
Post by Stephen Hemminger
On Wed, 4 Apr 2018 00:21:23 +0100
Post by Anatoly Burakov
This adds a new set of _contig APIs to rte_memzone. For now,
hugepage memory is always contiguous, but we need to prepare the
drivers for the switch.
Why not make fragmentation an optional flag to memzone_reserve
rather than a new API? That way fewer drivers need to be changed.
#define RTE_MEMZONE_SPARSE 0x00100000 /* Allow zone to be non-contiguous */
That is a good idea; however, I would perhaps do it the other way
around. Most of the time we are reserving memzones, we do not need those
to be IOVA-contiguous (e.g. when creating rings, hash tables, etc.). So it
would rather be an RTE_MEMZONE_CONTIG (or some such) flag.
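
As a rough sketch of what I mean (the flag name and value below are
hypothetical, nothing in this series defines them):

#include <rte_memzone.h>

/* hypothetical flag, mirroring the existing RTE_MEMZONE_* bits */
#define RTE_MEMZONE_IOVA_CONTIG 0x00100000

static const struct rte_memzone *
reserve_contig_via_flag(const char *name, size_t len)
{
        /* only callers that actually need IOVA-contiguous memory opt in */
        return rte_memzone_reserve(name, len, SOCKET_ID_ANY,
                        RTE_MEMZONE_IOVA_CONTIG);
}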
--
Thanks,
Anatoly
Anatoly Burakov
2018-04-03 23:21:28 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3:
- Moved patch earlier in the patchset
- Allowed experimental API's in the makefile

drivers/net/cxgbe/Makefile | 3 +++
drivers/net/cxgbe/sge.c | 3 ++-
2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/cxgbe/Makefile b/drivers/net/cxgbe/Makefile
index 8fba1a5..0042f5e 100644
--- a/drivers/net/cxgbe/Makefile
+++ b/drivers/net/cxgbe/Makefile
@@ -18,6 +18,9 @@ EXPORT_MAP := rte_pmd_cxgbe_version.map

LIBABIVER := 1

+# contiguous memzone reserve API are not yet stable
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
ifeq ($(CONFIG_RTE_TOOLCHAIN_ICC),y)
#
# CFLAGS for icc
diff --git a/drivers/net/cxgbe/sge.c b/drivers/net/cxgbe/sge.c
index 83e26d0..0cd3e56 100644
--- a/drivers/net/cxgbe/sge.c
+++ b/drivers/net/cxgbe/sge.c
@@ -1344,7 +1344,8 @@ static void *alloc_ring(size_t nelem, size_t elem_size,
* handle the maximum ring size is allocated in order to allow for
* resizing in later calls to the queue setup function.
*/
- tz = rte_memzone_reserve_aligned(z_name, len, socket_id, 0, 4096);
+ tz = rte_memzone_reserve_aligned_contig(z_name, len, socket_id, 0,
+ 4096);
if (!tz)
return NULL;
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:20 UTC
Permalink
This will be needed because we need to know how big the new
empty space is, in order to check whether we can free some
pages as a result.
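
A hedged sketch of how a caller could use the new return value (the
page-freeing logic itself only lands later in the series, the 2M
threshold below is made up, and the element's size field is assumed to
describe the merged free block):

#include <rte_memory.h>
#include "malloc_elem.h"

static int
free_and_check(struct malloc_elem *elem)
{
        /* free_elem describes the (possibly merged) free block */
        struct malloc_elem *free_elem = malloc_elem_free(elem);

        if (free_elem == NULL)
                return -1;

        /* the size of the merged element is the size of the new empty
         * space, so we can tell whether whole pages became unused
         */
        if (free_elem->size >= RTE_PGSIZE_2M) {
                /* candidate for returning pages back to the system */
        }
        return 0;
}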

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3: clarified commit message

lib/librte_eal/common/malloc_elem.c | 4 ++--
lib/librte_eal/common/malloc_elem.h | 2 +-
lib/librte_eal/common/malloc_heap.c | 4 ++--
3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 008f5a3..c18f050 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -379,7 +379,7 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem)
* blocks either immediately before or immediately after newly freed block
* are also free, the blocks are merged together.
*/
-int
+struct malloc_elem *
malloc_elem_free(struct malloc_elem *elem)
{
void *ptr;
@@ -397,7 +397,7 @@ malloc_elem_free(struct malloc_elem *elem)

memset(ptr, 0, data_len);

- return 0;
+ return elem;
}

/*
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 46e2383..9c1614c 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -138,7 +138,7 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size,
* blocks either immediately before or immediately after newly freed block
* are also free, the blocks are merged together.
*/
-int
+struct malloc_elem *
malloc_elem_free(struct malloc_elem *elem);

struct malloc_elem *
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 44538d7..a2c2e4c 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -145,7 +145,7 @@ int
malloc_heap_free(struct malloc_elem *elem)
{
struct malloc_heap *heap;
- int ret;
+ struct malloc_elem *ret;

if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
return -1;
@@ -159,7 +159,7 @@ malloc_heap_free(struct malloc_elem *elem)

rte_spinlock_unlock(&(heap->lock));

- return ret;
+ return ret != NULL ? 0 : -1;
}

int
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:17 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
test/test/commands.c | 3 +++
1 file changed, 3 insertions(+)

diff --git a/test/test/commands.c b/test/test/commands.c
index cf0b726..6bfdc02 100644
--- a/test/test/commands.c
+++ b/test/test/commands.c
@@ -137,6 +137,8 @@ static void cmd_dump_parsed(void *parsed_result,
rte_log_dump(stdout);
else if (!strcmp(res->dump, "dump_malloc_stats"))
rte_malloc_dump_stats(stdout, NULL);
+ else if (!strcmp(res->dump, "dump_malloc_heaps"))
+ rte_malloc_dump_heaps(stdout);
}

cmdline_parse_token_string_t cmd_dump_dump =
@@ -147,6 +149,7 @@ cmdline_parse_token_string_t cmd_dump_dump =
"dump_ring#"
"dump_mempool#"
"dump_malloc_stats#"
+ "dump_malloc_heaps#"
"dump_devargs#"
"dump_log_types");
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:32 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
Acked-by: Harish Patil <***@cavium.com>
---

Notes:
v3:
- Moved the patch earlier in the patchset
- Allowed experimental API in Makefile

drivers/net/qede/Makefile | 3 +++
drivers/net/qede/base/bcm_osal.c | 5 +++--
2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/qede/Makefile b/drivers/net/qede/Makefile
index ccbffa4..83a4b8c 100644
--- a/drivers/net/qede/Makefile
+++ b/drivers/net/qede/Makefile
@@ -21,6 +21,9 @@ EXPORT_MAP := rte_pmd_qede_version.map

LIBABIVER := 1

+# contiguous memzone reserve API are not yet stable
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
#
# OS
#
diff --git a/drivers/net/qede/base/bcm_osal.c b/drivers/net/qede/base/bcm_osal.c
index 91017b8..3a0a9aa 100644
--- a/drivers/net/qede/base/bcm_osal.c
+++ b/drivers/net/qede/base/bcm_osal.c
@@ -135,7 +135,7 @@ void *osal_dma_alloc_coherent(struct ecore_dev *p_dev,
if (core_id == (unsigned int)LCORE_ID_ANY)
core_id = rte_get_master_lcore();
socket_id = rte_lcore_to_socket_id(core_id);
- mz = rte_memzone_reserve_aligned(mz_name, size,
+ mz = rte_memzone_reserve_aligned_contig(mz_name, size,
socket_id, 0, RTE_CACHE_LINE_SIZE);
if (!mz) {
DP_ERR(p_dev, "Unable to allocate DMA memory "
@@ -174,7 +174,8 @@ void *osal_dma_alloc_coherent_aligned(struct ecore_dev *p_dev,
if (core_id == (unsigned int)LCORE_ID_ANY)
core_id = rte_get_master_lcore();
socket_id = rte_lcore_to_socket_id(core_id);
- mz = rte_memzone_reserve_aligned(mz_name, size, socket_id, 0, align);
+ mz = rte_memzone_reserve_aligned_contig(mz_name, size, socket_id, 0,
+ align);
if (!mz) {
DP_ERR(p_dev, "Unable to allocate DMA memory "
"of size %zu bytes - %s\n",
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:26 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3:
- Moved patch earlier in the patchset
- Allowed experimental API's in the makefile

drivers/net/avf/Makefile | 3 +++
drivers/net/avf/avf_ethdev.c | 2 +-
2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/avf/Makefile b/drivers/net/avf/Makefile
index 3f815bb..678d49c 100644
--- a/drivers/net/avf/Makefile
+++ b/drivers/net/avf/Makefile
@@ -20,6 +20,9 @@ EXPORT_MAP := rte_pmd_avf_version.map

LIBABIVER := 1

+# contiguous memzone reserve API are not yet stable
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
#
# Add extra flags for base driver files (also known as shared code)
# to disable warnings
diff --git a/drivers/net/avf/avf_ethdev.c b/drivers/net/avf/avf_ethdev.c
index 4442c3c..739ab92 100644
--- a/drivers/net/avf/avf_ethdev.c
+++ b/drivers/net/avf/avf_ethdev.c
@@ -1365,7 +1365,7 @@ avf_allocate_dma_mem_d(__rte_unused struct avf_hw *hw,
return AVF_ERR_PARAM;

snprintf(z_name, sizeof(z_name), "avf_dma_%"PRIu64, rte_rand());
- mz = rte_memzone_reserve_bounded(z_name, size, SOCKET_ID_ANY, 0,
+ mz = rte_memzone_reserve_bounded_contig(z_name, size, SOCKET_ID_ANY, 0,
alignment, RTE_PGSIZE_2M);
if (!mz)
return AVF_ERR_NO_MEMORY;
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:30 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
Acked-by: John Daley <***@cisco.com>
---

Notes:
v3:
- Moved patch earlier in the patchset
- Allowed experimental API in Makefile

drivers/net/enic/Makefile | 3 +++
drivers/net/enic/enic_main.c | 4 ++--
2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/enic/Makefile b/drivers/net/enic/Makefile
index 7c6c29c..f117c96 100644
--- a/drivers/net/enic/Makefile
+++ b/drivers/net/enic/Makefile
@@ -13,6 +13,9 @@ EXPORT_MAP := rte_pmd_enic_version.map

LIBABIVER := 1

+# contiguous memzone reserve API are not yet stable
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
CFLAGS += -I$(SRCDIR)/base/
CFLAGS += -I$(SRCDIR)
CFLAGS += -O3
diff --git a/drivers/net/enic/enic_main.c b/drivers/net/enic/enic_main.c
index 69ad425..d19033e 100644
--- a/drivers/net/enic/enic_main.c
+++ b/drivers/net/enic/enic_main.c
@@ -343,7 +343,7 @@ enic_alloc_consistent(void *priv, size_t size,
struct enic *enic = (struct enic *)priv;
struct enic_memzone_entry *mze;

- rz = rte_memzone_reserve_aligned((const char *)name,
+ rz = rte_memzone_reserve_aligned_contig((const char *)name,
size, SOCKET_ID_ANY, 0, ENIC_ALIGN);
if (!rz) {
pr_err("%s : Failed to allocate memory requested for %s\n",
@@ -887,7 +887,7 @@ int enic_alloc_wq(struct enic *enic, uint16_t queue_idx,
"vnic_cqmsg-%s-%d-%d", enic->bdf_name, queue_idx,
instance++);

- wq->cqmsg_rz = rte_memzone_reserve_aligned((const char *)name,
+ wq->cqmsg_rz = rte_memzone_reserve_aligned_contig((const char *)name,
sizeof(uint32_t),
SOCKET_ID_ANY, 0,
ENIC_ALIGN);
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:27 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3:
- Moved patch earlier in the patchset
- Allowed experimental API's in the makefile

drivers/net/bnx2x/Makefile | 3 +++
drivers/net/bnx2x/bnx2x.c | 2 +-
drivers/net/bnx2x/bnx2x_rxtx.c | 3 ++-
3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bnx2x/Makefile b/drivers/net/bnx2x/Makefile
index 90ff8b1..d9b3ffd 100644
--- a/drivers/net/bnx2x/Makefile
+++ b/drivers/net/bnx2x/Makefile
@@ -17,6 +17,9 @@ EXPORT_MAP := rte_pmd_bnx2x_version.map

LIBABIVER := 1

+# contiguous memzone reserve API are not yet stable
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
ifeq ($(CONFIG_RTE_TOOLCHAIN_ICC),y)
CFLAGS += -wd188 #188: enumerated type mixed with another type
endif
diff --git a/drivers/net/bnx2x/bnx2x.c b/drivers/net/bnx2x/bnx2x.c
index fb02d0f..81f5dae 100644
--- a/drivers/net/bnx2x/bnx2x.c
+++ b/drivers/net/bnx2x/bnx2x.c
@@ -177,7 +177,7 @@ bnx2x_dma_alloc(struct bnx2x_softc *sc, size_t size, struct bnx2x_dma *dma,
rte_get_timer_cycles());

/* Caller must take care that strlen(mz_name) < RTE_MEMZONE_NAMESIZE */
- z = rte_memzone_reserve_aligned(mz_name, (uint64_t) (size),
+ z = rte_memzone_reserve_aligned_contig(mz_name, (uint64_t)size,
SOCKET_ID_ANY,
0, align);
if (z == NULL) {
diff --git a/drivers/net/bnx2x/bnx2x_rxtx.c b/drivers/net/bnx2x/bnx2x_rxtx.c
index a0d4ac9..325b94d 100644
--- a/drivers/net/bnx2x/bnx2x_rxtx.c
+++ b/drivers/net/bnx2x/bnx2x_rxtx.c
@@ -26,7 +26,8 @@ ring_dma_zone_reserve(struct rte_eth_dev *dev, const char *ring_name,
if (mz)
return mz;

- return rte_memzone_reserve_aligned(z_name, ring_size, socket_id, 0, BNX2X_PAGE_SIZE);
+ return rte_memzone_reserve_aligned_contig(z_name, ring_size, socket_id,
+ 0, BNX2X_PAGE_SIZE);
}

static void
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:25 UTC
Permalink
Also, remove the weird page alignment code.

Signed-off-by: Anatoly Burakov <***@intel.com>
Acked-by: Fiona Trahe <***@intel.com>
---

Notes:
v3:
- Move the patch earlier in the patchset
- Fix build system files to allow experimental API's
- Removed nonsensical memzone flags code

drivers/crypto/qat/Makefile | 3 +++
drivers/crypto/qat/meson.build | 3 +++
drivers/crypto/qat/qat_qp.c | 23 ++---------------------
3 files changed, 8 insertions(+), 21 deletions(-)

diff --git a/drivers/crypto/qat/Makefile b/drivers/crypto/qat/Makefile
index 260912d..a93fad8 100644
--- a/drivers/crypto/qat/Makefile
+++ b/drivers/crypto/qat/Makefile
@@ -13,6 +13,9 @@ LIBABIVER := 1
CFLAGS += $(WERROR_FLAGS)
CFLAGS += -O3

+# contiguous memzone reserve API are not yet stable
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
# external library include paths
CFLAGS += -I$(SRCDIR)/qat_adf
LDLIBS += -lcrypto
diff --git a/drivers/crypto/qat/meson.build b/drivers/crypto/qat/meson.build
index 7b90463..ff0c239 100644
--- a/drivers/crypto/qat/meson.build
+++ b/drivers/crypto/qat/meson.build
@@ -12,3 +12,6 @@ includes += include_directories('qat_adf')
deps += ['bus_pci']
ext_deps += dep
pkgconfig_extra_libs += '-lcrypto'
+
+# contig memzone allocation is not yet part of stable API
+allow_experimental_apis = true
diff --git a/drivers/crypto/qat/qat_qp.c b/drivers/crypto/qat/qat_qp.c
index 87b9ce0..2ba28cf 100644
--- a/drivers/crypto/qat/qat_qp.c
+++ b/drivers/crypto/qat/qat_qp.c
@@ -54,8 +54,6 @@ queue_dma_zone_reserve(const char *queue_name, uint32_t queue_size,
int socket_id)
{
const struct rte_memzone *mz;
- unsigned memzone_flags = 0;
- const struct rte_memseg *ms;

PMD_INIT_FUNC_TRACE();
mz = rte_memzone_lookup(queue_name);
@@ -78,25 +76,8 @@ queue_dma_zone_reserve(const char *queue_name, uint32_t queue_size,

PMD_DRV_LOG(DEBUG, "Allocate memzone for %s, size %u on socket %u",
queue_name, queue_size, socket_id);
- ms = rte_eal_get_physmem_layout();
- switch (ms[0].hugepage_sz) {
- case(RTE_PGSIZE_2M):
- memzone_flags = RTE_MEMZONE_2MB;
- break;
- case(RTE_PGSIZE_1G):
- memzone_flags = RTE_MEMZONE_1GB;
- break;
- case(RTE_PGSIZE_16M):
- memzone_flags = RTE_MEMZONE_16MB;
- break;
- case(RTE_PGSIZE_16G):
- memzone_flags = RTE_MEMZONE_16GB;
- break;
- default:
- memzone_flags = RTE_MEMZONE_SIZE_HINT_ONLY;
- }
- return rte_memzone_reserve_aligned(queue_name, queue_size, socket_id,
- memzone_flags, queue_size);
+ return rte_memzone_reserve_aligned_contig(queue_name, queue_size,
+ socket_id, 0, queue_size);
}

int qat_crypto_sym_qp_setup(struct rte_cryptodev *dev, uint16_t queue_pair_id,
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:33 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
Reviewed-by: Venkatesh Srinivas <***@google.com>
Reviewed-by: Maxime Coquelin <***@redhat.com>
---

Notes:
v3:
- Moved patch earlier in the patchset

drivers/net/virtio/virtio_ethdev.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
index 2ef213d..bdd5e87 100644
--- a/drivers/net/virtio/virtio_ethdev.c
+++ b/drivers/net/virtio/virtio_ethdev.c
@@ -390,7 +390,7 @@ virtio_init_queue(struct rte_eth_dev *dev, uint16_t vtpci_queue_idx)
PMD_INIT_LOG(DEBUG, "vring_size: %d, rounded_vring_size: %d",
size, vq->vq_ring_size);

- mz = rte_memzone_reserve_aligned(vq_name, vq->vq_ring_size,
+ mz = rte_memzone_reserve_aligned_contig(vq_name, vq->vq_ring_size,
SOCKET_ID_ANY,
0, VIRTIO_PCI_VRING_ALIGN);
if (mz == NULL) {
@@ -416,9 +416,9 @@ virtio_init_queue(struct rte_eth_dev *dev, uint16_t vtpci_queue_idx)
if (sz_hdr_mz) {
snprintf(vq_hdr_name, sizeof(vq_hdr_name), "port%d_vq%d_hdr",
dev->data->port_id, vtpci_queue_idx);
- hdr_mz = rte_memzone_reserve_aligned(vq_hdr_name, sz_hdr_mz,
- SOCKET_ID_ANY, 0,
- RTE_CACHE_LINE_SIZE);
+ hdr_mz = rte_memzone_reserve_aligned_contig(vq_hdr_name,
+ sz_hdr_mz, SOCKET_ID_ANY, 0,
+ RTE_CACHE_LINE_SIZE);
if (hdr_mz == NULL) {
if (rte_errno == EEXIST)
hdr_mz = rte_memzone_lookup(vq_hdr_name);
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:34 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3:
- Moved patch earlier in the patchset
- Allowed experimental API in Makefile

drivers/net/vmxnet3/Makefile | 3 +++
drivers/net/vmxnet3/vmxnet3_ethdev.c | 7 ++++---
2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/net/vmxnet3/Makefile b/drivers/net/vmxnet3/Makefile
index 6bfbf01..7f76086 100644
--- a/drivers/net/vmxnet3/Makefile
+++ b/drivers/net/vmxnet3/Makefile
@@ -45,6 +45,9 @@ EXPORT_MAP := rte_pmd_vmxnet3_version.map

LIBABIVER := 1

+# contiguous memzone reserve API are not yet stable
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
#
# all source are stored in SRCS-y
#
diff --git a/drivers/net/vmxnet3/vmxnet3_ethdev.c b/drivers/net/vmxnet3/vmxnet3_ethdev.c
index 4260087..3f323a0 100644
--- a/drivers/net/vmxnet3/vmxnet3_ethdev.c
+++ b/drivers/net/vmxnet3/vmxnet3_ethdev.c
@@ -149,14 +149,15 @@ gpa_zone_reserve(struct rte_eth_dev *dev, uint32_t size,
if (!reuse) {
if (mz)
rte_memzone_free(mz);
- return rte_memzone_reserve_aligned(z_name, size, socket_id,
- 0, align);
+ return rte_memzone_reserve_aligned_contig(z_name, size,
+ socket_id, 0, align);
}

if (mz)
return mz;

- return rte_memzone_reserve_aligned(z_name, size, socket_id, 0, align);
+ return rte_memzone_reserve_aligned_contig(z_name, size, socket_id, 0,
+ align);
}

/*
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:31 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3:
- Moved patch earlier in the patchset
- Allowed experimental API in the build system

drivers/net/i40e/Makefile | 3 +++
drivers/net/i40e/i40e_ethdev.c | 2 +-
drivers/net/i40e/i40e_rxtx.c | 2 +-
drivers/net/i40e/meson.build | 3 +++
4 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/net/i40e/Makefile b/drivers/net/i40e/Makefile
index 5663f5b..bbc33b8 100644
--- a/drivers/net/i40e/Makefile
+++ b/drivers/net/i40e/Makefile
@@ -19,6 +19,9 @@ EXPORT_MAP := rte_pmd_i40e_version.map

LIBABIVER := 2

+# contiguous memzone reserve API are not yet stable
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
#
# Add extra flags for base driver files (also known as shared code)
# to disable warnings
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index d0bf4e3..6d72726 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -4053,7 +4053,7 @@ i40e_allocate_dma_mem_d(__attribute__((unused)) struct i40e_hw *hw,
return I40E_ERR_PARAM;

snprintf(z_name, sizeof(z_name), "i40e_dma_%"PRIu64, rte_rand());
- mz = rte_memzone_reserve_bounded(z_name, size, SOCKET_ID_ANY, 0,
+ mz = rte_memzone_reserve_bounded_contig(z_name, size, SOCKET_ID_ANY, 0,
alignment, RTE_PGSIZE_2M);
if (!mz)
return I40E_ERR_NO_MEMORY;
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 1217e5a..6b2b40e 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -2189,7 +2189,7 @@ i40e_memzone_reserve(const char *name, uint32_t len, int socket_id)
if (mz)
return mz;

- mz = rte_memzone_reserve_aligned(name, len,
+ mz = rte_memzone_reserve_aligned_contig(name, len,
socket_id, 0, I40E_RING_BASE_ALIGN);
return mz;
}
diff --git a/drivers/net/i40e/meson.build b/drivers/net/i40e/meson.build
index 197e611..e418791 100644
--- a/drivers/net/i40e/meson.build
+++ b/drivers/net/i40e/meson.build
@@ -46,3 +46,6 @@ endif
includes += include_directories('base')

install_headers('rte_pmd_i40e.h')
+
+# contig memzone allocation is not yet part of stable API
+allow_experimental_apis = true
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:38 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
drivers/bus/fslmc/fslmc_vfio.c | 78 ++++++++++++++++++++++--------------------
drivers/event/dpaa2/Makefile | 3 ++
drivers/mempool/dpaa2/Makefile | 3 ++
drivers/net/dpaa2/Makefile | 3 ++
drivers/net/dpaa2/meson.build | 3 ++
drivers/net/octeontx/Makefile | 3 ++
6 files changed, 56 insertions(+), 37 deletions(-)

diff --git a/drivers/bus/fslmc/fslmc_vfio.c b/drivers/bus/fslmc/fslmc_vfio.c
index 1310190..ccdbeff 100644
--- a/drivers/bus/fslmc/fslmc_vfio.c
+++ b/drivers/bus/fslmc/fslmc_vfio.c
@@ -193,17 +193,51 @@ static int vfio_map_irq_region(struct fslmc_vfio_group *group)
return -errno;
}

-int rte_fslmc_vfio_dmamap(void)
+static int
+fslmc_vfio_map(const struct rte_memseg *ms, void *arg)
{
- int ret;
+ int *n_segs = arg;
struct fslmc_vfio_group *group;
struct vfio_iommu_type1_dma_map dma_map = {
.argsz = sizeof(struct vfio_iommu_type1_dma_map),
.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
};
+ int ret;
+
+ dma_map.size = ms->len;
+ dma_map.vaddr = ms->addr_64;
+#ifdef RTE_LIBRTE_DPAA2_USE_PHYS_IOVA
+ dma_map.iova = ms->iova;
+#else
+ dma_map.iova = dma_map.vaddr;
+#endif
+
+ /* SET DMA MAP for IOMMU */
+ group = &vfio_group;
+
+ if (!group->container) {
+ FSLMC_VFIO_LOG(ERR, "Container is not connected ");
+ return -1;
+ }
+
+ FSLMC_VFIO_LOG(DEBUG, "-->Initial SHM Virtual ADDR %llX",
+ dma_map.vaddr);
+ FSLMC_VFIO_LOG(DEBUG, "-----> DMA size 0x%llX", dma_map.size);
+ ret = ioctl(group->container->fd, VFIO_IOMMU_MAP_DMA,
+ &dma_map);
+ if (ret) {
+ FSLMC_VFIO_LOG(ERR, "VFIO_IOMMU_MAP_DMA API(errno = %d)",
+ errno);
+ return -1;
+ }
+ (*n_segs)++;
+ return 0;
+}

- int i;
+int rte_fslmc_vfio_dmamap(void)
+{
const struct rte_memseg *memseg;
+ int i = 0;

if (is_dma_done)
return 0;
@@ -214,51 +248,21 @@ int rte_fslmc_vfio_dmamap(void)
return -ENODEV;
}

- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if (memseg[i].addr == NULL && memseg[i].len == 0) {
- FSLMC_VFIO_LOG(DEBUG, "Total %d segments found.", i);
- break;
- }
-
- dma_map.size = memseg[i].len;
- dma_map.vaddr = memseg[i].addr_64;
-#ifdef RTE_LIBRTE_DPAA2_USE_PHYS_IOVA
- dma_map.iova = memseg[i].iova;
-#else
- dma_map.iova = dma_map.vaddr;
-#endif
-
- /* SET DMA MAP for IOMMU */
- group = &vfio_group;
-
- if (!group->container) {
- FSLMC_VFIO_LOG(ERR, "Container is not connected ");
- return -1;
- }
-
- FSLMC_VFIO_LOG(DEBUG, "-->Initial SHM Virtual ADDR %llX",
- dma_map.vaddr);
- FSLMC_VFIO_LOG(DEBUG, "-----> DMA size 0x%llX", dma_map.size);
- ret = ioctl(group->container->fd, VFIO_IOMMU_MAP_DMA,
- &dma_map);
- if (ret) {
- FSLMC_VFIO_LOG(ERR, "VFIO_IOMMU_MAP_DMA API(errno = %d)",
- errno);
- return ret;
- }
- }
+ if (rte_memseg_walk(fslmc_vfio_map, &i) < 0)
+ return -1;

/* Verifying that at least single segment is available */
if (i <= 0) {
FSLMC_VFIO_LOG(ERR, "No Segments found for VFIO Mapping");
return -1;
}
+ FSLMC_VFIO_LOG(DEBUG, "Total %d segments found.", i);

/* TODO - This is a W.A. as VFIO currently does not add the mapping of
* the interrupt region to SMMU. This should be removed once the
* support is added in the Kernel.
*/
- vfio_map_irq_region(group);
+ vfio_map_irq_region(&vfio_group);

is_dma_done = 1;

diff --git a/drivers/event/dpaa2/Makefile b/drivers/event/dpaa2/Makefile
index b26862c..a5b68b4 100644
--- a/drivers/event/dpaa2/Makefile
+++ b/drivers/event/dpaa2/Makefile
@@ -28,6 +28,9 @@ EXPORT_MAP := rte_pmd_dpaa2_event_version.map

LIBABIVER := 1

+# depends on fslmc bus which uses experimental API
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
#
# all source are stored in SRCS-y
#
diff --git a/drivers/mempool/dpaa2/Makefile b/drivers/mempool/dpaa2/Makefile
index efaac96..c1cc2a3 100644
--- a/drivers/mempool/dpaa2/Makefile
+++ b/drivers/mempool/dpaa2/Makefile
@@ -27,6 +27,9 @@ EXPORT_MAP := rte_mempool_dpaa2_version.map
# Lbrary version
LIBABIVER := 1

+# depends on fslmc bus which uses experimental API
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
# all source are stored in SRCS-y
#
SRCS-$(CONFIG_RTE_LIBRTE_DPAA2_MEMPOOL) += dpaa2_hw_mempool.c
diff --git a/drivers/net/dpaa2/Makefile b/drivers/net/dpaa2/Makefile
index 068e9d3..cc5627c 100644
--- a/drivers/net/dpaa2/Makefile
+++ b/drivers/net/dpaa2/Makefile
@@ -33,6 +33,9 @@ EXPORT_MAP := rte_pmd_dpaa2_version.map
# library version
LIBABIVER := 1

+# depends on fslmc bus which uses experimental API
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
SRCS-$(CONFIG_RTE_LIBRTE_DPAA2_PMD) += base/dpaa2_hw_dpni.c
SRCS-$(CONFIG_RTE_LIBRTE_DPAA2_PMD) += dpaa2_rxtx.c
SRCS-$(CONFIG_RTE_LIBRTE_DPAA2_PMD) += dpaa2_ethdev.c
diff --git a/drivers/net/dpaa2/meson.build b/drivers/net/dpaa2/meson.build
index ad1724d..8e96b5a 100644
--- a/drivers/net/dpaa2/meson.build
+++ b/drivers/net/dpaa2/meson.build
@@ -13,3 +13,6 @@ sources = files('base/dpaa2_hw_dpni.c',
'mc/dpni.c')

includes += include_directories('base', 'mc')
+
+# depends on fslmc bus which uses experimental API
+allow_experimental_apis = true
diff --git a/drivers/net/octeontx/Makefile b/drivers/net/octeontx/Makefile
index 3e4a106..5f488b9 100644
--- a/drivers/net/octeontx/Makefile
+++ b/drivers/net/octeontx/Makefile
@@ -16,6 +16,9 @@ EXPORT_MAP := rte_pmd_octeontx_version.map

LIBABIVER := 1

+# depends on fslmc bus which uses experimental API
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
OBJS_BASE_DRIVER=$(patsubst %.c,%.o,$(notdir $(wildcard $(SRCDIR)/base/*.c)))
$(foreach obj, $(OBJS_BASE_DRIVER), $(eval CFLAGS_$(obj)+=$(CFLAGS_BASE_DRIVER)))
--
2.7.4
Shreyansh Jain
2018-04-05 14:06:12 UTC
Permalink
Hello Anatoly,
Post by Anatoly Burakov
---
drivers/bus/fslmc/fslmc_vfio.c | 78 ++++++++++++++++++++++--------------------
drivers/event/dpaa2/Makefile | 3 ++
drivers/mempool/dpaa2/Makefile | 3 ++
drivers/net/dpaa2/Makefile | 3 ++
drivers/net/dpaa2/meson.build | 3 ++
drivers/net/octeontx/Makefile | 3 ++
6 files changed, 56 insertions(+), 37 deletions(-)
diff --git a/drivers/bus/fslmc/fslmc_vfio.c b/drivers/bus/fslmc/fslmc_vfio.c
index 1310190..ccdbeff 100644
--- a/drivers/bus/fslmc/fslmc_vfio.c
+++ b/drivers/bus/fslmc/fslmc_vfio.c
@@ -193,17 +193,51 @@ static int vfio_map_irq_region(struct fslmc_vfio_group *group)
return -errno;
[...]

I will send an incremental patch, in reply to this, which fixes dpaa2
for the VA cases.

Though, I think this patch can be completely replaced by that one - if you
prefer that, let me know and I will send it as a non-incremental patch
(based on master).
Post by Anatoly Burakov
diff --git a/drivers/net/dpaa2/meson.build b/drivers/net/dpaa2/meson.build
index ad1724d..8e96b5a 100644
--- a/drivers/net/dpaa2/meson.build
+++ b/drivers/net/dpaa2/meson.build
@@ -13,3 +13,6 @@ sources = files('base/dpaa2_hw_dpni.c',
'mc/dpni.c')
includes += include_directories('base', 'mc')
+
+# depends on fslmc bus which uses experimental API
+allow_experimental_apis = true
diff --git a/drivers/net/octeontx/Makefile b/drivers/net/octeontx/Makefile
index 3e4a106..5f488b9 100644
--- a/drivers/net/octeontx/Makefile
+++ b/drivers/net/octeontx/Makefile
@@ -16,6 +16,9 @@ EXPORT_MAP := rte_pmd_octeontx_version.map
LIBABIVER := 1
+# depends on fslmc bus which uses experimental API
I think you wanted to say "octeontx" rather than fslmc here. Also, this
is not part of the 'bus/fslmc' patch.
Post by Anatoly Burakov
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
OBJS_BASE_DRIVER=$(patsubst %.c,%.o,$(notdir $(wildcard $(SRCDIR)/base/*.c)))
$(foreach obj, $(OBJS_BASE_DRIVER), $(eval CFLAGS_$(obj)+=$(CFLAGS_BASE_DRIVER)))
If the Octeon part is removed from above, and incremental patch is
merged here, please use my Ack:

Acked-by: Shreyansh Jain <***@nxp.com>
Shreyansh Jain
2018-04-05 14:14:14 UTC
Permalink
Restructure VFIO DMA code for handling hotplug memory events
(callbacks) and --legacy case.

Signed-off-by: Shreyansh Jain <***@nxp.com>
---

###
This is based on the 16fbfef04a3 github repository. This is assuming
that changes already exists as done in patch 26/68.
Though, this can be a standalone, replacing 26/88. Though, the Makefile
changes don't exist in this.
Also, this just a first draft. I will push any review changes after this
incrementally over v4.
###

drivers/bus/fslmc/fslmc_bus.c | 15 ++++
drivers/bus/fslmc/fslmc_vfio.c | 161 +++++++++++++++++++++++++++++++++++----
drivers/bus/fslmc/fslmc_vfio.h | 1 +
drivers/net/dpaa2/dpaa2_ethdev.c | 1 -
4 files changed, 163 insertions(+), 15 deletions(-)

diff --git a/drivers/bus/fslmc/fslmc_bus.c b/drivers/bus/fslmc/fslmc_bus.c
index 5ee0beb85..50884ff3a 100644
--- a/drivers/bus/fslmc/fslmc_bus.c
+++ b/drivers/bus/fslmc/fslmc_bus.c
@@ -266,6 +266,21 @@ rte_fslmc_probe(void)
return 0;
}

+ if (rte_log_get_global_level() >= RTE_LOG_DEBUG)
+ rte_dump_physmem_layout(stdout);
+
+ /* Map existing segments as well as, in case of hotpluggable memory,
+ * install callback handler.
+ */
+ ret = rte_fslmc_vfio_dmamap();
+ if (ret) {
+ FSLMC_BUS_LOG(ERR, "Unable to DMA map existing VAs: (%d)",
+ ret);
+ /* Not continuing ahead */
+ FSLMC_BUS_LOG(ERR, "FSLMC VFIO Mapping failed");
+ return 0;
+ }
+
ret = fslmc_vfio_process_group();
if (ret) {
FSLMC_BUS_LOG(ERR, "Unable to setup devices %d", ret);
diff --git a/drivers/bus/fslmc/fslmc_vfio.c b/drivers/bus/fslmc/fslmc_vfio.c
index 31831e3ce..60725fb70 100644
--- a/drivers/bus/fslmc/fslmc_vfio.c
+++ b/drivers/bus/fslmc/fslmc_vfio.c
@@ -30,6 +30,7 @@
#include <rte_kvargs.h>
#include <rte_dev.h>
#include <rte_bus.h>
+#include <rte_eal_memconfig.h>

#include "rte_fslmc.h"
#include "fslmc_vfio.h"
@@ -193,11 +194,68 @@ static int vfio_map_irq_region(struct fslmc_vfio_group *group)
return -errno;
}

+static int fslmc_map_dma(uint64_t vaddr, rte_iova_t iovaddr, size_t len);
+static int fslmc_unmap_dma(uint64_t vaddr, rte_iova_t iovaddr, size_t len);
+
+static void
+fslmc_memevent_cb(enum rte_mem_event type, const void *addr, size_t len)
+{
+ struct rte_memseg_list *msl;
+ struct rte_memseg *ms;
+ size_t cur_len = 0, map_len = 0;
+ uint64_t pgsz, virt_addr;
+ rte_iova_t iova_addr;
+ int ret;
+
+ msl = rte_mem_virt2memseg_list(addr);
+ pgsz = msl->page_sz;
+
+ while (cur_len < len) {
+ const void *va = RTE_PTR_ADD(addr, cur_len);
+
+ ms = rte_mem_virt2memseg(va, msl);
+ iova_addr = rte_mem_virt2iova(va);
+ virt_addr = ms->addr_64;
+
+ map_len = len - cur_len;
+ /* Can only work with max pg_sz */
+ if (map_len > pgsz)
+ map_len = pgsz;
+
+ FSLMC_VFIO_LOG(DEBUG, "Calling with type=%d, va=%p,"
+ " virt_addr=%lX, iova=%lu, map_len=%lu\n",
+ type, va, virt_addr, iova_addr, map_len);
+
+ if (type == RTE_MEM_EVENT_ALLOC)
+ ret = fslmc_map_dma(virt_addr, iova_addr, map_len);
+ else
+ ret = fslmc_unmap_dma(virt_addr, iova_addr, map_len);
+
+ if (ret != 0) {
+ FSLMC_VFIO_LOG(ERR, "DMA Mapping/Unmapping failed. "
+ "Map=%d, addr=%p, len=%lu, err:(%d)",
+ type, va, map_len, ret);
+ return;
+ }
+
+ cur_len += map_len;
+ }
+
+ if (type == RTE_MEM_EVENT_ALLOC)
+ FSLMC_VFIO_LOG(DEBUG, "Total Mapped: addr=%p, len=%lu\n",
+ addr, len);
+ else
+ FSLMC_VFIO_LOG(DEBUG, "Total Unmapped: addr=%p, len=%lu\n",
+ addr, len);
+}
+
static int
-fslmc_vfio_map(const struct rte_memseg_list *msl __rte_unused,
- const struct rte_memseg *ms, void *arg)
+#ifdef RTE_LIBRTE_DPAA2_USE_PHYS_IOVA
+fslmc_map_dma(uint64_t vaddr, rte_iova_t iovaddr, size_t len)
+#else
+fslmc_map_dma(uint64_t vaddr, rte_iova_t iovaddr __rte_unused, size_t len)
+#endif
{
- int *n_segs = arg;
struct fslmc_vfio_group *group;
struct vfio_iommu_type1_dma_map dma_map = {
.argsz = sizeof(struct vfio_iommu_type1_dma_map),
@@ -205,10 +263,11 @@ fslmc_vfio_map(const struct rte_memseg_list *msl __rte_unused,
};
int ret;

- dma_map.size = ms->len;
- dma_map.vaddr = ms->addr_64;
+ dma_map.size = len;
+ dma_map.vaddr = vaddr;
+
#ifdef RTE_LIBRTE_DPAA2_USE_PHYS_IOVA
- dma_map.iova = ms->iova;
+ dma_map.iova = iovaddr;
#else
dma_map.iova = dma_map.vaddr;
#endif
@@ -221,33 +280,102 @@ fslmc_vfio_map(const struct rte_memseg_list *msl __rte_unused,
return -1;
}

- FSLMC_VFIO_LOG(DEBUG, "-->Initial SHM Virtual ADDR %llX",
- dma_map.vaddr);
- FSLMC_VFIO_LOG(DEBUG, "-----> DMA size 0x%llX", dma_map.size);
- ret = ioctl(group->container->fd, VFIO_IOMMU_MAP_DMA,
- &dma_map);
+ FSLMC_VFIO_LOG(DEBUG, "--> Map address: %llX, size: 0x%llX\n",
+ dma_map.vaddr, dma_map.size);
+ ret = ioctl(group->container->fd, VFIO_IOMMU_MAP_DMA, &dma_map);
if (ret) {
FSLMC_VFIO_LOG(ERR, "VFIO_IOMMU_MAP_DMA API(errno = %d)",
errno);
return -1;
}
- (*n_segs)++;
+
return 0;
}

+static int
+fslmc_unmap_dma(uint64_t vaddr, uint64_t iovaddr __rte_unused, size_t len)
+{
+ struct fslmc_vfio_group *group;
+ struct vfio_iommu_type1_dma_unmap dma_unmap = {
+ .argsz = sizeof(struct vfio_iommu_type1_dma_unmap),
+ .flags = 0,
+ };
+ int ret;
+
+ dma_unmap.size = len;
+ dma_unmap.iova = vaddr;
+
+ /* SET DMA MAP for IOMMU */
+ group = &vfio_group;
+
+ if (!group->container) {
+ FSLMC_VFIO_LOG(ERR, "Container is not connected ");
+ return -1;
+ }
+
+ FSLMC_VFIO_LOG(DEBUG, "--> Unmap address: %llX, size: 0x%llX\n",
+ dma_unmap.iova, dma_unmap.size);
+ ret = ioctl(group->container->fd, VFIO_IOMMU_UNMAP_DMA, &dma_unmap);
+ if (ret) {
+ FSLMC_VFIO_LOG(ERR, "VFIO_IOMMU_UNMAP_DMA API(errno = %d)",
+ errno);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+fslmc_dmamap_seg(const struct rte_memseg_list *msl __rte_unused,
+ const struct rte_memseg *ms, void *arg)
+{
+ int *n_segs = arg;
+ int ret;
+
+ ret = fslmc_map_dma(ms->addr_64, ms->iova, ms->len);
+ if (ret)
+ FSLMC_VFIO_LOG(ERR, "Unable to VFIO map (addr=%p, len=%lu)\n",
+ ms->addr, ms->len);
+ else
+ (*n_segs)++;
+
+ return ret;
+}
+
int rte_fslmc_vfio_dmamap(void)
{
- int i = 0;
+ int i = 0, ret;
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ rte_rwlock_t *mem_lock = &mcfg->memory_hotplug_lock;

if (is_dma_done)
return 0;

- if (rte_memseg_walk(fslmc_vfio_map, &i) < 0)
+ /* Lock before parsing and registering callback to memory subsystem */
+ rte_rwlock_read_lock(mem_lock);
+
+ if (rte_memseg_walk(fslmc_dmamap_seg, &i) < 0) {
+ rte_rwlock_read_unlock(mem_lock);
return -1;
+ }
+
+ ret = rte_mem_event_callback_register("fslmc_memevent_clb",
+ fslmc_memevent_cb);
+ if (ret)
+ /* Cannot install callback handler - thus, DMA Mapping cannot
+ * be done. But, this is possible for --legacy-mem option
+ */
+ FSLMC_VFIO_LOG(DEBUG, "Unable to install memory handler");

+ FSLMC_VFIO_LOG(DEBUG, "Installed memory callback handler");
/* Verifying that at least single segment is available */
if (i <= 0) {
+ /* TODO: Is this verification required any more? What would
+ * happen to non-legacy case where nothing was preallocated
+ * thus causing i==0?
+ */
FSLMC_VFIO_LOG(ERR, "No Segments found for VFIO Mapping");
+ rte_rwlock_read_unlock(mem_lock);
return -1;
}
FSLMC_VFIO_LOG(DEBUG, "Total %d segments found.", i);
@@ -258,6 +386,11 @@ int rte_fslmc_vfio_dmamap(void)
*/
vfio_map_irq_region(&vfio_group);

+ /* Existing segments have been mapped and memory callback for hotplug
+ * has been installed.
+ */
+ rte_rwlock_read_unlock(mem_lock);
+
is_dma_done = 1;

return 0;
diff --git a/drivers/bus/fslmc/fslmc_vfio.h b/drivers/bus/fslmc/fslmc_vfio.h
index e8fb3445f..e77e4c4ac 100644
--- a/drivers/bus/fslmc/fslmc_vfio.h
+++ b/drivers/bus/fslmc/fslmc_vfio.h
@@ -9,6 +9,7 @@
#define _FSLMC_VFIO_H_

#include <rte_vfio.h>
+#include <rte_memory.h>

#include "eal_vfio.h"

diff --git a/drivers/net/dpaa2/dpaa2_ethdev.c b/drivers/net/dpaa2/dpaa2_ethdev.c
index 2fb7b2da7..962b90035 100644
--- a/drivers/net/dpaa2/dpaa2_ethdev.c
+++ b/drivers/net/dpaa2/dpaa2_ethdev.c
@@ -1850,7 +1850,6 @@ dpaa2_dev_init(struct rte_eth_dev *eth_dev)

eth_dev->rx_pkt_burst = dpaa2_dev_prefetch_rx;
eth_dev->tx_pkt_burst = dpaa2_dev_tx;
- rte_fslmc_vfio_dmamap();

RTE_LOG(INFO, PMD, "%s: netdev created\n", eth_dev->data->name);
return 0;
--
2.14.1
Burakov, Anatoly
2018-04-08 17:14:40 UTC
Permalink
Post by Shreyansh Jain
Restructure VFIO DMA code for handling hotplug memory events
(callbacks) and --legacy case.
---
###
This is based on the 16fbfef04a3 github repository. This is assuming
that changes already exists as done in patch 26/68.
Though, this can be a standalone, replacing 26/88. Though, the Makefile
changes don't exist in this.
Also, this just a first draft. I will push any review changes after this
incrementally over v4.
###
Hi Shreyansh,

I think we can keep the 26/68 as it still works within the context of
the patchset. I would like to add these changes closer to the end, where
we enable support for callbacks in VFIO (this could/should come as the
next patch).

That said, I took some liberties when integrating this patch, hopefully
for the better. I know you mentioned it's a draft, so you can post any
comments for the inevitable v4 :)
Post by Shreyansh Jain
drivers/bus/fslmc/fslmc_bus.c | 15 ++++
drivers/bus/fslmc/fslmc_vfio.c | 161 +++++++++++++++++++++++++++++++++++----
drivers/bus/fslmc/fslmc_vfio.h | 1 +
drivers/net/dpaa2/dpaa2_ethdev.c | 1 -
4 files changed, 163 insertions(+), 15 deletions(-)
diff --git a/drivers/bus/fslmc/fslmc_bus.c b/drivers/bus/fslmc/fslmc_bus.c
index 5ee0beb85..50884ff3a 100644
--- a/drivers/bus/fslmc/fslmc_bus.c
+++ b/drivers/bus/fslmc/fslmc_bus.c
@@ -266,6 +266,21 @@ rte_fslmc_probe(void)
return 0;
}
+ if (rte_log_get_global_level() >= RTE_LOG_DEBUG)
+ rte_dump_physmem_layout(stdout);
Presumably, this is not needed - just debug leftovers?
Post by Shreyansh Jain
+
+ /* Map existing segments as well as, in case of hotpluggable memory,
+ * install callback handler.
+ */
+ ret = rte_fslmc_vfio_dmamap();
+ if (ret) {
+ FSLMC_BUS_LOG(ERR, "Unable to DMA map existing VAs: (%d)",
+ ret);
+ /* Not continuing ahead */
+ FSLMC_BUS_LOG(ERR, "FSLMC VFIO Mapping failed");
+ return 0;
+ }
+
What happens if there are no devices on the bus that can be used by
DPDK? As far as I can tell, it would return an error, which may or may not
be desirable (failing to map anything because there aren't any fslmc
devices is not an error?).

For "regular" VFIO, the container is an empty shell unless you add
groups into it - does fslmc VFIO support work differently, and container
is already working/initialized by the time we reach this point?

Anyway, i'll leave this as is.
Post by Shreyansh Jain
ret = fslmc_vfio_process_group();
if (ret) {
FSLMC_BUS_LOG(ERR, "Unable to setup devices %d", ret);
diff --git a/drivers/bus/fslmc/fslmc_vfio.c b/drivers/bus/fslmc/fslmc_vfio.c
index 31831e3ce..60725fb70 100644
--- a/drivers/bus/fslmc/fslmc_vfio.c
+++ b/drivers/bus/fslmc/fslmc_vfio.c
@@ -30,6 +30,7 @@
#include <rte_kvargs.h>
#include <rte_dev.h>
#include <rte_bus.h>
+#include <rte_eal_memconfig.h>
<...>
Post by Shreyansh Jain
+}
+
static int
-fslmc_vfio_map(const struct rte_memseg_list *msl __rte_unused,
- const struct rte_memseg *ms, void *arg)
+#ifdef RTE_LIBRTE_DPAA2_USE_PHYS_IOVA
+fslmc_map_dma(uint64_t vaddr, rte_iova_t iovaddr, size_t len)
+#else
+fslmc_map_dma(uint64_t vaddr, rte_iova_t iovaddr __rte_unused, size_t len)
+#endif
I think I'll leave this as just "rte_iova_t iovaddr __rte_unused" :)
Post by Shreyansh Jain
{
- int *n_segs = arg;
struct fslmc_vfio_group *group;
struct vfio_iommu_type1_dma_map dma_map = {
.argsz = sizeof(struct vfio_iommu_type1_dma_map),
@@ -205,10 +263,11 @@ fslmc_vfio_map(const struct rte_memseg_list *msl __rte_unused,
};
int ret;
- dma_map.size = ms->len;
- dma_map.vaddr = ms->addr_64;
+ dma_map.size = len;
+ dma_map.vaddr = vaddr;
<...>
Post by Shreyansh Jain
if (is_dma_done)
return 0;
I suspect this check was needed because you've done VFIO mapping on
device probe as opposed to bus probe, so VFIO mapping function could've
been called multiple times. Is that still the case, or is this check no
longer needed? I took the liberty to remove it.
Post by Shreyansh Jain
- if (rte_memseg_walk(fslmc_vfio_map, &i) < 0)
+ /* Lock before parsing and registering callback to memory subsystem */
+ rte_rwlock_read_lock(mem_lock);
+
<...>
Post by Shreyansh Jain
return 0;
diff --git a/drivers/bus/fslmc/fslmc_vfio.h b/drivers/bus/fslmc/fslmc_vfio.h
index e8fb3445f..e77e4c4ac 100644
--- a/drivers/bus/fslmc/fslmc_vfio.h
+++ b/drivers/bus/fslmc/fslmc_vfio.h
@@ -9,6 +9,7 @@
#define _FSLMC_VFIO_H_
#include <rte_vfio.h>
+#include <rte_memory.h>
#include "eal_vfio.h"
I suspect this change is not needed? I took the liberty to remove it.
--
Thanks,
Anatoly
Shreyansh Jain
2018-04-09 07:49:18 UTC
Permalink
Hello Anatoly,
Post by Burakov, Anatoly
Post by Shreyansh Jain
Restructure VFIO DMA code for handling hotplug memory events
(callbacks) and --legacy case.
---
###
This is based on the 16fbfef04a3 github repository. This is assuming
that changes already exists as done in patch 26/68.
Though, this can be a standalone, replacing 26/88. Though, the Makefile
changes don't exist in this.
Also, this just a first draft. I will push any review changes after this
incrementally over v4.
###
Hi Shreyansh,
I think we can keep the 26/68 as it still works within the context of
the patchset. I would like to add these changes closer to the end, where
we enable support for callbacks in VFIO (this could/should come as the
next patch).
But then it would also mean that dpaa2 would be broken within the memory
hotplug patches?
I think it would be broken once the memseg ceases to be continuous
physical sets.
Post by Burakov, Anatoly
That said, I took some liberties when integrating this patch, hopefully
for the better. I know you mentioned it's a draft, so you can post any
comments for the inevitable v4 :)
Post by Shreyansh Jain
  drivers/bus/fslmc/fslmc_bus.c    |  15 ++++
  drivers/bus/fslmc/fslmc_vfio.c   | 161
+++++++++++++++++++++++++++++++++++----
  drivers/bus/fslmc/fslmc_vfio.h   |   1 +
  drivers/net/dpaa2/dpaa2_ethdev.c |   1 -
  4 files changed, 163 insertions(+), 15 deletions(-)
diff --git a/drivers/bus/fslmc/fslmc_bus.c
b/drivers/bus/fslmc/fslmc_bus.c
index 5ee0beb85..50884ff3a 100644
--- a/drivers/bus/fslmc/fslmc_bus.c
+++ b/drivers/bus/fslmc/fslmc_bus.c
@@ -266,6 +266,21 @@ rte_fslmc_probe(void)
          return 0;
      }
+    if (rte_log_get_global_level() >= RTE_LOG_DEBUG)
+        rte_dump_physmem_layout(stdout);
Presumably, this is not needed - just debug leftovers?
Yes, and can be removed.
Post by Burakov, Anatoly
Post by Shreyansh Jain
+
+    /* Map existing segments as well as, in case of hotpluggable memory,
+     * install callback handler.
+     */
+    ret = rte_fslmc_vfio_dmamap();
+    if (ret) {
+        FSLMC_BUS_LOG(ERR, "Unable to DMA map existing VAs: (%d)",
+                  ret);
+        /* Not continuing ahead */
+        FSLMC_BUS_LOG(ERR, "FSLMC VFIO Mapping failed");
+        return 0;
+    }
+
What happens if there are no devices on the bus that can be used by
DPDK? As far as I can tell, it would return an error, which may or may not
be desirable (failing to map anything because there aren't any fslmc
devices is not an error?).
## If no devices are found on the bus:
The call wouldn't have reached here. There is a jump out in rte_fslmc_probe
in case no devices were scanned on the bus.

--->8--- rte_fslmc_probe() ---
...
if (TAILQ_EMPTY(&rte_fslmc_bus.device_list))
return 0;
...
--->8---

## Assuming you are pointing to 'return 0':
So, rte_eal_scan/probe doesn't expect to be interrupted just because
one of the buses had issues (such as no devices being found, which is not
a real issue). It would continue ahead with scan/probe.

But, I think an error should be returned so that rte_eal_probe can log
the error and continue ahead normally. I will fix this.
Post by Burakov, Anatoly
For "regular" VFIO, the container is an empty shell unless you add
groups into it - does fslmc VFIO support work differently, and container
is already working/initialized by the time we reach this point?
The FSLMC VFIO container is not much different from the generic container.
But, at this point in the code, the container is already initialized (the
group is connected to it, and the relevant device fds are extracted).
The only things pending beyond this point are to look into the container and
initialize the various fslmc-specific devices contained *within* the group,
and the DMA mapping of regions, which is done using the newly introduced code.
Post by Burakov, Anatoly
Anyway, i'll leave this as is.
OK.
Post by Burakov, Anatoly
Post by Shreyansh Jain
      ret = fslmc_vfio_process_group();
      if (ret) {
          FSLMC_BUS_LOG(ERR, "Unable to setup devices %d", ret);
diff --git a/drivers/bus/fslmc/fslmc_vfio.c
b/drivers/bus/fslmc/fslmc_vfio.c
index 31831e3ce..60725fb70 100644
--- a/drivers/bus/fslmc/fslmc_vfio.c
+++ b/drivers/bus/fslmc/fslmc_vfio.c
@@ -30,6 +30,7 @@
  #include <rte_kvargs.h>
  #include <rte_dev.h>
  #include <rte_bus.h>
+#include <rte_eal_memconfig.h>
<...>
Post by Shreyansh Jain
+}
+
  static int
-fslmc_vfio_map(const struct rte_memseg_list *msl __rte_unused,
-        const struct rte_memseg *ms, void *arg)
+#ifdef RTE_LIBRTE_DPAA2_USE_PHYS_IOVA
+fslmc_map_dma(uint64_t vaddr, rte_iova_t iovaddr, size_t len)
+#else
+fslmc_map_dma(uint64_t vaddr, rte_iova_t iovaddr __rte_unused, size_t len)
+#endif
I think I'll leave this as just "rte_iova_t iovaddr __rte_unused" :)
Hmm, it is harmless if used without the enclosing #defines.
But, I don't know if any compiler has any side effects attached to this
- for example, clang reporting this as an error, etc.
Post by Burakov, Anatoly
Post by Shreyansh Jain
  {
-    int *n_segs = arg;
      struct fslmc_vfio_group *group;
      struct vfio_iommu_type1_dma_map dma_map = {
          .argsz = sizeof(struct vfio_iommu_type1_dma_map),
@@ -205,10 +263,11 @@ fslmc_vfio_map(const struct rte_memseg_list *msl __rte_unused,
      };
      int ret;
-    dma_map.size = ms->len;
-    dma_map.vaddr = ms->addr_64;
+    dma_map.size = len;
+    dma_map.vaddr = vaddr;
<...>
Post by Shreyansh Jain
      if (is_dma_done)
          return 0;
I suspect this check was needed because you've done VFIO mapping on
device probe as opposed to bus probe, so VFIO mapping function could've
been called multiple times. Is that still the case, or is this check no
longer needed? I took the liberty to remove it.
Yes, that is correct. Now that the VFIO mapping on device probe is disabled,
it is safe to remove this check.
Post by Burakov, Anatoly
Post by Shreyansh Jain
-    if (rte_memseg_walk(fslmc_vfio_map, &i) < 0)
+    /* Lock before parsing and registering callback to memory
subsystem */
+    rte_rwlock_read_lock(mem_lock);
+
<...>
Post by Shreyansh Jain
      return 0;
diff --git a/drivers/bus/fslmc/fslmc_vfio.h
b/drivers/bus/fslmc/fslmc_vfio.h
index e8fb3445f..e77e4c4ac 100644
--- a/drivers/bus/fslmc/fslmc_vfio.h
+++ b/drivers/bus/fslmc/fslmc_vfio.h
@@ -9,6 +9,7 @@
  #define _FSLMC_VFIO_H_
  #include <rte_vfio.h>
+#include <rte_memory.h>
  #include "eal_vfio.h"
I suspect this change is not needed? I took the liberty to remove it.
I remember that when I was using your initial patch, I added this because
some structures were unresolved. It is possible that is no longer the case
(and that I too have moved a lot of code beyond the first internal
draft). If it compiles OK, there is no need for this.

-
Shreyansh
Burakov, Anatoly
2018-04-09 15:49:52 UTC
Permalink
Post by Shreyansh Jain
Hello Anatoly,
Post by Burakov, Anatoly
Post by Shreyansh Jain
Restructure VFIO DMA code for handling hotplug memory events
(callbacks) and --legacy case.
---
###
This is based on the 16fbfef04a3 github repository. This is assuming
that changes already exists as done in patch 26/68.
Though, this can be a standalone, replacing 26/88. Though, the Makefile
changes don't exist in this.
Also, this just a first draft. I will push any review changes after this
incrementally over v4.
###
Hi Shreyansh,
I think we can keep the 26/68 as it still works within the context of
the patchset. I would like to add these changes closer to the end,
where we enable support for callbacks in VFIO (this could/should come
as the next patch).
But then it would also mean that dpaa2 would be broken within the memory
hotplug patches?
I think it would be broken once the memseg ceases to be continuous
physical sets.
Hi Shreyansh,

Why would it be broken? Even when memseg change comes into effect,
legacy mem is not enabled until much later in the patchset, when all of
the callback/multiprocess business is in place. For all intents and
purposes, this stays valid for legacy mode, hence not broken.

We later enable callbacks etc. on VFIO, but technically they still
aren't enabled until 65 (67 in v4), when it becomes possible to actually
run DPDK in non-legacy mode.

So, for the duration of this patchset, dpaa2 is not broken, as far as I
can tell. Keeping in mind that only legacy mode will be available until
patch 65/67, what exactly is being broken here?
--
Thanks,
Anatoly
Shreyansh Jain
2018-04-09 16:06:53 UTC
Permalink
-----Original Message-----
Sent: Monday, April 9, 2018 9:20 PM
Subject: Re: [dpdk-dev] [PATCH] bus/fslmc: support for hotplugging of
memory
Post by Shreyansh Jain
Hello Anatoly,
Post by Burakov, Anatoly
Post by Shreyansh Jain
Restructure VFIO DMA code for handling hotplug memory events
(callbacks) and --legacy case.
---
###
This is based on the 16fbfef04a3 github repository. This is assuming
that changes already exists as done in patch 26/68.
Though, this can be a standalone, replacing 26/88. Though, the
Makefile
Post by Shreyansh Jain
Post by Burakov, Anatoly
Post by Shreyansh Jain
changes don't exist in this.
Also, this just a first draft. I will push any review changes after
this
Post by Shreyansh Jain
Post by Burakov, Anatoly
Post by Shreyansh Jain
incrementally over v4.
###
Hi Shreyansh,
I think we can keep the 26/68 as it still works within the context of
the patchset. I would like to add these changes closer to the end,
where we enable support for callbacks in VFIO (this could/should come
as the next patch).
But then it would also mean that dpaa2 would be broken within the
memory
Post by Shreyansh Jain
hotplug patches?
I think it would be broken once the memseg ceases to be continuous
physical sets.
Hi Shreyansh,
Why would it be broken? Even when memseg change comes into effect,
legacy mem is not enabled until much later in the patchset, when all of
the callback/multiprocess business is in place. For all intents and
purposes, this stays valid for legacy mode, hence not broken.
Ok, then I am mistaken. I was under the impression that as soon as memseg lists are introduced and memsegs stop being contiguous blocks, DPAA2 would stop working. I just didn't put in the effort to check this.
We later enable callbacks etc. on VFIO, but technically they still
aren't enabled until 65 (67 in v4), when it becomes possible to actually
run DPDK in non-legacy mode.
Got it. Thanks.
So, for the duration of this patchset, dpaa2 is not broken, as far as i
can tell. Keeping in mind that only legacy mode will be available until
patch 65/67, what exactly is being broken here?
Nothing, just a misunderstanding.

Anatoly Burakov
2018-04-03 23:21:37 UTC
Permalink
For code that might need to iterate over the list of allocated
segments, using this API will make it more resilient to
internal API changes and will prevent copying the same
iteration code over and over again.

Additionally, down the line there will be locking implemented,
so users of this API will not need to care about locking
either.
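
For illustration, a minimal sketch of how a caller is expected to use the
walk API (the callback below mirrors the total-size helper converted later
in this set; the function names are illustrative):

#include <inttypes.h>
#include <stdio.h>
#include <rte_memory.h>

/* Callback: returning 0 continues the walk, >0 stops it, <0 reports an error. */
static int
sum_memseg_len(const struct rte_memseg *ms, void *arg)
{
	uint64_t *total = arg;

	*total += ms->len;
	return 0;
}

static void
print_total_mem(void)
{
	uint64_t total = 0;

	if (rte_memseg_walk(sum_memseg_len, &total) < 0)
		return;
	printf("total DPDK memory: %" PRIu64 " bytes\n", total);
}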

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/eal_common_memory.c | 21 +++++++++++++++++++++
lib/librte_eal/common/include/rte_memory.h | 25 +++++++++++++++++++++++++
lib/librte_eal/rte_eal_version.map | 1 +
3 files changed, 47 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index 5b8ced4..947db1f 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -218,6 +218,27 @@ rte_mem_lock_page(const void *virt)
return mlock((void *)aligned, page_size);
}

+int __rte_experimental
+rte_memseg_walk(rte_memseg_walk_t func, void *arg)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int i, ret;
+
+ for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+ const struct rte_memseg *ms = &mcfg->memseg[i];
+
+ if (ms->addr == NULL)
+ continue;
+
+ ret = func(ms, arg);
+ if (ret < 0)
+ return -1;
+ if (ret > 0)
+ return 1;
+ }
+ return 0;
+}
+
/* init memory subsystem */
int
rte_eal_memory_init(void)
diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index 302f865..93eadaa 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -20,6 +20,7 @@ extern "C" {
#endif

#include <rte_common.h>
+#include <rte_compat.h>
#include <rte_config.h>

__extension__
@@ -130,6 +131,30 @@ phys_addr_t rte_mem_virt2phy(const void *virt);
rte_iova_t rte_mem_virt2iova(const void *virt);

/**
+ * Memseg walk function prototype.
+ *
+ * Returning 0 will continue walk
+ * Returning 1 will stop the walk
+ * Returning -1 will stop the walk and report error
+ */
+typedef int (*rte_memseg_walk_t)(const struct rte_memseg *ms, void *arg);
+
+/**
+ * Walk list of all memsegs.
+ *
+ * @param func
+ * Iterator function
+ * @param arg
+ * Argument passed to iterator
+ * @return
+ * 0 if walked over the entire list
+ * 1 if stopped by the user
+ * -1 if user function reported error
+ */
+int __rte_experimental
+rte_memseg_walk(rte_memseg_walk_t func, void *arg);
+
+/**
* Get the layout of the available physical memory.
*
* It can be useful for an application to have the full physical
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 25e00de..7e9900d 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -223,6 +223,7 @@ EXPERIMENTAL {
rte_eal_mbuf_user_pool_ops;
rte_log_register_type_and_pick_level;
rte_malloc_dump_heaps;
+ rte_memseg_walk;
rte_memzone_reserve_contig;
rte_memzone_reserve_aligned_contig;
rte_memzone_reserve_bounded_contig;
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:35 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3:
- Added this patch

All memzone reserve calls then check physical addresses,
so this looks like they're reserving DMA memory.
Corrections welcome.

drivers/net/bnxt/Makefile | 3 +++
drivers/net/bnxt/bnxt_ethdev.c | 6 ++++--
drivers/net/bnxt/bnxt_ring.c | 3 ++-
drivers/net/bnxt/bnxt_vnic.c | 2 +-
4 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/net/bnxt/Makefile b/drivers/net/bnxt/Makefile
index 2aa0441..b443d29 100644
--- a/drivers/net/bnxt/Makefile
+++ b/drivers/net/bnxt/Makefile
@@ -42,6 +42,9 @@ EXPORT_MAP := rte_pmd_bnxt_version.map

LIBABIVER := 2

+# contiguous memzone reserve API are not yet stable
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
CFLAGS += -O3
CFLAGS += $(WERROR_FLAGS)
LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
diff --git a/drivers/net/bnxt/bnxt_ethdev.c b/drivers/net/bnxt/bnxt_ethdev.c
index 0b21653..5a7143c 100644
--- a/drivers/net/bnxt/bnxt_ethdev.c
+++ b/drivers/net/bnxt/bnxt_ethdev.c
@@ -3146,7 +3146,8 @@ bnxt_dev_init(struct rte_eth_dev *eth_dev)
total_alloc_len = RTE_CACHE_LINE_ROUNDUP(
sizeof(struct rx_port_stats) + 512);
if (!mz) {
- mz = rte_memzone_reserve(mz_name, total_alloc_len,
+ mz = rte_memzone_reserve_contig(mz_name,
+ total_alloc_len,
SOCKET_ID_ANY,
RTE_MEMZONE_2MB |
RTE_MEMZONE_SIZE_HINT_ONLY);
@@ -3181,7 +3182,8 @@ bnxt_dev_init(struct rte_eth_dev *eth_dev)
total_alloc_len = RTE_CACHE_LINE_ROUNDUP(
sizeof(struct tx_port_stats) + 512);
if (!mz) {
- mz = rte_memzone_reserve(mz_name, total_alloc_len,
+ mz = rte_memzone_reserve_contig(mz_name,
+ total_alloc_len,
SOCKET_ID_ANY,
RTE_MEMZONE_2MB |
RTE_MEMZONE_SIZE_HINT_ONLY);
diff --git a/drivers/net/bnxt/bnxt_ring.c b/drivers/net/bnxt/bnxt_ring.c
index 8fb8972..e8127de 100644
--- a/drivers/net/bnxt/bnxt_ring.c
+++ b/drivers/net/bnxt/bnxt_ring.c
@@ -165,7 +165,8 @@ int bnxt_alloc_rings(struct bnxt *bp, uint16_t qidx,
mz_name[RTE_MEMZONE_NAMESIZE - 1] = 0;
mz = rte_memzone_lookup(mz_name);
if (!mz) {
- mz = rte_memzone_reserve_aligned(mz_name, total_alloc_len,
+ mz = rte_memzone_reserve_aligned_contig(mz_name,
+ total_alloc_len,
SOCKET_ID_ANY,
RTE_MEMZONE_2MB |
RTE_MEMZONE_SIZE_HINT_ONLY,
diff --git a/drivers/net/bnxt/bnxt_vnic.c b/drivers/net/bnxt/bnxt_vnic.c
index d4aeb4c..611ce66 100644
--- a/drivers/net/bnxt/bnxt_vnic.c
+++ b/drivers/net/bnxt/bnxt_vnic.c
@@ -184,7 +184,7 @@ int bnxt_alloc_vnic_attributes(struct bnxt *bp)
mz_name[RTE_MEMZONE_NAMESIZE - 1] = 0;
mz = rte_memzone_lookup(mz_name);
if (!mz) {
- mz = rte_memzone_reserve(mz_name,
+ mz = rte_memzone_reserve_contig(mz_name,
entry_length * max_vnics,
SOCKET_ID_ANY,
RTE_MEMZONE_2MB |
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:29 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
Acked-by: Michal Krawczyk <***@semihalf.com>
---

Notes:
v3:
- Moved patch earlier in the patchset
- Allowed experimental API's in the Makefile

drivers/net/ena/Makefile | 3 +++
drivers/net/ena/base/ena_plat_dpdk.h | 7 ++++---
2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ena/Makefile b/drivers/net/ena/Makefile
index f9bfe05..01c3823 100644
--- a/drivers/net/ena/Makefile
+++ b/drivers/net/ena/Makefile
@@ -43,6 +43,9 @@ INCLUDES :=-I$(SRCDIR) -I$(SRCDIR)/base/ena_defs -I$(SRCDIR)/base
EXPORT_MAP := rte_pmd_ena_version.map
LIBABIVER := 1

+# contiguous memzone reserve API are not yet stable
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
VPATH += $(SRCDIR)/base
#
# all source are stored in SRCS-y
diff --git a/drivers/net/ena/base/ena_plat_dpdk.h b/drivers/net/ena/base/ena_plat_dpdk.h
index 8cba319..c1ebf00 100644
--- a/drivers/net/ena/base/ena_plat_dpdk.h
+++ b/drivers/net/ena/base/ena_plat_dpdk.h
@@ -188,7 +188,8 @@ typedef uint64_t dma_addr_t;
ENA_TOUCH(dmadev); ENA_TOUCH(handle); \
snprintf(z_name, sizeof(z_name), \
"ena_alloc_%d", ena_alloc_cnt++); \
- mz = rte_memzone_reserve(z_name, size, SOCKET_ID_ANY, 0); \
+ mz = rte_memzone_reserve_contig(z_name, \
+ size, SOCKET_ID_ANY, 0); \
memset(mz->addr, 0, size); \
virt = mz->addr; \
phys = mz->iova; \
@@ -206,7 +207,7 @@ typedef uint64_t dma_addr_t;
ENA_TOUCH(dmadev); ENA_TOUCH(dev_node); \
snprintf(z_name, sizeof(z_name), \
"ena_alloc_%d", ena_alloc_cnt++); \
- mz = rte_memzone_reserve(z_name, size, node, 0); \
+ mz = rte_memzone_reserve_contig(z_name, size, node, 0); \
memset(mz->addr, 0, size); \
virt = mz->addr; \
phys = mz->iova; \
@@ -219,7 +220,7 @@ typedef uint64_t dma_addr_t;
ENA_TOUCH(dmadev); ENA_TOUCH(dev_node); \
snprintf(z_name, sizeof(z_name), \
"ena_alloc_%d", ena_alloc_cnt++); \
- mz = rte_memzone_reserve(z_name, size, node, 0); \
+ mz = rte_memzone_reserve_contig(z_name, size, node, 0); \
memset(mz->addr, 0, size); \
virt = mz->addr; \
} while (0)
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:50 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
drivers/bus/fslmc/portal/dpaa2_hw_pvt.h | 13 +------------
1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h b/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
index 4a19d42..d38fc49 100644
--- a/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
+++ b/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
@@ -260,21 +260,10 @@ static void *dpaa2_mem_ptov(phys_addr_t paddr) __attribute__((unused));
/* todo - this is costly, need to write a fast coversion routine */
static void *dpaa2_mem_ptov(phys_addr_t paddr)
{
- const struct rte_memseg *memseg;
- int i;
-
if (dpaa2_virt_mode)
return (void *)(size_t)paddr;

- memseg = rte_eal_get_physmem_layout();
-
- for (i = 0; i < RTE_MAX_MEMSEG && memseg[i].addr_64 != 0; i++) {
- if (paddr >= memseg[i].iova &&
- paddr < memseg[i].iova + memseg[i].len)
- return (void *)(size_t)(memseg[i].addr_64
- + (paddr - memseg[i].iova));
- }
- return NULL;
+ return rte_mem_iova2virt(paddr);
}

static phys_addr_t dpaa2_mem_vtop(uint64_t vaddr) __attribute__((unused));
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:46 UTC
Permalink
For now, this function is identical to the memseg walk function,
but it is meant to walk over the first segment of each
VA-contiguous group of memsegs. The difference will become
apparent when memseg lists come into play.

Again, this is done so that there is less dependency on the
internals of the mem API and less noise in later change sets.
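
For illustration, a minimal sketch of the contig walk in use (function names
are illustrative; the callback receives the first memseg of each VA-contiguous
group plus the group's total length):

#include <stdio.h>
#include <rte_common.h>
#include <rte_memory.h>

static int
dump_contig_area(const struct rte_memseg *ms, size_t len, void *arg __rte_unused)
{
	/* ms->addr is the start of a VA-contiguous group; len spans the
	 * whole group, not just the first memseg. */
	printf("VA-contiguous area at %p, length %zu\n", ms->addr, len);
	return 0; /* 0 = continue, 1 = stop, -1 = stop and report error */
}

static void
dump_contig_areas(void)
{
	rte_memseg_contig_walk(dump_contig_area, NULL);
}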

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/eal_common_memory.c | 37 ++++++++++++++++++++++++++++++
lib/librte_eal/common/include/rte_memory.h | 27 ++++++++++++++++++++++
lib/librte_eal/rte_eal_version.map | 1 +
3 files changed, 65 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index 4f588c7..4b528b0 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -242,6 +242,43 @@ rte_memseg_walk(rte_memseg_walk_t func, void *arg)
return 0;
}

+int __rte_experimental
+rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int i, j, ret;
+
+ for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+ const struct rte_memseg *ms = &mcfg->memseg[i];
+ size_t total_len;
+ void *end_addr;
+
+ if (ms->addr == NULL)
+ continue;
+
+ end_addr = RTE_PTR_ADD(ms->addr, ms->len);
+
+ /* check how many more segments are contiguous to this one */
+ for (j = i + 1; j < RTE_MAX_MEMSEG; j++) {
+ const struct rte_memseg *next = &mcfg->memseg[j];
+
+ if (next->addr != end_addr)
+ break;
+
+ end_addr = RTE_PTR_ADD(next->addr, next->len);
+ i++;
+ }
+ total_len = RTE_PTR_DIFF(end_addr, ms->addr);
+
+ ret = func(ms, total_len, arg);
+ if (ret < 0)
+ return -1;
+ if (ret > 0)
+ return 1;
+ }
+ return 0;
+}
+
/* init memory subsystem */
int
rte_eal_memory_init(void)
diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index 93eadaa..45d067f 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -140,6 +140,18 @@ rte_iova_t rte_mem_virt2iova(const void *virt);
typedef int (*rte_memseg_walk_t)(const struct rte_memseg *ms, void *arg);

/**
+ * Memseg contig walk function prototype. This will trigger a callback on every
+ * VA-contiguous area starting at memseg ``ms``, so total valid VA space at each
+ * callback call will be [``ms->addr``, ``ms->addr + len``).
+ *
+ * Returning 0 will continue walk
+ * Returning 1 will stop the walk
+ * Returning -1 will stop the walk and report error
+ */
+typedef int (*rte_memseg_contig_walk_t)(const struct rte_memseg *ms,
+ size_t len, void *arg);
+
+/**
* Walk list of all memsegs.
*
* @param func
@@ -155,6 +167,21 @@ int __rte_experimental
rte_memseg_walk(rte_memseg_walk_t func, void *arg);

/**
+ * Walk each VA-contiguous area.
+ *
+ * @param func
+ * Iterator function
+ * @param arg
+ * Argument passed to iterator
+ * @return
+ * 0 if walked over the entire list
+ * 1 if stopped by the user
+ * -1 if user function reported error
+ */
+int __rte_experimental
+rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg);
+
+/**
* Get the layout of the available physical memory.
*
* It can be useful for an application to have the full physical
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 7e9900d..8409a3a 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -223,6 +223,7 @@ EXPERIMENTAL {
rte_eal_mbuf_user_pool_ops;
rte_log_register_type_and_pick_level;
rte_malloc_dump_heaps;
+ rte_memseg_contig_walk;
rte_memseg_walk;
rte_memzone_reserve_contig;
rte_memzone_reserve_aligned_contig;
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:36 UTC
Permalink
If a user has specified that the zone should have contiguous memory,
use the new _contig allocation APIs instead of the normal ones.
Otherwise, account for the fact that, unless we're in IOVA_AS_VA
mode, we cannot guarantee that the pages will be physically
contiguous, so we calculate the memzone size and alignment as if
we were getting the smallest page size available.

The existing mempool size calculation function also doesn't give us
the expected results, because it returns memzone sizes aligned to
page size (e.g. a 1MB mempool would reserve an entire 1GB page if
all the user has are 1GB pages), so add a new one that gives
results more in line with what we would expect.
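
As a rough illustration of the sizing pitfall described above (the element
count and size below are assumed values, not taken from the patch):

#include <stdio.h>
#include <rte_mempool.h>

static void
show_sizing_pitfall(void)
{
	/* 4096 elements of 256 bytes each: roughly 1 MB worth of objects. */
	size_t page_aligned = rte_mempool_xmem_size(4096, 256, 30, 0); /* 1 GB pages */
	size_t exact = rte_mempool_xmem_size(4096, 256, 0, 0); /* no page constraint */

	/* page_aligned comes back as a full 1 GB, exact as ~1 MB. */
	printf("page-aligned: %zu bytes, exact: %zu bytes\n", page_aligned, exact);
}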

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3:
- Fixed mempool size calculation
- Fixed handling of contiguous memzones
- Moved earlier in the patchset

lib/librte_mempool/Makefile | 3 +
lib/librte_mempool/meson.build | 3 +
lib/librte_mempool/rte_mempool.c | 137 ++++++++++++++++++++++++++++++++-------
3 files changed, 121 insertions(+), 22 deletions(-)

diff --git a/lib/librte_mempool/Makefile b/lib/librte_mempool/Makefile
index 24e735a..cfc69b4 100644
--- a/lib/librte_mempool/Makefile
+++ b/lib/librte_mempool/Makefile
@@ -13,6 +13,9 @@ EXPORT_MAP := rte_mempool_version.map

LIBABIVER := 3

+# uses new contiguous memzone allocation that isn't yet in stable ABI
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
# all source are stored in SRCS-y
SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool.c
SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool_ops.c
diff --git a/lib/librte_mempool/meson.build b/lib/librte_mempool/meson.build
index 712720f..5916a0f 100644
--- a/lib/librte_mempool/meson.build
+++ b/lib/librte_mempool/meson.build
@@ -5,3 +5,6 @@ version = 3
sources = files('rte_mempool.c', 'rte_mempool_ops.c')
headers = files('rte_mempool.h')
deps += ['ring']
+
+# contig memzone allocation is not yet part of stable API
+allow_experimental_apis = true
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 54f7f4b..e147180 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -3,6 +3,7 @@
* Copyright(c) 2016 6WIND S.A.
*/

+#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
@@ -98,6 +99,27 @@ static unsigned optimize_object_size(unsigned obj_size)
return new_obj_size * RTE_MEMPOOL_ALIGN;
}

+static size_t
+get_min_page_size(void)
+{
+ const struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ int i;
+ size_t min_pagesz = SIZE_MAX;
+
+ for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+ const struct rte_memseg *ms = &mcfg->memseg[i];
+
+ if (ms->addr == NULL)
+ continue;
+
+ if (ms->hugepage_sz < min_pagesz)
+ min_pagesz = ms->hugepage_sz;
+ }
+
+ return min_pagesz == SIZE_MAX ? (size_t) getpagesize() : min_pagesz;
+}
+
static void
mempool_add_elem(struct rte_mempool *mp, void *obj, rte_iova_t iova)
{
@@ -204,7 +226,6 @@ rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags,
return sz->total_size;
}

-
/*
* Calculate maximum amount of memory required to store given number of objects.
*/
@@ -367,16 +388,6 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
/* update mempool capabilities */
mp->flags |= mp_capa_flags;

- /* Detect pool area has sufficient space for elements */
- if (mp_capa_flags & MEMPOOL_F_CAPA_PHYS_CONTIG) {
- if (len < total_elt_sz * mp->size) {
- RTE_LOG(ERR, MEMPOOL,
- "pool area %" PRIx64 " not enough\n",
- (uint64_t)len);
- return -ENOSPC;
- }
- }
-
memhdr = rte_zmalloc("MEMPOOL_MEMHDR", sizeof(*memhdr), 0);
if (memhdr == NULL)
return -ENOMEM;
@@ -549,6 +560,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
unsigned mz_id, n;
unsigned int mp_flags;
int ret;
+ bool force_contig, no_contig, try_contig, no_pageshift;

/* mempool must not be populated */
if (mp->nb_mem_chunks != 0)
@@ -563,9 +575,62 @@ rte_mempool_populate_default(struct rte_mempool *mp)
/* update mempool capabilities */
mp->flags |= mp_flags;

- if (rte_eal_has_hugepages()) {
- pg_shift = 0; /* not needed, zone is physically contiguous */
+ no_contig = mp->flags & MEMPOOL_F_NO_PHYS_CONTIG;
+ force_contig = mp->flags & MEMPOOL_F_CAPA_PHYS_CONTIG;
+
+ /*
+ * the following section calculates page shift and page size values.
+ *
+ * these values impact the result of rte_mempool_xmem_size(), which
+ * returns the amount of memory that should be allocated to store the
+ * desired number of objects. when not zero, it allocates more memory
+ * for the padding between objects, to ensure that an object does not
+ * cross a page boundary. in other words, page size/shift are to be set
+ * to zero if mempool elements won't care about page boundaries.
+ * there are several considerations for page size and page shift here.
+ *
+ * if we don't need our mempools to have physically contiguous objects,
+ * then just set page shift and page size to 0, because the user has
+ * indicated that there's no need to care about anything.
+ *
+ * if we do need contiguous objects, there is also an option to reserve
+ * the entire mempool memory as one contiguous block of memory, in
+ * which case the page shift and alignment wouldn't matter as well.
+ *
+ * if we require contiguous objects, but not necessarily the entire
+ * mempool reserved space to be contiguous, then there are two options.
+ *
+ * if our IO addresses are virtual, not actual physical (IOVA as VA
+ * case), then no page shift needed - our memory allocation will give us
+ * contiguous physical memory as far as the hardware is concerned, so
+ * act as if we're getting contiguous memory.
+ *
+ * if our IO addresses are physical, we may get memory from bigger
+ * pages, or we might get memory from smaller pages, and how much of it
+ * we require depends on whether we want bigger or smaller pages.
+ * However, requesting each and every memory size is too much work, so
+ * what we'll do instead is walk through the page sizes available, pick
+ * the smallest one and set up page shift to match that one. We will be
+ * wasting some space this way, but it's much nicer than looping around
+ * trying to reserve each and every page size.
+ *
+ * However, since size calculation will produce page-aligned sizes, it
+ * makes sense to first try and see if we can reserve the entire memzone
+ * in one contiguous chunk as well (otherwise we might end up wasting a
+ * 1G page on a 10MB memzone). If we fail to get enough contiguous
+ * memory, then we'll go and reserve space page-by-page.
+ */
+ no_pageshift = no_contig || force_contig ||
+ rte_eal_iova_mode() == RTE_IOVA_VA;
+ try_contig = !no_contig && !no_pageshift && rte_eal_has_hugepages();
+
+ if (no_pageshift) {
pg_sz = 0;
+ pg_shift = 0;
+ align = RTE_CACHE_LINE_SIZE;
+ } else if (try_contig) {
+ pg_sz = get_min_page_size();
+ pg_shift = rte_bsf32(pg_sz);
align = RTE_CACHE_LINE_SIZE;
} else {
pg_sz = getpagesize();
@@ -575,8 +640,12 @@ rte_mempool_populate_default(struct rte_mempool *mp)

total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
- size = rte_mempool_xmem_size(n, total_elt_sz, pg_shift,
- mp->flags);
+ if (try_contig || no_pageshift)
+ size = rte_mempool_xmem_size(n, total_elt_sz, 0,
+ mp->flags);
+ else
+ size = rte_mempool_xmem_size(n, total_elt_sz, pg_shift,
+ mp->flags);

ret = snprintf(mz_name, sizeof(mz_name),
RTE_MEMPOOL_MZ_FORMAT "_%d", mp->name, mz_id);
@@ -585,23 +654,47 @@ rte_mempool_populate_default(struct rte_mempool *mp)
goto fail;
}

- mz = rte_memzone_reserve_aligned(mz_name, size,
- mp->socket_id, mz_flags, align);
- /* not enough memory, retry with the biggest zone we have */
- if (mz == NULL)
- mz = rte_memzone_reserve_aligned(mz_name, 0,
+ mz = NULL;
+ if (force_contig || try_contig) {
+ /* if contiguous memory for entire mempool memory was
+ * requested, don't try reserving again if we fail...
+ */
+ mz = rte_memzone_reserve_aligned_contig(mz_name, size,
+ mp->socket_id, mz_flags, align);
+
+ /* ...unless we are doing best effort allocation, in
+ * which case recalculate size and try again */
+ if (try_contig && mz == NULL) {
+ try_contig = false;
+ align = pg_sz;
+ size = rte_mempool_xmem_size(n, total_elt_sz,
+ pg_shift, mp->flags);
+ }
+ }
+ /* only try this if we're not trying to reserve contiguous
+ * memory.
+ */
+ if (!force_contig && mz == NULL) {
+ mz = rte_memzone_reserve_aligned(mz_name, size,
mp->socket_id, mz_flags, align);
+ /* not enough memory, retry with the biggest zone we
+ * have
+ */
+ if (mz == NULL)
+ mz = rte_memzone_reserve_aligned(mz_name, 0,
+ mp->socket_id, mz_flags, align);
+ }
if (mz == NULL) {
ret = -rte_errno;
goto fail;
}

- if (mp->flags & MEMPOOL_F_NO_PHYS_CONTIG)
+ if (no_contig)
iova = RTE_BAD_IOVA;
else
iova = mz->iova;

- if (rte_eal_has_hugepages())
+ if (no_pageshift || try_contig)
ret = rte_mempool_populate_iova(mp, mz->addr,
iova, mz->len,
rte_mempool_memchunk_mz_free,
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:39 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
drivers/bus/pci/Makefile | 3 +++
drivers/bus/pci/linux/pci.c | 26 ++++++++++++++------------
drivers/bus/pci/meson.build | 3 +++
3 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/drivers/bus/pci/Makefile b/drivers/bus/pci/Makefile
index f3df1c4..804a198 100644
--- a/drivers/bus/pci/Makefile
+++ b/drivers/bus/pci/Makefile
@@ -49,6 +49,9 @@ CFLAGS += -I$(RTE_SDK)/drivers/bus/pci/$(SYSTEM)
CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common
CFLAGS += -I$(RTE_SDK)/lib/librte_eal/$(SYSTEM)app/eal

+# memseg walk is not part of stable API yet
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
LDLIBS += -lrte_ethdev -lrte_pci

diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index abde641..6dda054 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -116,22 +116,24 @@ rte_pci_unmap_device(struct rte_pci_device *dev)
}
}

-void *
-pci_find_max_end_va(void)
+static int
+find_max_end_va(const struct rte_memseg *ms, void *arg)
{
- const struct rte_memseg *seg = rte_eal_get_physmem_layout();
- const struct rte_memseg *last = seg;
- unsigned i = 0;
+ void *end_va = RTE_PTR_ADD(ms->addr, ms->len);
+ void **max_va = arg;

- for (i = 0; i < RTE_MAX_MEMSEG; i++, seg++) {
- if (seg->addr == NULL)
- break;
+ if (*max_va < end_va)
+ *max_va = end_va;
+ return 0;
+}

- if (seg->addr > last->addr)
- last = seg;
+void *
+pci_find_max_end_va(void)
+{
+ void *va = NULL;

- }
- return RTE_PTR_ADD(last->addr, last->len);
+ rte_memseg_walk(find_max_end_va, &va);
+ return va;
}

/* parse one line of the "resource" sysfs file (note that the 'line'
diff --git a/drivers/bus/pci/meson.build b/drivers/bus/pci/meson.build
index 12756a4..72939e5 100644
--- a/drivers/bus/pci/meson.build
+++ b/drivers/bus/pci/meson.build
@@ -14,3 +14,6 @@ else
sources += files('bsd/pci.c')
includes += include_directories('bsd')
endif
+
+# memseg walk is not part of stable API yet
+allow_experimental_apis = true
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:45 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/linuxapp/eal/eal_vfio.c | 108 +++++++++++++++++++--------------
1 file changed, 63 insertions(+), 45 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 2a34ae9..fb41e82 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -694,16 +694,69 @@ vfio_type1_dma_map(int vfio_container_fd)
return rte_memseg_walk(type1_map, &vfio_container_fd);
}

+struct spapr_walk_param {
+ uint64_t window_size;
+ uint64_t hugepage_sz;
+};
static int
-vfio_spapr_dma_map(int vfio_container_fd)
+spapr_window_size(const struct rte_memseg *ms, void *arg)
{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
- int i, ret;
+ struct spapr_walk_param *param = arg;
+ uint64_t max = ms->iova + ms->len;
+
+ if (max > param->window_size) {
+ param->hugepage_sz = ms->hugepage_sz;
+ param->window_size = max;
+ }

+ return 0;
+}
+
+static int
+spapr_map(const struct rte_memseg *ms, void *arg)
+{
+ struct vfio_iommu_type1_dma_map dma_map;
struct vfio_iommu_spapr_register_memory reg = {
.argsz = sizeof(reg),
.flags = 0
};
+ int *vfio_container_fd = arg;
+ int ret;
+
+ reg.vaddr = (uintptr_t) ms->addr;
+ reg.size = ms->len;
+ ret = ioctl(*vfio_container_fd,
+ VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ memset(&dma_map, 0, sizeof(dma_map));
+ dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+ dma_map.vaddr = ms->addr_64;
+ dma_map.size = ms->len;
+ dma_map.iova = ms->iova;
+ dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+ VFIO_DMA_MAP_FLAG_WRITE;
+
+ ret = ioctl(*vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+vfio_spapr_dma_map(int vfio_container_fd)
+{
+ struct spapr_walk_param param;
+ int ret;
struct vfio_iommu_spapr_tce_info info = {
.argsz = sizeof(info),
};
@@ -714,6 +767,8 @@ vfio_spapr_dma_map(int vfio_container_fd)
.argsz = sizeof(remove),
};

+ memset(&param, 0, sizeof(param));
+
/* query spapr iommu info */
ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
if (ret) {
@@ -732,17 +787,11 @@ vfio_spapr_dma_map(int vfio_container_fd)
}

/* create DMA window from 0 to max(phys_addr + len) */
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if (ms[i].addr == NULL)
- break;
-
- create.window_size = RTE_MAX(create.window_size,
- ms[i].iova + ms[i].len);
- }
+ rte_memseg_walk(spapr_window_size, &param);

/* sPAPR requires window size to be a power of 2 */
- create.window_size = rte_align64pow2(create.window_size);
- create.page_shift = __builtin_ctzll(ms->hugepage_sz);
+ create.window_size = rte_align64pow2(param.window_size);
+ create.page_shift = __builtin_ctzll(param.hugepage_sz);
create.levels = 1;

ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
@@ -758,39 +807,8 @@ vfio_spapr_dma_map(int vfio_container_fd)
}

/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- struct vfio_iommu_type1_dma_map dma_map;
-
- if (ms[i].addr == NULL)
- break;
-
- reg.vaddr = (uintptr_t) ms[i].addr;
- reg.size = ms[i].len;
- ret = ioctl(vfio_container_fd,
- VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, "
- "error %i (%s)\n", errno, strerror(errno));
- return -1;
- }
-
- memset(&dma_map, 0, sizeof(dma_map));
- dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
- dma_map.vaddr = ms[i].addr_64;
- dma_map.size = ms[i].len;
- dma_map.iova = ms[i].iova;
- dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
- VFIO_DMA_MAP_FLAG_WRITE;
-
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
-
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
- "error %i (%s)\n", errno, strerror(errno));
- return -1;
- }
-
- }
+ if (rte_memseg_walk(spapr_map, &vfio_container_fd) < 0)
+ return -1;

return 0;
}
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:40 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
drivers/net/mlx5/Makefile | 3 +++
drivers/net/mlx5/mlx5.c | 24 +++++++++++++++---------
2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index afda411..25c8e10 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -92,6 +92,9 @@ CFLAGS += -Wno-error=cast-qual
EXPORT_MAP := rte_pmd_mlx5_version.map
LIBABIVER := 1

+# memseg walk is not part of stable API
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
# DEBUG which is usually provided on the command-line may enable
# CONFIG_RTE_LIBRTE_MLX5_DEBUG.
ifeq ($(DEBUG),1)
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 7d58d66..1724b65 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -477,6 +477,19 @@ static struct rte_pci_driver mlx5_driver;
*/
static void *uar_base;

+static int
+find_lower_va_bound(const struct rte_memseg *ms, void *arg)
+{
+ void **addr = arg;
+
+ if (*addr == NULL)
+ *addr = ms->addr;
+ else
+ *addr = RTE_MIN(*addr, ms->addr);
+
+ return 0;
+}
+
/**
* Reserve UAR address space for primary process.
*
@@ -491,21 +504,14 @@ mlx5_uar_init_primary(struct rte_eth_dev *dev)
{
struct priv *priv = dev->data->dev_private;
void *addr = (void *)0;
- int i;
- const struct rte_mem_config *mcfg;

if (uar_base) { /* UAR address space mapped. */
priv->uar_base = uar_base;
return 0;
}
/* find out lower bound of hugepage segments */
- mcfg = rte_eal_get_configuration()->mem_config;
- for (i = 0; i < RTE_MAX_MEMSEG && mcfg->memseg[i].addr; i++) {
- if (addr)
- addr = RTE_MIN(addr, mcfg->memseg[i].addr);
- else
- addr = mcfg->memseg[i].addr;
- }
+ rte_memseg_walk(find_lower_va_bound, &addr);
+
/* keep distance to hugepages to minimize potential conflicts. */
addr = RTE_PTR_SUB(addr, MLX5_UAR_OFFSET + MLX5_UAR_SIZE);
/* anonymous mmap, no real memory consumption. */
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:41 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/bsdapp/eal/eal.c | 25 +++++++-----
lib/librte_eal/common/eal_common_memory.c | 67 ++++++++++++++++---------------
lib/librte_eal/common/malloc_heap.c | 33 +++++++++------
lib/librte_eal/linuxapp/eal/eal.c | 22 +++++-----
4 files changed, 81 insertions(+), 66 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 4eafcb5..8e25d78 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -429,23 +429,26 @@ eal_parse_args(int argc, char **argv)
return ret;
}

+static int
+check_socket(const struct rte_memseg *ms, void *arg)
+{
+ int *socket_id = arg;
+
+ if (ms->socket_id == *socket_id)
+ return 1;
+
+ return 0;
+}
+
static void
eal_check_mem_on_local_socket(void)
{
- const struct rte_memseg *ms;
- int i, socket_id;
+ int socket_id;

socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);

- ms = rte_eal_get_physmem_layout();
-
- for (i = 0; i < RTE_MAX_MEMSEG; i++)
- if (ms[i].socket_id == socket_id &&
- ms[i].len > 0)
- return;
-
- RTE_LOG(WARNING, EAL, "WARNING: Master core has no "
- "memory on local socket!\n");
+ if (rte_memseg_walk(check_socket, &socket_id) == 0)
+ RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n");
}

static int
diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index 947db1f..4f588c7 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -131,54 +131,57 @@ rte_eal_get_physmem_layout(void)
return rte_eal_get_configuration()->mem_config->memseg;
}

+static int
+physmem_size(const struct rte_memseg *ms, void *arg)
+{
+ uint64_t *total_len = arg;
+
+ *total_len += ms->len;
+
+ return 0;
+}

/* get the total size of memory */
uint64_t
rte_eal_get_physmem_size(void)
{
- const struct rte_mem_config *mcfg;
- unsigned i = 0;
uint64_t total_len = 0;

- /* get pointer to global configuration */
- mcfg = rte_eal_get_configuration()->mem_config;
+ rte_memseg_walk(physmem_size, &total_len);

- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if (mcfg->memseg[i].addr == NULL)
- break;
+ return total_len;
+}

- total_len += mcfg->memseg[i].len;
- }
+static int
+dump_memseg(const struct rte_memseg *ms, void *arg)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int i = ms - mcfg->memseg;
+ FILE *f = arg;

- return total_len;
+ if (i < 0 || i >= RTE_MAX_MEMSEG)
+ return -1;
+
+ fprintf(f, "Segment %u: IOVA:0x%"PRIx64", len:%zu, "
+ "virt:%p, socket_id:%"PRId32", "
+ "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
+ "nrank:%"PRIx32"\n", i,
+ mcfg->memseg[i].iova,
+ mcfg->memseg[i].len,
+ mcfg->memseg[i].addr,
+ mcfg->memseg[i].socket_id,
+ mcfg->memseg[i].hugepage_sz,
+ mcfg->memseg[i].nchannel,
+ mcfg->memseg[i].nrank);
+
+ return 0;
}

/* Dump the physical memory layout on console */
void
rte_dump_physmem_layout(FILE *f)
{
- const struct rte_mem_config *mcfg;
- unsigned i = 0;
-
- /* get pointer to global configuration */
- mcfg = rte_eal_get_configuration()->mem_config;
-
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if (mcfg->memseg[i].addr == NULL)
- break;
-
- fprintf(f, "Segment %u: IOVA:0x%"PRIx64", len:%zu, "
- "virt:%p, socket_id:%"PRId32", "
- "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
- "nrank:%"PRIx32"\n", i,
- mcfg->memseg[i].iova,
- mcfg->memseg[i].len,
- mcfg->memseg[i].addr,
- mcfg->memseg[i].socket_id,
- mcfg->memseg[i].hugepage_sz,
- mcfg->memseg[i].nchannel,
- mcfg->memseg[i].nrank);
- }
+ rte_memseg_walk(dump_memseg, f);
}

/* return the number of memory channels */
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 564b61a..79914fc 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -67,17 +67,32 @@ check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
* to prevent overflow. The rest of the zone is added to free list as a single
* large free block
*/
-static void
-malloc_heap_add_memseg(struct malloc_heap *heap, struct rte_memseg *ms)
+static int
+malloc_heap_add_memseg(const struct rte_memseg *ms, void *arg __rte_unused)
{
- struct malloc_elem *start_elem = (struct malloc_elem *)ms->addr;
- const size_t elem_size = ms->len - MALLOC_ELEM_OVERHEAD;
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct malloc_elem *start_elem;
+ struct rte_memseg *found_ms;
+ struct malloc_heap *heap;
+ size_t elem_size;
+ int ms_idx;
+
+ heap = &mcfg->malloc_heaps[ms->socket_id];
+
+ /* ms is const, so find it */
+ ms_idx = ms - mcfg->memseg;
+ found_ms = &mcfg->memseg[ms_idx];

- malloc_elem_init(start_elem, heap, ms, elem_size);
+ start_elem = (struct malloc_elem *)found_ms->addr;
+ elem_size = ms->len - MALLOC_ELEM_OVERHEAD;
+
+ malloc_elem_init(start_elem, heap, found_ms, elem_size);
malloc_elem_insert(start_elem);
malloc_elem_free_list_insert(start_elem);

heap->total_size += elem_size;
+
+ return 0;
}

/*
@@ -244,17 +259,11 @@ int
rte_eal_malloc_heap_init(void)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- unsigned ms_cnt;
- struct rte_memseg *ms;

if (mcfg == NULL)
return -1;

- for (ms = &mcfg->memseg[0], ms_cnt = 0;
- (ms_cnt < RTE_MAX_MEMSEG) && (ms->len > 0);
- ms_cnt++, ms++) {
- malloc_heap_add_memseg(&mcfg->malloc_heaps[ms->socket_id], ms);
- }
+ rte_memseg_walk(malloc_heap_add_memseg, NULL);

return 0;
}
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 2ecd07b..77f6cb7 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -638,23 +638,23 @@ eal_parse_args(int argc, char **argv)
return ret;
}

+static int
+check_mem(const struct rte_memseg *ms, void *arg)
+{
+ int *socket = arg;
+
+ return ms->socket_id == *socket;
+}
+
static void
eal_check_mem_on_local_socket(void)
{
- const struct rte_memseg *ms;
- int i, socket_id;
+ int socket_id;

socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);

- ms = rte_eal_get_physmem_layout();
-
- for (i = 0; i < RTE_MAX_MEMSEG; i++)
- if (ms[i].socket_id == socket_id &&
- ms[i].len > 0)
- return;
-
- RTE_LOG(WARNING, EAL, "WARNING: Master core has no "
- "memory on local socket!\n");
+ if (rte_memseg_walk(check_mem, &socket_id) == 0)
+ RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n");
}

static int
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:42 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_mempool/rte_mempool.c | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index e147180..bb33c3a 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -99,23 +99,23 @@ static unsigned optimize_object_size(unsigned obj_size)
return new_obj_size * RTE_MEMPOOL_ALIGN;
}

+static int
+find_min_pagesz(const struct rte_memseg *ms, void *arg)
+{
+ size_t *min = arg;
+
+ if (ms->hugepage_sz < *min)
+ *min = ms->hugepage_sz;
+
+ return 0;
+}
+
static size_t
get_min_page_size(void)
{
- const struct rte_mem_config *mcfg =
- rte_eal_get_configuration()->mem_config;
- int i;
size_t min_pagesz = SIZE_MAX;

- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- const struct rte_memseg *ms = &mcfg->memseg[i];
-
- if (ms->addr == NULL)
- continue;
-
- if (ms->hugepage_sz < min_pagesz)
- min_pagesz = ms->hugepage_sz;
- }
+ rte_memseg_walk(find_min_pagesz, &min_pagesz);

return min_pagesz == SIZE_MAX ? (size_t) getpagesize() : min_pagesz;
}
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:48 UTC
Permalink
This is a reverse lookup of PA to VA. Using this will make
other code less dependent on the internals of the mem API.
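
A minimal sketch of the new lookup (the wrapper name is illustrative; this is
essentially what the dpaa/dpaa2 conversions later in the set reduce to):

#include <inttypes.h>
#include <stdio.h>
#include <rte_memory.h>

static void *
example_ptov(rte_iova_t iova)
{
	void *virt = rte_mem_iova2virt(iova);

	if (virt == NULL)
		printf("IOVA 0x%" PRIx64 " is not part of the DPDK memory map\n",
			(uint64_t)iova);
	return virt;
}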

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/eal_common_memory.c | 30 ++++++++++++++++++++++++++++++
lib/librte_eal/common/include/rte_memory.h | 12 ++++++++++++
lib/librte_eal/rte_eal_version.map | 1 +
3 files changed, 43 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index 4b528b0..ea3c5a7 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -131,6 +131,36 @@ rte_eal_get_physmem_layout(void)
return rte_eal_get_configuration()->mem_config->memseg;
}

+struct virtiova {
+ rte_iova_t iova;
+ void *virt;
+};
+static int
+find_virt(const struct rte_memseg *ms, void *arg)
+{
+ struct virtiova *vi = arg;
+ if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) {
+ size_t offset = vi->iova - ms->iova;
+ vi->virt = RTE_PTR_ADD(ms->addr, offset);
+ /* stop the walk */
+ return 1;
+ }
+ return 0;
+}
+
+__rte_experimental void *
+rte_mem_iova2virt(rte_iova_t iova)
+{
+ struct virtiova vi;
+
+ memset(&vi, 0, sizeof(vi));
+
+ vi.iova = iova;
+ rte_memseg_walk(find_virt, &vi);
+
+ return vi.virt;
+}
+
static int
physmem_size(const struct rte_memseg *ms, void *arg)
{
diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index 45d067f..5c60b91 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -131,6 +131,18 @@ phys_addr_t rte_mem_virt2phy(const void *virt);
rte_iova_t rte_mem_virt2iova(const void *virt);

/**
+ * Get virtual memory address corresponding to iova address.
+ *
+ * @param iova
+ * The iova address.
+ * @return
+ * Virtual address corresponding to iova address (or NULL if address does not
+ * exist within DPDK memory map).
+ */
+__rte_experimental void *
+rte_mem_iova2virt(rte_iova_t iova);
+
+/**
* Memseg walk function prototype.
*
* Returning 0 will continue walk
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 8409a3a..83b1635 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -223,6 +223,7 @@ EXPERIMENTAL {
rte_eal_mbuf_user_pool_ops;
rte_log_register_type_and_pick_level;
rte_malloc_dump_heaps;
+ rte_mem_iova2virt;
rte_memseg_contig_walk;
rte_memseg_walk;
rte_memzone_reserve_contig;
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:47 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
drivers/net/virtio/virtio_user/vhost_kernel.c | 83 +++++++++++----------------
1 file changed, 35 insertions(+), 48 deletions(-)

diff --git a/drivers/net/virtio/virtio_user/vhost_kernel.c b/drivers/net/virtio/virtio_user/vhost_kernel.c
index 1711ead..93d7efe 100644
--- a/drivers/net/virtio/virtio_user/vhost_kernel.c
+++ b/drivers/net/virtio/virtio_user/vhost_kernel.c
@@ -70,6 +70,32 @@ static uint64_t vhost_req_user_to_kernel[] = {
[VHOST_USER_SET_MEM_TABLE] = VHOST_SET_MEM_TABLE,
};

+struct walk_arg {
+ struct vhost_memory_kernel *vm;
+ uint32_t region_nr;
+};
+static int
+add_memory_region(const struct rte_memseg *ms, size_t len, void *arg)
+{
+ struct walk_arg *wa = arg;
+ struct vhost_memory_region *mr;
+ void *start_addr;
+
+ if (wa->region_nr >= max_regions)
+ return -1;
+
+ mr = &wa->vm->regions[wa->region_nr++];
+ start_addr = ms->addr;
+
+ mr->guest_phys_addr = (uint64_t)(uintptr_t)start_addr;
+ mr->userspace_addr = (uint64_t)(uintptr_t)start_addr;
+ mr->memory_size = len;
+ mr->mmap_offset = 0;
+
+ return 0;
+}
+
+
/* By default, vhost kernel module allows 64 regions, but DPDK allows
* 256 segments. As a relief, below function merges those virtually
* adjacent memsegs into one region.
@@ -77,63 +103,24 @@ static uint64_t vhost_req_user_to_kernel[] = {
static struct vhost_memory_kernel *
prepare_vhost_memory_kernel(void)
{
- uint32_t i, j, k = 0;
- struct rte_memseg *seg;
- struct vhost_memory_region *mr;
struct vhost_memory_kernel *vm;
+ struct walk_arg wa;

vm = malloc(sizeof(struct vhost_memory_kernel) +
- max_regions *
- sizeof(struct vhost_memory_region));
+ max_regions *
+ sizeof(struct vhost_memory_region));
if (!vm)
return NULL;

- for (i = 0; i < RTE_MAX_MEMSEG; ++i) {
- seg = &rte_eal_get_configuration()->mem_config->memseg[i];
- if (!seg->addr)
- break;
-
- int new_region = 1;
-
- for (j = 0; j < k; ++j) {
- mr = &vm->regions[j];
+ wa.region_nr = 0;
+ wa.vm = vm;

- if (mr->userspace_addr + mr->memory_size ==
- (uint64_t)(uintptr_t)seg->addr) {
- mr->memory_size += seg->len;
- new_region = 0;
- break;
- }
-
- if ((uint64_t)(uintptr_t)seg->addr + seg->len ==
- mr->userspace_addr) {
- mr->guest_phys_addr =
- (uint64_t)(uintptr_t)seg->addr;
- mr->userspace_addr =
- (uint64_t)(uintptr_t)seg->addr;
- mr->memory_size += seg->len;
- new_region = 0;
- break;
- }
- }
-
- if (new_region == 0)
- continue;
-
- mr = &vm->regions[k++];
- /* use vaddr here! */
- mr->guest_phys_addr = (uint64_t)(uintptr_t)seg->addr;
- mr->userspace_addr = (uint64_t)(uintptr_t)seg->addr;
- mr->memory_size = seg->len;
- mr->mmap_offset = 0;
-
- if (k >= max_regions) {
- free(vm);
- return NULL;
- }
+ if (rte_memseg_contig_walk(add_memory_region, &wa) < 0) {
+ free(vm);
+ return NULL;
}

- vm->nregions = k;
+ vm->nregions = wa.region_nr;
vm->padding = 0;
return vm;
}
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:52 UTC
Permalink
This can be used as a virt2iova function that only looks up
memory that is owned by DPDK (as opposed to doing pagemap walks).
Using this will result in less dependency on the internals of the mem API.
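
A minimal sketch of the lookup in use, mirroring how the mlx4/mlx5 patches
round an address range to hugepage boundaries (the helper name is illustrative):

#include <stdint.h>
#include <rte_common.h>
#include <rte_memory.h>

static void
align_range_to_hugepages(uintptr_t *start, uintptr_t *end)
{
	const struct rte_memseg *ms;

	/* Round outwards to the hugepage size of the owning memsegs,
	 * if the addresses belong to DPDK memory at all. */
	ms = rte_mem_virt2memseg((void *)*start);
	if (ms != NULL)
		*start = RTE_ALIGN_FLOOR(*start, ms->hugepage_sz);
	ms = rte_mem_virt2memseg((void *)*end);
	if (ms != NULL)
		*end = RTE_ALIGN_CEIL(*end, ms->hugepage_sz);
}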

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/eal_common_memory.c | 37 ++++++++++++++++++++++++++++++
lib/librte_eal/common/include/rte_memory.h | 11 +++++++++
lib/librte_eal/rte_eal_version.map | 1 +
3 files changed, 49 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index ea3c5a7..fd78d2f 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -161,6 +161,43 @@ rte_mem_iova2virt(rte_iova_t iova)
return vi.virt;
}

+struct virtms {
+ const void *virt;
+ struct rte_memseg *ms;
+};
+static int
+find_memseg(const struct rte_memseg *ms, void *arg)
+{
+ struct virtms *vm = arg;
+
+ if (arg >= ms->addr && arg < RTE_PTR_ADD(ms->addr, ms->len)) {
+ struct rte_memseg *memseg, *found_ms;
+ int idx;
+
+ memseg = rte_eal_get_configuration()->mem_config->memseg;
+ idx = ms - memseg;
+ found_ms = &memseg[idx];
+
+ vm->ms = found_ms;
+ return 1;
+ }
+ return 0;
+}
+
+__rte_experimental struct rte_memseg *
+rte_mem_virt2memseg(const void *addr)
+{
+ struct virtms vm;
+
+ memset(&vm, 0, sizeof(vm));
+
+ vm.virt = addr;
+
+ rte_memseg_walk(find_memseg, &vm);
+
+ return vm.ms;
+}
+
static int
physmem_size(const struct rte_memseg *ms, void *arg)
{
diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index 5c60b91..b3d7e61 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -143,6 +143,17 @@ __rte_experimental void *
rte_mem_iova2virt(rte_iova_t iova);

/**
+ * Get memseg to which a particular virtual address belongs.
+ *
+ * @param virt
+ * The virtual address.
+ * @return
+ * Memseg pointer on success, or NULL on error.
+ */
+__rte_experimental struct rte_memseg *
+rte_mem_virt2memseg(const void *virt);
+
+/**
* Memseg walk function prototype.
*
* Returning 0 will continue walk
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 83b1635..70ec778 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -224,6 +224,7 @@ EXPERIMENTAL {
rte_log_register_type_and_pick_level;
rte_malloc_dump_heaps;
rte_mem_iova2virt;
+ rte_mem_virt2memseg;
rte_memseg_contig_walk;
rte_memseg_walk;
rte_memzone_reserve_contig;
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:43 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
test/test/test_malloc.c | 40 +++++++++++++++++++++++-------------
test/test/test_memory.c | 23 +++++++++++----------
test/test/test_memzone.c | 53 ++++++++++++++++++++++++++++++++----------------
3 files changed, 74 insertions(+), 42 deletions(-)

diff --git a/test/test/test_malloc.c b/test/test/test_malloc.c
index d23192c..578ad04 100644
--- a/test/test/test_malloc.c
+++ b/test/test/test_malloc.c
@@ -705,16 +705,34 @@ test_malloc_bad_params(void)
return -1;
}

+static int
+check_socket_mem(const struct rte_memseg *ms, void *arg)
+{
+ int32_t *socket = arg;
+
+ return *socket == ms->socket_id;
+}
+
/* Check if memory is available on a specific socket */
static int
is_mem_on_socket(int32_t socket)
{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
- unsigned i;
+ return rte_memseg_walk(check_socket_mem, &socket);
+}

- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if (socket == ms[i].socket_id)
- return 1;
+struct walk_param {
+ void *addr;
+ int32_t socket;
+};
+static int
+find_socket(const struct rte_memseg *ms, void *arg)
+{
+ struct walk_param *param = arg;
+
+ if (param->addr >= ms->addr &&
+ param->addr < RTE_PTR_ADD(ms->addr, ms->len)) {
+ param->socket = ms->socket_id;
+ return 1;
}
return 0;
}
@@ -726,15 +744,9 @@ is_mem_on_socket(int32_t socket)
static int32_t
addr_to_socket(void * addr)
{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
- unsigned i;
-
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if ((ms[i].addr <= addr) &&
- ((uintptr_t)addr <
- ((uintptr_t)ms[i].addr + (uintptr_t)ms[i].len)))
- return ms[i].socket_id;
- }
+ struct walk_param param = {.addr = addr, .socket = 0};
+ if (rte_memseg_walk(find_socket, &param) > 0)
+ return param.socket;
return -1;
}

diff --git a/test/test/test_memory.c b/test/test/test_memory.c
index 972321f..c9b287c 100644
--- a/test/test/test_memory.c
+++ b/test/test/test_memory.c
@@ -23,12 +23,20 @@
*/

static int
+check_mem(const struct rte_memseg *ms, void *arg __rte_unused)
+{
+ volatile uint8_t *mem = (volatile uint8_t *) ms->addr;
+ size_t i;
+
+ for (i = 0; i < ms->len; i++, mem++)
+ *mem;
+ return 0;
+}
+
+static int
test_memory(void)
{
uint64_t s;
- unsigned i;
- size_t j;
- const struct rte_memseg *mem;

/*
* dump the mapped memory: the python-expect script checks
@@ -45,14 +53,7 @@ test_memory(void)
}

/* try to read memory (should not segfault) */
- mem = rte_eal_get_physmem_layout();
- for (i = 0; i < RTE_MAX_MEMSEG && mem[i].addr != NULL ; i++) {
-
- /* check memory */
- for (j = 0; j<mem[i].len; j++) {
- *((volatile uint8_t *) mem[i].addr + j);
- }
- }
+ rte_memseg_walk(check_mem, NULL);

return 0;
}
diff --git a/test/test/test_memzone.c b/test/test/test_memzone.c
index 8ece1ac..cbf0cfa 100644
--- a/test/test/test_memzone.c
+++ b/test/test/test_memzone.c
@@ -104,28 +104,47 @@ test_memzone_reserving_zone_size_bigger_than_the_maximum(void)
return 0;
}

+struct walk_arg {
+ int hugepage_2MB_avail;
+ int hugepage_1GB_avail;
+ int hugepage_16MB_avail;
+ int hugepage_16GB_avail;
+};
+static int
+find_available_pagesz(const struct rte_memseg *ms, void *arg)
+{
+ struct walk_arg *wa = arg;
+
+ if (ms->hugepage_sz == RTE_PGSIZE_2M)
+ wa->hugepage_2MB_avail = 1;
+ if (ms->hugepage_sz == RTE_PGSIZE_1G)
+ wa->hugepage_1GB_avail = 1;
+ if (ms->hugepage_sz == RTE_PGSIZE_16M)
+ wa->hugepage_16MB_avail = 1;
+ if (ms->hugepage_sz == RTE_PGSIZE_16G)
+ wa->hugepage_16GB_avail = 1;
+
+ return 0;
+}
+
static int
test_memzone_reserve_flags(void)
{
const struct rte_memzone *mz;
- const struct rte_memseg *ms;
- int hugepage_2MB_avail = 0;
- int hugepage_1GB_avail = 0;
- int hugepage_16MB_avail = 0;
- int hugepage_16GB_avail = 0;
+ struct walk_arg wa;
+ int hugepage_2MB_avail, hugepage_1GB_avail;
+ int hugepage_16MB_avail, hugepage_16GB_avail;
const size_t size = 100;
- int i = 0;
- ms = rte_eal_get_physmem_layout();
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if (ms[i].hugepage_sz == RTE_PGSIZE_2M)
- hugepage_2MB_avail = 1;
- if (ms[i].hugepage_sz == RTE_PGSIZE_1G)
- hugepage_1GB_avail = 1;
- if (ms[i].hugepage_sz == RTE_PGSIZE_16M)
- hugepage_16MB_avail = 1;
- if (ms[i].hugepage_sz == RTE_PGSIZE_16G)
- hugepage_16GB_avail = 1;
- }
+
+ memset(&wa, 0, sizeof(wa));
+
+ rte_memseg_walk(find_available_pagesz, &wa);
+
+ hugepage_2MB_avail = wa.hugepage_2MB_avail;
+ hugepage_1GB_avail = wa.hugepage_1GB_avail;
+ hugepage_16MB_avail = wa.hugepage_16MB_avail;
+ hugepage_16GB_avail = wa.hugepage_16GB_avail;
+
/* Display the availability of 2MB ,1GB, 16MB, 16GB pages */
if (hugepage_2MB_avail)
printf("2MB Huge pages available\n");
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:51 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
drivers/crypto/dpaa_sec/dpaa_sec.c | 11 +----------
1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/drivers/crypto/dpaa_sec/dpaa_sec.c b/drivers/crypto/dpaa_sec/dpaa_sec.c
index c5191ce..b04510f 100644
--- a/drivers/crypto/dpaa_sec/dpaa_sec.c
+++ b/drivers/crypto/dpaa_sec/dpaa_sec.c
@@ -120,16 +120,7 @@ dpaa_mem_vtop_ctx(struct dpaa_sec_op_ctx *ctx, void *vaddr)
static inline void *
dpaa_mem_ptov(rte_iova_t paddr)
{
- const struct rte_memseg *memseg = rte_eal_get_physmem_layout();
- int i;
-
- for (i = 0; i < RTE_MAX_MEMSEG && memseg[i].addr_64 != 0; i++) {
- if (paddr >= memseg[i].iova &&
- paddr < memseg[i].iova + memseg[i].len)
- return (void *)(size_t)(memseg[i].addr_64 +
- (paddr - memseg[i].iova));
- }
- return NULL;
+ return rte_mem_iova2virt(paddr);
}

static void
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:54 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
drivers/net/mlx4/mlx4_mr.c | 17 +++++++----------
1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c
index 9a1e4de..47dd542 100644
--- a/drivers/net/mlx4/mlx4_mr.c
+++ b/drivers/net/mlx4/mlx4_mr.c
@@ -126,7 +126,7 @@ mlx4_check_mempool(struct rte_mempool *mp, uintptr_t *start, uintptr_t *end)
struct mlx4_mr *
mlx4_mr_get(struct priv *priv, struct rte_mempool *mp)
{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+ const struct rte_memseg *ms;
uintptr_t start;
uintptr_t end;
unsigned int i;
@@ -142,16 +142,13 @@ mlx4_mr_get(struct priv *priv, struct rte_mempool *mp)
(void *)mp, (void *)start, (void *)end,
(size_t)(end - start));
/* Round start and end to page boundary if found in memory segments. */
- for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
- uintptr_t addr = (uintptr_t)ms[i].addr;
- size_t len = ms[i].len;
- unsigned int align = ms[i].hugepage_sz;
+ ms = rte_mem_virt2memseg((void *)start);
+ if (ms != NULL)
+ start = RTE_ALIGN_FLOOR(start, ms->hugepage_sz);
+ ms = rte_mem_virt2memseg((void *)end);
+ if (ms != NULL)
+ end = RTE_ALIGN_CEIL(end, ms->hugepage_sz);

- if ((start > addr) && (start < addr + len))
- start = RTE_ALIGN_FLOOR(start, align);
- if ((end > addr) && (end < addr + len))
- end = RTE_ALIGN_CEIL(end, align);
- }
DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
(void *)mp, (void *)start, (void *)end,
(size_t)(end - start));
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:49 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
drivers/bus/dpaa/rte_dpaa_bus.h | 12 +-----------
drivers/event/dpaa/Makefile | 3 +++
drivers/mempool/dpaa/Makefile | 3 +++
drivers/net/dpaa/Makefile | 3 +++
4 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/drivers/bus/dpaa/rte_dpaa_bus.h b/drivers/bus/dpaa/rte_dpaa_bus.h
index 718701b..89aeac2 100644
--- a/drivers/bus/dpaa/rte_dpaa_bus.h
+++ b/drivers/bus/dpaa/rte_dpaa_bus.h
@@ -98,17 +98,7 @@ struct dpaa_portal {
/* TODO - this is costly, need to write a fast coversion routine */
static inline void *rte_dpaa_mem_ptov(phys_addr_t paddr)
{
- const struct rte_memseg *memseg = rte_eal_get_physmem_layout();
- int i;
-
- for (i = 0; i < RTE_MAX_MEMSEG && memseg[i].addr != NULL; i++) {
- if (paddr >= memseg[i].iova && paddr <
- memseg[i].iova + memseg[i].len)
- return (uint8_t *)(memseg[i].addr) +
- (paddr - memseg[i].iova);
- }
-
- return NULL;
+ return rte_mem_iova2virt(paddr);
}

/**
diff --git a/drivers/event/dpaa/Makefile b/drivers/event/dpaa/Makefile
index ddd8552..df3cae8 100644
--- a/drivers/event/dpaa/Makefile
+++ b/drivers/event/dpaa/Makefile
@@ -26,6 +26,9 @@ EXPORT_MAP := rte_pmd_dpaa_event_version.map

LIBABIVER := 1

+# depends on dpaa bus which uses experimental API
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
# Interfaces with DPDK
SRCS-$(CONFIG_RTE_LIBRTE_PMD_DPAA_EVENTDEV) += dpaa_eventdev.c

diff --git a/drivers/mempool/dpaa/Makefile b/drivers/mempool/dpaa/Makefile
index 4c0d7aa..da8da1e 100644
--- a/drivers/mempool/dpaa/Makefile
+++ b/drivers/mempool/dpaa/Makefile
@@ -22,6 +22,9 @@ EXPORT_MAP := rte_mempool_dpaa_version.map
# Lbrary version
LIBABIVER := 1

+# depends on dpaa bus which uses experimental API
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
# all source are stored in SRCS-y
#
SRCS-$(CONFIG_RTE_LIBRTE_DPAA_MEMPOOL) += dpaa_mempool.c
diff --git a/drivers/net/dpaa/Makefile b/drivers/net/dpaa/Makefile
index 9c2a5ea..d7a0a50 100644
--- a/drivers/net/dpaa/Makefile
+++ b/drivers/net/dpaa/Makefile
@@ -27,6 +27,9 @@ EXPORT_MAP := rte_pmd_dpaa_version.map

LIBABIVER := 1

+# depends on dpaa bus which uses experimental API
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
# Interfaces with DPDK
SRCS-$(CONFIG_RTE_LIBRTE_DPAA_PMD) += dpaa_ethdev.c
SRCS-$(CONFIG_RTE_LIBRTE_DPAA_PMD) += dpaa_rxtx.c
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:53 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
drivers/bus/fslmc/portal/dpaa2_hw_pvt.h | 14 ++++----------
1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h b/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
index d38fc49..45fd41e 100644
--- a/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
+++ b/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
@@ -270,20 +270,14 @@ static phys_addr_t dpaa2_mem_vtop(uint64_t vaddr) __attribute__((unused));
static phys_addr_t dpaa2_mem_vtop(uint64_t vaddr)
{
const struct rte_memseg *memseg;
- int i;

if (dpaa2_virt_mode)
return vaddr;

- memseg = rte_eal_get_physmem_layout();
-
- for (i = 0; i < RTE_MAX_MEMSEG && memseg[i].addr_64 != 0; i++) {
- if (vaddr >= memseg[i].addr_64 &&
- vaddr < memseg[i].addr_64 + memseg[i].len)
- return memseg[i].iova
- + (vaddr - memseg[i].addr_64);
- }
- return (size_t)(NULL);
+ memseg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr);
+ if (memseg)
+ return memseg->phys_addr + RTE_PTR_DIFF(vaddr, memseg->addr);
+ return (size_t)NULL;
}

/**
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:55 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
drivers/net/mlx5/mlx5_mr.c | 18 ++++++++----------
1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index 2bf1f9c..d8c04dc 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -234,7 +234,7 @@ struct mlx5_mr *
mlx5_mr_new(struct rte_eth_dev *dev, struct rte_mempool *mp)
{
struct priv *priv = dev->data->dev_private;
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+ const struct rte_memseg *ms;
uintptr_t start;
uintptr_t end;
unsigned int i;
@@ -261,17 +261,15 @@ mlx5_mr_new(struct rte_eth_dev *dev, struct rte_mempool *mp)
/* Save original addresses for exact MR lookup. */
mr->start = start;
mr->end = end;
+
/* Round start and end to page boundary if found in memory segments. */
- for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
- uintptr_t addr = (uintptr_t)ms[i].addr;
- size_t len = ms[i].len;
- unsigned int align = ms[i].hugepage_sz;
+ ms = rte_mem_virt2memseg((void *)start);
+ if (ms != NULL)
+ start = RTE_ALIGN_FLOOR(start, ms->hugepage_sz);
+ ms = rte_mem_virt2memseg((void *)end);
+ if (ms != NULL)
+ end = RTE_ALIGN_CEIL(end, ms->hugepage_sz);

- if ((start > addr) && (start < addr + len))
- start = RTE_ALIGN_FLOOR(start, align);
- if ((end > addr) && (end < addr + len))
- end = RTE_ALIGN_CEIL(end, align);
- }
DRV_LOG(DEBUG,
"port %u mempool %p using start=%p end=%p size=%zu for memory"
" region",
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:56 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
drivers/crypto/dpaa_sec/dpaa_sec.c | 19 +++++--------------
1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/drivers/crypto/dpaa_sec/dpaa_sec.c b/drivers/crypto/dpaa_sec/dpaa_sec.c
index b04510f..a14e669 100644
--- a/drivers/crypto/dpaa_sec/dpaa_sec.c
+++ b/drivers/crypto/dpaa_sec/dpaa_sec.c
@@ -93,20 +93,11 @@ dpaa_sec_alloc_ctx(dpaa_sec_session *ses)
static inline rte_iova_t
dpaa_mem_vtop(void *vaddr)
{
- const struct rte_memseg *memseg = rte_eal_get_physmem_layout();
- uint64_t vaddr_64, paddr;
- int i;
-
- vaddr_64 = (size_t)vaddr;
- for (i = 0; i < RTE_MAX_MEMSEG && memseg[i].addr_64 != 0; i++) {
- if (vaddr_64 >= memseg[i].addr_64 &&
- vaddr_64 < memseg[i].addr_64 + memseg[i].len) {
- paddr = memseg[i].iova +
- (vaddr_64 - memseg[i].addr_64);
-
- return (rte_iova_t)paddr;
- }
- }
+ const struct rte_memseg *ms;
+
+ ms = rte_mem_virt2memseg(vaddr);
+ if (ms)
+ return ms->iova + RTE_PTR_DIFF(vaddr, ms->addr);
return (size_t)NULL;
}
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:59 UTC
Permalink
This adds a "--legacy-mem" command-line switch. It will be used to
go back to the old memory behavior, one where we can't dynamically
allocate/free memory (the downside), but one where the user can
get physically contiguous memory, like before (the upside).

For now, nothing but the legacy behavior exists; the non-legacy
memory init sequence will be added later. For FreeBSD, non-legacy
memory init will never be enabled, while for Linux it is
disabled in this patch to avoid breaking bisect, and will be
enabled once non-legacy mode is fully operational.
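
For illustration only, a rough sketch of how an application might opt
into the old behavior through its EAL arguments (the application name
and core list below are placeholders, not part of this patch):

    #include <rte_eal.h>
    #include <rte_common.h>

    int
    main(void)
    {
        /* placeholder argv; --legacy-mem is the only new part */
        char *argv[] = { "app", "-l", "0-1", "--legacy-mem" };

        if (rte_eal_init(RTE_DIM(argv), argv) < 0)
            return -1;
        /* memory is now pre-allocated up front, as before */
        return 0;
    }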

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3:
- Move to earlier in the patchset
- Make Linuxapp always load in legacy mode

lib/librte_eal/bsdapp/eal/eal.c | 3 +++
lib/librte_eal/common/eal_common_options.c | 4 ++++
lib/librte_eal/common/eal_internal_cfg.h | 4 ++++
lib/librte_eal/common/eal_options.h | 2 ++
lib/librte_eal/linuxapp/eal/eal.c | 3 +++
lib/librte_eal/linuxapp/eal/eal_memory.c | 24 ++++++++++++++++++++----
6 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 032a5ea..f44b904 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -534,6 +534,9 @@ rte_eal_init(int argc, char **argv)
return -1;
}

+ /* FreeBSD always uses legacy memory model */
+ internal_config.legacy_mem = true;
+
if (eal_plugins_init() < 0) {
rte_eal_init_alert("Cannot init plugins\n");
rte_errno = EINVAL;
diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 8a51ade..fb5ea03 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -73,6 +73,7 @@ eal_long_options[] = {
{OPT_VDEV, 1, NULL, OPT_VDEV_NUM },
{OPT_VFIO_INTR, 1, NULL, OPT_VFIO_INTR_NUM },
{OPT_VMWARE_TSC_MAP, 0, NULL, OPT_VMWARE_TSC_MAP_NUM },
+ {OPT_LEGACY_MEM, 0, NULL, OPT_LEGACY_MEM_NUM },
{0, 0, NULL, 0 }
};

@@ -1184,6 +1185,9 @@ eal_parse_common_option(int opt, const char *optarg,

core_parsed = LCORE_OPT_MAP;
break;
+ case OPT_LEGACY_MEM_NUM:
+ conf->legacy_mem = 1;
+ break;

/* don't know what to do, leave this to caller */
default:
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index a0082d1..fda087b 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -47,6 +47,10 @@ struct internal_config {
volatile unsigned force_sockets;
volatile uint64_t socket_mem[RTE_MAX_NUMA_NODES]; /**< amount of memory per socket */
uintptr_t base_virtaddr; /**< base address to try and reserve memory from */
+ volatile unsigned legacy_mem;
+ /**< true to enable legacy memory behavior (no dynamic allocation,
+ * IOVA-contiguous segments).
+ */
volatile int syslog_facility; /**< facility passed to openlog() */
/** default interrupt mode for VFIO */
volatile enum rte_intr_mode vfio_intr_mode;
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index e86c711..d301d0b 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -55,6 +55,8 @@ enum {
OPT_VFIO_INTR_NUM,
#define OPT_VMWARE_TSC_MAP "vmware-tsc-map"
OPT_VMWARE_TSC_MAP_NUM,
+#define OPT_LEGACY_MEM "legacy-mem"
+ OPT_LEGACY_MEM_NUM,
OPT_LONG_MAX_NUM
};

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 77f6cb7..b34e57a 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -348,6 +348,7 @@ eal_usage(const char *prgname)
" --"OPT_BASE_VIRTADDR" Base virtual address\n"
" --"OPT_CREATE_UIO_DEV" Create /dev/uioX (usually done by hotplug)\n"
" --"OPT_VFIO_INTR" Interrupt mode for VFIO (legacy|msi|msix)\n"
+ " --"OPT_LEGACY_MEM" Legacy memory mode (no dynamic allocation, contiguous segments)\n"
"\n");
/* Allow the application to print its usage message too if hook is set */
if ( rte_application_usage_hook ) {
@@ -767,6 +768,8 @@ rte_eal_init(int argc, char **argv)
rte_atomic32_clear(&run_once);
return -1;
}
+ /* for now, always set legacy mem */
+ internal_config.legacy_mem = 1;

if (eal_plugins_init() < 0) {
rte_eal_init_alert("Cannot init plugins\n");
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5642cc8..1d3defe 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -922,8 +922,8 @@ huge_recover_sigbus(void)
* 6. unmap the first mapping
* 7. fill memsegs in configuration with contiguous zones
*/
-int
-rte_eal_hugepage_init(void)
+static int
+eal_legacy_hugepage_init(void)
{
struct rte_mem_config *mcfg;
struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
@@ -1266,8 +1266,8 @@ getFileSize(int fd)
* configuration and finds the hugepages which form that segment, mapping them
* in order to form a contiguous block in the virtual memory space
*/
-int
-rte_eal_hugepage_attach(void)
+static int
+eal_legacy_hugepage_attach(void)
{
const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct hugepage_file *hp = NULL;
@@ -1403,6 +1403,22 @@ rte_eal_hugepage_attach(void)
}

int
+rte_eal_hugepage_init(void)
+{
+ if (internal_config.legacy_mem)
+ return eal_legacy_hugepage_init();
+ return -1;
+}
+
+int
+rte_eal_hugepage_attach(void)
+{
+ if (internal_config.legacy_mem)
+ return eal_legacy_hugepage_attach();
+ return -1;
+}
+
+int
rte_eal_using_phys_addrs(void)
{
return phys_addrs_available;
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:58 UTC
Permalink
Currently it is not possible to use memory that is not owned by DPDK to
perform DMA. This scenario can occur in vhost applications (such as
SPDK), where the guest sends its own memory table. To fill this gap,
provide an API to register an arbitrary address in the VFIO container.
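
A minimal sketch of the intended usage, assuming an externally
allocated buffer whose IOVA is already known (ext_buf, ext_iova and
ext_len are hypothetical and not part of this patch):

    #include <stdint.h>
    #include <rte_vfio.h>

    /* Register memory not owned by DPDK so a VFIO device can DMA to it. */
    static int
    register_external_memory(void *ext_buf, uint64_t ext_iova, uint64_t ext_len)
    {
        uint64_t vaddr = (uint64_t)(uintptr_t)ext_buf;

        if (rte_vfio_dma_map(vaddr, ext_iova, ext_len) < 0)
            return -1; /* e.g. VFIO container not set up yet */

        /* ... device may now DMA to/from ext_buf ... */

        return rte_vfio_dma_unmap(vaddr, ext_iova, ext_len);
    }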

Signed-off-by: Pawel Wodkowski <***@intel.com>
Signed-off-by: Anatoly Burakov <***@intel.com>
Signed-off-by: Gowrishankar Muthukrishnan <***@linux.vnet.ibm.com>
---

Notes:
v3:
- Added PPC64, courtesy of Gowrishankar

v3:
- Moved to earlier in the patchset
- Made API experimental
- Do not print out error message if init isn't finished
- SPAPR code provided by Gowrishankar

lib/librte_eal/bsdapp/eal/eal.c | 16 ++
lib/librte_eal/common/include/rte_vfio.h | 39 ++++
lib/librte_eal/linuxapp/eal/eal_vfio.c | 347 ++++++++++++++++++++++++-------
lib/librte_eal/linuxapp/eal/eal_vfio.h | 12 ++
lib/librte_eal/rte_eal_version.map | 2 +
5 files changed, 341 insertions(+), 75 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 8e25d78..032a5ea 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -749,6 +749,8 @@ int rte_vfio_enable(const char *modname);
int rte_vfio_is_enabled(const char *modname);
int rte_vfio_noiommu_is_enabled(void);
int rte_vfio_clear_group(int vfio_group_fd);
+int rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len);
+int rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len);

int rte_vfio_setup_device(__rte_unused const char *sysfs_base,
__rte_unused const char *dev_addr,
@@ -784,3 +786,17 @@ int rte_vfio_clear_group(__rte_unused int vfio_group_fd)
{
return 0;
}
+
+int __rte_experimental
+rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova,
+ __rte_unused uint64_t len)
+{
+ return -1;
+}
+
+int __rte_experimental
+rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
+ __rte_unused uint64_t len)
+{
+ return -1;
+}
diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
index 249095e..bd4663c 100644
--- a/lib/librte_eal/common/include/rte_vfio.h
+++ b/lib/librte_eal/common/include/rte_vfio.h
@@ -127,6 +127,45 @@ int rte_vfio_noiommu_is_enabled(void);
int
rte_vfio_clear_group(int vfio_group_fd);

+/**
+ * Map memory region for use with VFIO.
+ *
+ * @param vaddr
+ * Starting virtual address of memory to be mapped.
+ *
+ * @param iova
+ * Starting IOVA address of memory to be mapped.
+ *
+ * @param len
+ * Length of memory segment being mapped.
+ *
+ * @return
+ * 0 if success.
+ * -1 on error.
+ */
+int __rte_experimental
+rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len);
+
+
+/**
+ * Unmap memory region from VFIO.
+ *
+ * @param vaddr
+ * Starting virtual address of memory to be unmapped.
+ *
+ * @param iova
+ * Starting IOVA address of memory to be unmapped.
+ *
+ * @param len
+ * Length of memory segment being unmapped.
+ *
+ * @return
+ * 0 if success.
+ * -1 on error.
+ */
+int __rte_experimental
+rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len);
+
#ifdef __cplusplus
}
#endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index fb41e82..f6fe93e 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -22,17 +22,35 @@
static struct vfio_config vfio_cfg;

static int vfio_type1_dma_map(int);
+static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
static int vfio_spapr_dma_map(int);
+static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
static int vfio_noiommu_dma_map(int);
+static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);

/* IOMMU types we support */
static const struct vfio_iommu_type iommu_types[] = {
/* x86 IOMMU, otherwise known as type 1 */
- { RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map},
+ {
+ .type_id = RTE_VFIO_TYPE1,
+ .name = "Type 1",
+ .dma_map_func = &vfio_type1_dma_map,
+ .dma_user_map_func = &vfio_type1_dma_mem_map
+ },
/* ppc64 IOMMU, otherwise known as spapr */
- { RTE_VFIO_SPAPR, "sPAPR", &vfio_spapr_dma_map},
+ {
+ .type_id = RTE_VFIO_SPAPR,
+ .name = "sPAPR",
+ .dma_map_func = &vfio_spapr_dma_map,
+ .dma_user_map_func = &vfio_spapr_dma_mem_map
+ },
/* IOMMU-less mode */
- { RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map},
+ {
+ .type_id = RTE_VFIO_NOIOMMU,
+ .name = "No-IOMMU",
+ .dma_map_func = &vfio_noiommu_dma_map,
+ .dma_user_map_func = &vfio_noiommu_dma_mem_map
+ },
};

int
@@ -333,9 +351,10 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
*/
if (internal_config.process_type == RTE_PROC_PRIMARY &&
vfio_cfg.vfio_active_groups == 1) {
+ const struct vfio_iommu_type *t;
+
/* select an IOMMU type which we will be using */
- const struct vfio_iommu_type *t =
- vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
+ t = vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
if (!t) {
RTE_LOG(ERR, EAL,
" %s failed to select IOMMU type\n",
@@ -353,6 +372,8 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
rte_vfio_clear_group(vfio_group_fd);
return -1;
}
+
+ vfio_cfg.vfio_iommu_type = t;
}
}

@@ -668,23 +689,49 @@ static int
type1_map(const struct rte_memseg *ms, void *arg)
{
int *vfio_container_fd = arg;
+
+ return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
+ ms->len, 1);
+}
+
+static int
+vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
+ uint64_t len, int do_map)
+{
struct vfio_iommu_type1_dma_map dma_map;
+ struct vfio_iommu_type1_dma_unmap dma_unmap;
int ret;

- memset(&dma_map, 0, sizeof(dma_map));
- dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
- dma_map.vaddr = ms->addr_64;
- dma_map.size = ms->len;
- dma_map.iova = ms->iova;
- dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
-
- ret = ioctl(*vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+ if (do_map != 0) {
+ memset(&dma_map, 0, sizeof(dma_map));
+ dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+ dma_map.vaddr = vaddr;
+ dma_map.size = len;
+ dma_map.iova = iova;
+ dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+ VFIO_DMA_MAP_FLAG_WRITE;

- if (ret) {
- RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n",
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n",
errno, strerror(errno));
- return -1;
+ return -1;
+ }
+ } else {
+ memset(&dma_unmap, 0, sizeof(dma_unmap));
+ dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
+ dma_unmap.size = len;
+ dma_unmap.iova = iova;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
+ &dma_unmap);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
}
+
return 0;
}

@@ -694,12 +741,78 @@ vfio_type1_dma_map(int vfio_container_fd)
return rte_memseg_walk(type1_map, &vfio_container_fd);
}

+static int
+vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
+ uint64_t len, int do_map)
+{
+ struct vfio_iommu_type1_dma_map dma_map;
+ struct vfio_iommu_type1_dma_unmap dma_unmap;
+ int ret;
+
+ if (do_map != 0) {
+ memset(&dma_map, 0, sizeof(dma_map));
+ dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+ dma_map.vaddr = vaddr;
+ dma_map.size = len;
+ dma_map.iova = iova;
+ dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+ VFIO_DMA_MAP_FLAG_WRITE;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ } else {
+ struct vfio_iommu_spapr_register_memory reg = {
+ .argsz = sizeof(reg),
+ .flags = 0
+ };
+ reg.vaddr = (uintptr_t) vaddr;
+ reg.size = len;
+
+ ret = ioctl(vfio_container_fd,
+ VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot unregister vaddr for IOMMU, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ memset(&dma_unmap, 0, sizeof(dma_unmap));
+ dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
+ dma_unmap.size = len;
+ dma_unmap.iova = iova;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
+ &dma_unmap);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int
+vfio_spapr_map_walk(const struct rte_memseg *ms, void *arg)
+{
+ int *vfio_container_fd = arg;
+
+ return vfio_spapr_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
+ ms->len, 1);
+}
+
struct spapr_walk_param {
uint64_t window_size;
uint64_t hugepage_sz;
};
static int
-spapr_window_size(const struct rte_memseg *ms, void *arg)
+vfio_spapr_window_size_walk(const struct rte_memseg *ms, void *arg)
{
struct spapr_walk_param *param = arg;
uint64_t max = ms->iova + ms->len;
@@ -713,39 +826,43 @@ spapr_window_size(const struct rte_memseg *ms, void *arg)
}

static int
-spapr_map(const struct rte_memseg *ms, void *arg)
-{
- struct vfio_iommu_type1_dma_map dma_map;
- struct vfio_iommu_spapr_register_memory reg = {
- .argsz = sizeof(reg),
- .flags = 0
+vfio_spapr_create_new_dma_window(int vfio_container_fd,
+ struct vfio_iommu_spapr_tce_create *create) {
+ struct vfio_iommu_spapr_tce_remove remove = {
+ .argsz = sizeof(remove),
+ };
+ struct vfio_iommu_spapr_tce_info info = {
+ .argsz = sizeof(info),
};
- int *vfio_container_fd = arg;
int ret;

- reg.vaddr = (uintptr_t) ms->addr;
- reg.size = ms->len;
- ret = ioctl(*vfio_container_fd,
- VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
+ /* query spapr iommu info */
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
if (ret) {
- RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, error %i (%s)\n",
- errno, strerror(errno));
+ RTE_LOG(ERR, EAL, " cannot get iommu info, "
+ "error %i (%s)\n", errno, strerror(errno));
return -1;
}

- memset(&dma_map, 0, sizeof(dma_map));
- dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
- dma_map.vaddr = ms->addr_64;
- dma_map.size = ms->len;
- dma_map.iova = ms->iova;
- dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
- VFIO_DMA_MAP_FLAG_WRITE;
-
- ret = ioctl(*vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+ /* remove default DMA of 32 bit window */
+ remove.start_addr = info.dma32_window_start;
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot remove default DMA window, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }

+ /* create new DMA window */
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, create);
if (ret) {
- RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n",
- errno, strerror(errno));
+ RTE_LOG(ERR, EAL, " cannot create new DMA window, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ if (create->start_addr != 0) {
+ RTE_LOG(ERR, EAL, " DMA window start address != 0\n");
return -1;
}

@@ -753,61 +870,82 @@ spapr_map(const struct rte_memseg *ms, void *arg)
}

static int
-vfio_spapr_dma_map(int vfio_container_fd)
+vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
+ uint64_t len, int do_map)
{
struct spapr_walk_param param;
- int ret;
- struct vfio_iommu_spapr_tce_info info = {
- .argsz = sizeof(info),
- };
struct vfio_iommu_spapr_tce_create create = {
.argsz = sizeof(create),
};
- struct vfio_iommu_spapr_tce_remove remove = {
- .argsz = sizeof(remove),
- };

+ /* check if window size needs to be adjusted */
memset(&param, 0, sizeof(param));

- /* query spapr iommu info */
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot get iommu info, "
- "error %i (%s)\n", errno, strerror(errno));
+ if (rte_memseg_walk(vfio_spapr_window_size_walk, &param) < 0) {
+ RTE_LOG(ERR, EAL, "Could not get window size\n");
return -1;
}

- /* remove default DMA of 32 bit window */
- remove.start_addr = info.dma32_window_start;
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot remove default DMA window, "
- "error %i (%s)\n", errno, strerror(errno));
- return -1;
+ /* sPAPR requires window size to be a power of 2 */
+ create.window_size = rte_align64pow2(param.window_size);
+ create.page_shift = __builtin_ctzll(param.hugepage_sz);
+ create.levels = 1;
+
+ if (do_map) {
+ /* re-create window and remap the entire memory */
+ if (iova > create.window_size) {
+ if (vfio_spapr_create_new_dma_window(vfio_container_fd,
+ &create) < 0) {
+ RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
+ return -1;
+ }
+ if (rte_memseg_walk(vfio_spapr_map_walk,
+ &vfio_container_fd) < 0) {
+ RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n");
+ return -1;
+ }
+ }
+ /* now that we've remapped all of the memory that was present
+ * before, map the segment that we were requested to map.
+ */
+ if (vfio_spapr_dma_do_map(vfio_container_fd,
+ vaddr, iova, len, 1) < 0) {
+ RTE_LOG(ERR, EAL, "Could not map segment\n");
+ return -1;
+ }
+ } else {
+
+ /* for unmap, check if iova within DMA window */
+ if (iova > create.window_size) {
+ RTE_LOG(ERR, EAL, "iova beyond DMA window for unmap");
+ return -1;
+ }
+
+ vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 0);
}
+ return 0;
+}
+
+static int
+vfio_spapr_dma_map(int vfio_container_fd)
+{
+ struct vfio_iommu_spapr_tce_create create = {
+ .argsz = sizeof(create),
+ };
+ struct spapr_walk_param param;
+
+ memset(&param, 0, sizeof(param));

/* create DMA window from 0 to max(phys_addr + len) */
- rte_memseg_walk(spapr_window_size, &param);
+ rte_memseg_walk(vfio_spapr_window_size_walk, &param);

/* sPAPR requires window size to be a power of 2 */
create.window_size = rte_align64pow2(param.window_size);
create.page_shift = __builtin_ctzll(param.hugepage_sz);
create.levels = 1;

- ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot create new DMA window, "
- "error %i (%s)\n", errno, strerror(errno));
- return -1;
- }
-
- if (create.start_addr != 0) {
- RTE_LOG(ERR, EAL, " DMA window start address != 0\n");
- return -1;
- }
-
/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
- if (rte_memseg_walk(spapr_map, &vfio_container_fd) < 0)
+ if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0)
return -1;

return 0;
@@ -820,6 +958,49 @@ vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
return 0;
}

+static int
+vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd,
+ uint64_t __rte_unused vaddr,
+ uint64_t __rte_unused iova, uint64_t __rte_unused len,
+ int __rte_unused do_map)
+{
+ /* No-IOMMU mode does not need DMA mapping */
+ return 0;
+}
+
+static int
+vfio_dma_mem_map(uint64_t vaddr, uint64_t iova, uint64_t len, int do_map)
+{
+ const struct vfio_iommu_type *t = vfio_cfg.vfio_iommu_type;
+
+ if (!t) {
+ RTE_LOG(ERR, EAL, " VFIO support not initialized\n");
+ return -1;
+ }
+
+ if (!t->dma_user_map_func) {
+ RTE_LOG(ERR, EAL,
+ " VFIO custom DMA region maping not supported by IOMMU %s\n",
+ t->name);
+ return -1;
+ }
+
+ return t->dma_user_map_func(vfio_cfg.vfio_container_fd, vaddr, iova,
+ len, do_map);
+}
+
+int __rte_experimental
+rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len)
+{
+ return vfio_dma_mem_map(vaddr, iova, len, 1);
+}
+
+int __rte_experimental
+rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len)
+{
+ return vfio_dma_mem_map(vaddr, iova, len, 0);
+}
+
int
rte_vfio_noiommu_is_enabled(void)
{
@@ -852,4 +1033,20 @@ rte_vfio_noiommu_is_enabled(void)
return c == 'Y';
}

+#else
+
+int __rte_experimental
+rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova,
+ __rte_unused uint64_t len)
+{
+ return -1;
+}
+
+int __rte_experimental
+rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
+ __rte_unused uint64_t len)
+{
+ return -1;
+}
+
#endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 8059577..549f442 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -19,6 +19,7 @@

#ifdef VFIO_PRESENT

+#include <stdint.h>
#include <linux/vfio.h>

#define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU
@@ -26,6 +27,7 @@
#ifndef VFIO_SPAPR_TCE_v2_IOMMU
#define RTE_VFIO_SPAPR 7
#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17)
+#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18)
#define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19)
#define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20)

@@ -110,6 +112,7 @@ struct vfio_config {
int vfio_enabled;
int vfio_container_fd;
int vfio_active_groups;
+ const struct vfio_iommu_type *vfio_iommu_type;
struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
};

@@ -119,9 +122,18 @@ struct vfio_config {
* */
typedef int (*vfio_dma_func_t)(int);

+/* Custom memory region DMA mapping function prototype.
+ * Takes VFIO container fd, virtual address, phisical address, length and
+ * operation type (0 to unmap 1 for map) as a parameters.
+ * Returns 0 on success, -1 on error.
+ **/
+typedef int (*vfio_dma_user_func_t)(int fd, uint64_t vaddr, uint64_t iova,
+ uint64_t len, int do_map);
+
struct vfio_iommu_type {
int type_id;
const char *name;
+ vfio_dma_user_func_t dma_user_map_func;
vfio_dma_func_t dma_map_func;
};

diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 70ec778..fe4a9c9 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -266,5 +266,7 @@ EXPERIMENTAL {
rte_service_start_with_defaults;
rte_socket_count;
rte_socket_id_by_idx;
+ rte_vfio_dma_map;
+ rte_vfio_dma_unmap;

} DPDK_18.02;
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:44 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/linuxapp/eal/eal_vfio.c | 45 ++++++++++++++++------------------
1 file changed, 21 insertions(+), 24 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 2421d51..2a34ae9 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -665,39 +665,36 @@ vfio_get_group_no(const char *sysfs_base,
}

static int
-vfio_type1_dma_map(int vfio_container_fd)
+type1_map(const struct rte_memseg *ms, void *arg)
{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
- int i, ret;
+ int *vfio_container_fd = arg;
+ struct vfio_iommu_type1_dma_map dma_map;
+ int ret;

- /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- struct vfio_iommu_type1_dma_map dma_map;
+ memset(&dma_map, 0, sizeof(dma_map));
+ dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+ dma_map.vaddr = ms->addr_64;
+ dma_map.size = ms->len;
+ dma_map.iova = ms->iova;
+ dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;

- if (ms[i].addr == NULL)
- break;
-
- memset(&dma_map, 0, sizeof(dma_map));
- dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
- dma_map.vaddr = ms[i].addr_64;
- dma_map.size = ms[i].len;
- dma_map.iova = ms[i].iova;
- dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+ ret = ioctl(*vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);

- ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
-
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
- "error %i (%s)\n", errno,
- strerror(errno));
- return -1;
- }
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
}
-
return 0;
}

static int
+vfio_type1_dma_map(int vfio_container_fd)
+{
+ return rte_memseg_walk(type1_map, &vfio_container_fd);
+}
+
+static int
vfio_spapr_dma_map(int vfio_container_fd)
{
const struct rte_memseg *ms = rte_eal_get_physmem_layout();
--
2.7.4
Anatoly Burakov
2018-04-03 23:22:04 UTC
Permalink
This isn't used anywhere yet, but the support is now there. Also, add
cleanup to the allocation procedures, so that if we fail to allocate
everything we asked for, we can free it all back.
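
As an EAL-internal sketch (not a public API; the 2 MB page size,
socket 0 and the use_segments() helper are illustrative assumptions),
a failure later in a sequence can now be rolled back with the new
free path:

    #include <rte_common.h>
    #include <rte_memory.h>
    #include "eal_memalloc.h"

    static int
    alloc_and_use_pages(void)
    {
        struct rte_memseg *ms[8];
        int n_segs = RTE_DIM(ms);

        /* ask for exactly n_segs 2M pages on socket 0 */
        if (eal_memalloc_alloc_seg_bulk(ms, n_segs, RTE_PGSIZE_2M, 0, true) < 0)
            return -1; /* allocation already cleaned up after itself */

        if (use_segments(ms, n_segs) < 0) {
            /* hand every page back to the system */
            eal_memalloc_free_seg_bulk(ms, n_segs);
            return -1;
        }
        return 0;
    }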

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/bsdapp/eal/eal_memalloc.c | 15 +++
lib/librte_eal/common/eal_memalloc.h | 14 +++
lib/librte_eal/linuxapp/eal/eal_memalloc.c | 149 ++++++++++++++++++++++++++++-
3 files changed, 177 insertions(+), 1 deletion(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal_memalloc.c b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
index 8c30670..e7bcd2b 100644
--- a/lib/librte_eal/bsdapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
@@ -24,3 +24,18 @@ eal_memalloc_alloc_seg(size_t __rte_unused page_sz, int __rte_unused socket)
RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
return NULL;
}
+
+int
+eal_memalloc_free_seg(struct rte_memseg *ms __rte_unused)
+{
+ RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
+ return -1;
+}
+
+int
+eal_memalloc_free_seg_bulk(struct rte_memseg **ms __rte_unused,
+ int n_segs __rte_unused)
+{
+ RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
+ return -1;
+}
diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
index f628514..6017345 100644
--- a/lib/librte_eal/common/eal_memalloc.h
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -28,4 +28,18 @@ int
eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
int socket, bool exact);

+/*
+ * Deallocate segment
+ */
+int
+eal_memalloc_free_seg(struct rte_memseg *ms);
+
+/*
+ * Deallocate `n_segs` segments. Returns 0 on successful deallocation of all
+ * segments, returns -1 on error. Any segments that could have been deallocated,
+ * will be deallocated even in case of error.
+ */
+int
+eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs);
+
#endif // EAL_MEMALLOC_H
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index 45ea0ad..118b12d 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -289,6 +289,48 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
return -1;
}

+static int
+free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
+ unsigned int list_idx, unsigned int seg_idx)
+{
+ char path[PATH_MAX];
+ int fd, ret;
+
+ /* erase page data */
+ memset(ms->addr, 0, ms->len);
+
+ if (mmap(ms->addr, ms->len, PROT_READ,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
+ MAP_FAILED) {
+ RTE_LOG(DEBUG, EAL, "couldn't unmap page\n");
+ return -1;
+ }
+
+ fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+ if (fd < 0)
+ return -1;
+
+ /* if we're able to take out a write lock, we're the last one
+ * holding onto this page.
+ */
+
+ ret = lock(fd, 0, ms->len, F_WRLCK);
+ if (ret >= 0) {
+ /* no one else is using this page */
+ if (ret == 1)
+ unlink(path);
+ ret = lock(fd, 0, ms->len, F_UNLCK);
+ if (ret != 1)
+ RTE_LOG(ERR, EAL, "%s(): unable to unlock file %s\n",
+ __func__, path);
+ }
+ close(fd);
+
+ memset(ms, 0, sizeof(*ms));
+
+ return ret;
+}
+
struct alloc_walk_param {
struct hugepage_info *hi;
struct rte_memseg **ms;
@@ -305,7 +347,7 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
struct alloc_walk_param *wa = arg;
struct rte_memseg_list *cur_msl;
size_t page_sz;
- int cur_idx;
+ int cur_idx, start_idx, j;
unsigned int msl_idx, need, i;

if (msl->page_sz != wa->page_sz)
@@ -324,6 +366,7 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, need);
if (cur_idx < 0)
return 0;
+ start_idx = cur_idx;

for (i = 0; i < need; i++, cur_idx++) {
struct rte_memseg *cur;
@@ -341,6 +384,25 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
/* if exact number wasn't requested, stop */
if (!wa->exact)
goto out;
+
+ /* clean up */
+ for (j = start_idx; j < cur_idx; j++) {
+ struct rte_memseg *tmp;
+ struct rte_fbarray *arr =
+ &cur_msl->memseg_arr;
+
+ tmp = rte_fbarray_get(arr, j);
+ if (free_seg(tmp, wa->hi, msl_idx,
+ start_idx + j)) {
+ RTE_LOG(ERR, EAL, "Cannot free page\n");
+ continue;
+ }
+
+ rte_fbarray_set_free(arr, j);
+ }
+ /* clear the list */
+ if (wa->ms)
+ memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);
return -1;
}
if (wa->ms)
@@ -351,7 +413,39 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
out:
wa->segs_allocated = i;
return 1;
+}
+
+struct free_walk_param {
+ struct hugepage_info *hi;
+ struct rte_memseg *ms;
+};
+static int
+free_seg_walk(const struct rte_memseg_list *msl, void *arg)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *found_msl;
+ struct free_walk_param *wa = arg;
+ uintptr_t start_addr, end_addr;
+ int msl_idx, seg_idx;

+ start_addr = (uintptr_t) msl->base_va;
+ end_addr = start_addr + msl->memseg_arr.len * (size_t)msl->page_sz;
+
+ if ((uintptr_t)wa->ms->addr < start_addr ||
+ (uintptr_t)wa->ms->addr >= end_addr)
+ return 0;
+
+ msl_idx = msl - mcfg->memsegs;
+ seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;
+
+ /* msl is const */
+ found_msl = &mcfg->memsegs[msl_idx];
+
+ rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);
+ if (free_seg(wa->ms, wa->hi, msl_idx, seg_idx))
+ return -1;
+
+ return 1;
}

int
@@ -427,3 +521,56 @@ eal_memalloc_alloc_seg(size_t page_sz, int socket)
/* return pointer to newly allocated memseg */
return ms;
}
+
+int
+eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
+{
+ int seg, ret = 0;
+
+ /* dynamic free not supported in legacy mode */
+ if (internal_config.legacy_mem)
+ return -1;
+
+ for (seg = 0; seg < n_segs; seg++) {
+ struct rte_memseg *cur = ms[seg];
+ struct hugepage_info *hi = NULL;
+ struct free_walk_param wa;
+ int i, walk_res;
+
+ memset(&wa, 0, sizeof(wa));
+
+ for (i = 0; i < (int)RTE_DIM(internal_config.hugepage_info);
+ i++) {
+ hi = &internal_config.hugepage_info[i];
+ if (cur->hugepage_sz == hi->hugepage_sz) {
+ break;
+ }
+ }
+ if (i == (int)RTE_DIM(internal_config.hugepage_info)) {
+ RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
+ ret = -1;
+ continue;
+ }
+
+ wa.ms = cur;
+ wa.hi = hi;
+
+ walk_res = rte_memseg_list_walk(free_seg_walk, &wa);
+ if (walk_res == 1)
+ continue;
+ if (walk_res == 0)
+ RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
+ ret = -1;
+ }
+ return ret;
+}
+
+int
+eal_memalloc_free_seg(struct rte_memseg *ms)
+{
+ /* dynamic free not supported in legacy mode */
+ if (internal_config.legacy_mem)
+ return -1;
+
+ return eal_memalloc_free_seg_bulk(&ms, 1);
+}
--
2.7.4
Anatoly Burakov
2018-04-03 23:22:08 UTC
Permalink
For non-legacy memory init mode, instead of looking at the generic
sysfs path, look at the sysfs paths pertaining to each NUMA node
for hugepage counts. Note that the per-NUMA-node paths do not
provide information regarding reserved pages, so we might not
get the best info from them, but this saves us from the
whole mapping/remapping business before we're actually able to
tell which page is on which socket, because we no longer require
our memory to be physically contiguous.

Legacy memory init will not use this.
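
For illustration, with 2 MB pages the per-node path read by this patch
looks like the first line below (node number and page size are
examples), as opposed to the single global path used before:

    /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages
    /sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages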

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/linuxapp/eal/eal_hugepage_info.c | 80 +++++++++++++++++++++++--
1 file changed, 74 insertions(+), 6 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index afebd42..2e0819f 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -31,6 +31,7 @@
#include "eal_filesystem.h"

static const char sys_dir_path[] = "/sys/kernel/mm/hugepages";
+static const char sys_pages_numa_dir_path[] = "/sys/devices/system/node";

/* this function is only called from eal_hugepage_info_init which itself
* is only called from a primary process */
@@ -71,6 +72,45 @@ get_num_hugepages(const char *subdir)
return num_pages;
}

+static uint32_t
+get_num_hugepages_on_node(const char *subdir, unsigned int socket)
+{
+ char path[PATH_MAX], socketpath[PATH_MAX];
+ DIR *socketdir;
+ unsigned long num_pages = 0;
+ const char *nr_hp_file = "free_hugepages";
+
+ snprintf(socketpath, sizeof(socketpath), "%s/node%u/hugepages",
+ sys_pages_numa_dir_path, socket);
+
+ socketdir = opendir(socketpath);
+ if (socketdir) {
+ /* Keep calm and carry on */
+ closedir(socketdir);
+ } else {
+ /* Can't find socket dir, so ignore it */
+ return 0;
+ }
+
+ snprintf(path, sizeof(path), "%s/%s/%s",
+ socketpath, subdir, nr_hp_file);
+ if (eal_parse_sysfs_value(path, &num_pages) < 0)
+ return 0;
+
+ if (num_pages == 0)
+ RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n",
+ subdir);
+
+ /*
+ * we want to return a uint32_t and more than this looks suspicious
+ * anyway ...
+ */
+ if (num_pages > UINT32_MAX)
+ num_pages = UINT32_MAX;
+
+ return num_pages;
+}
+
static uint64_t
get_default_hp_size(void)
{
@@ -269,7 +309,7 @@ eal_hugepage_info_init(void)
{
const char dirent_start_text[] = "hugepages-";
const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
- unsigned i, num_sizes = 0;
+ unsigned int i, total_pages, num_sizes = 0;
DIR *dir;
struct dirent *dirent;

@@ -323,9 +363,28 @@ eal_hugepage_info_init(void)
if (clear_hugedir(hpi->hugedir) == -1)
break;

- /* for now, put all pages into socket 0,
- * later they will be sorted */
- hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
+ /*
+ * first, try to put all hugepages into relevant sockets, but
+ * if first attempts fails, fall back to collecting all pages
+ * in one socket and sorting them later
+ */
+ total_pages = 0;
+ /* we also don't want to do this for legacy init */
+ if (!internal_config.legacy_mem)
+ for (i = 0; i < rte_socket_count(); i++) {
+ int socket = rte_socket_id_by_idx(i);
+ unsigned int num_pages =
+ get_num_hugepages_on_node(
+ dirent->d_name, socket);
+ hpi->num_pages[socket] = num_pages;
+ total_pages += num_pages;
+ }
+ /*
+ * we failed to sort memory from the get go, so fall
+ * back to old way
+ */
+ if (total_pages == 0)
+ hpi->num_pages[0] = get_num_hugepages(dirent->d_name);

#ifndef RTE_ARCH_64
/* for 32-bit systems, limit number of hugepages to
@@ -349,10 +408,19 @@ eal_hugepage_info_init(void)
sizeof(internal_config.hugepage_info[0]), compare_hpi);

/* now we have all info, check we have at least one valid size */
- for (i = 0; i < num_sizes; i++)
+ for (i = 0; i < num_sizes; i++) {
+ /* pages may no longer all be on socket 0, so check all */
+ unsigned int j, num_pages = 0;
+
+ for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
+ struct hugepage_info *hpi =
+ &internal_config.hugepage_info[i];
+ num_pages += hpi->num_pages[j];
+ }
if (internal_config.hugepage_info[i].hugedir != NULL &&
- internal_config.hugepage_info[i].num_pages[0] > 0)
+ num_pages > 0)
return 0;
+ }

/* no valid hugepage mounts available, return error */
return -1;
--
2.7.4
Anatoly Burakov
2018-04-03 23:21:57 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/eal_common_memzone.c | 42 +++++++++++++++---------------
1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index 36d2553..88f401f 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -395,31 +395,31 @@ rte_memzone_lookup(const char *name)
return memzone;
}

+static void
+dump_memzone(const struct rte_memzone *mz, void *arg)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ FILE *f = arg;
+ int mz_idx;
+
+ mz_idx = mz - mcfg->memzone;
+
+ fprintf(f, "Zone %u: name:<%s>, IO:0x%"PRIx64", len:0x%zx, virt:%p, "
+ "socket_id:%"PRId32", flags:%"PRIx32"\n",
+ mz_idx,
+ mz->name,
+ mz->iova,
+ mz->len,
+ mz->addr,
+ mz->socket_id,
+ mz->flags);
+}
+
/* Dump all reserved memory zones on console */
void
rte_memzone_dump(FILE *f)
{
- struct rte_mem_config *mcfg;
- unsigned i = 0;
-
- /* get pointer to global configuration */
- mcfg = rte_eal_get_configuration()->mem_config;
-
- rte_rwlock_read_lock(&mcfg->mlock);
- /* dump all zones */
- for (i=0; i<RTE_MAX_MEMZONE; i++) {
- if (mcfg->memzone[i].addr == NULL)
- break;
- fprintf(f, "Zone %u: name:<%s>, IO:0x%"PRIx64", len:0x%zx"
- ", virt:%p, socket_id:%"PRId32", flags:%"PRIx32"\n", i,
- mcfg->memzone[i].name,
- mcfg->memzone[i].iova,
- mcfg->memzone[i].len,
- mcfg->memzone[i].addr,
- mcfg->memzone[i].socket_id,
- mcfg->memzone[i].flags);
- }
- rte_rwlock_read_unlock(&mcfg->mlock);
+ rte_memzone_walk(dump_memzone, f);
}

/*
--
2.7.4
Anatoly Burakov
2018-04-03 23:22:00 UTC
Permalink
rte_fbarray is a simple indexed array stored in shared memory
by mapping a file into memory. The rationale for its existence is
as follows: since we are going to map memory page-by-page, there
could be quite a lot of memory segments to keep track of (for
smaller page sizes, the page count can easily reach thousands). We
can't really make page lists truly dynamic and infinitely expandable,
because that involves reallocating memory (which is a big no-no in
multiprocess). What we can do instead is set the maximum capacity to
something really, really large, and decide at allocation time how
big the array is actually going to be. We map the entire file into
memory, which makes it possible to use fbarray as shared memory,
provided the structure itself is allocated in shared memory.
Per-fbarray locking is also used to avoid index data races (but not
contents data races - those are up to the user application to
synchronize).

In addition, since we will frequently need to scan this array for
free space, and iterating over the array linearly can become slow,
rte_fbarray provides facilities to index the array's usage. The
following use cases are covered:
- find the next free/used slot (useful either for adding new elements
to the fbarray, or for walking the list)
- find the starting index for the next N free/used slots (useful for
when we want to allocate a chunk of VA-contiguous memory composed of
several pages)
- find how many contiguous free/used slots there are, starting
from a specified index (useful for when we want to figure out
how many pages we have until the next hole in allocated memory, to
speed up some bulk operations where we would otherwise have to
walk the array and add pages one by one)

This is accomplished by storing a usage mask in memory, right
after the data section of the array, and using some bit-level
magic to figure out the info we need.
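
A rough, single-process usage sketch (the array name, length and
element type below are made up; for multi-process use, the
rte_fbarray structure itself would need to live in shared memory):

    #include <rte_fbarray.h>

    struct elem { int val; };

    static int
    fbarray_example(void)
    {
        static struct rte_fbarray arr;
        int idx;

        if (rte_fbarray_init(&arr, "example_array", 1024, sizeof(struct elem)) < 0)
            return -1;

        /* find one free slot, fill it, then mark it used in the mask */
        idx = rte_fbarray_find_next_n_free(&arr, 0, 1);
        if (idx < 0)
            return -1;

        ((struct elem *)rte_fbarray_get(&arr, idx))->val = 42;
        rte_fbarray_set_used(&arr, idx);

        return 0;
    }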

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3:
- Fixed index alignment bug
- Fixed compile issues

v3:
- MAP_POPULATE not supported on FreeBSD, removed it
- Bugfix for index size when it is unaligned
- Replace uint64_t with size_t for mapping sizes
- Make API experimental

The initial version of this had resizing capability; however, it was
removed because, in a multiprocess scenario, each fbarray would
have its own view of mapped memory, which might not correspond
with the others' due to some other process performing a resize
that the current process didn't know about.

It was therefore decided that, to avoid the cost of synchronizing
each and every operation (to make sure the array wasn't resized),
the resizing feature should be dropped.

lib/librte_eal/bsdapp/eal/Makefile | 1 +
lib/librte_eal/common/Makefile | 2 +-
lib/librte_eal/common/eal_common_fbarray.c | 859 ++++++++++++++++++++++++++++
lib/librte_eal/common/eal_filesystem.h | 13 +
lib/librte_eal/common/include/rte_fbarray.h | 353 ++++++++++++
lib/librte_eal/common/meson.build | 2 +
lib/librte_eal/linuxapp/eal/Makefile | 1 +
lib/librte_eal/rte_eal_version.map | 16 +
8 files changed, 1246 insertions(+), 1 deletion(-)
create mode 100644 lib/librte_eal/common/eal_common_fbarray.c
create mode 100644 lib/librte_eal/common/include/rte_fbarray.h

diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile
index ed1d17b..1b43d77 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -53,6 +53,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_dev.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_options.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_thread.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_proc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_fbarray.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_malloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_elem.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_heap.c
diff --git a/lib/librte_eal/common/Makefile b/lib/librte_eal/common/Makefile
index ea824a3..48f870f 100644
--- a/lib/librte_eal/common/Makefile
+++ b/lib/librte_eal/common/Makefile
@@ -16,7 +16,7 @@ INC += rte_pci_dev_feature_defs.h rte_pci_dev_features.h
INC += rte_malloc.h rte_keepalive.h rte_time.h
INC += rte_service.h rte_service_component.h
INC += rte_bitmap.h rte_vfio.h rte_hypervisor.h rte_test.h
-INC += rte_reciprocal.h
+INC += rte_reciprocal.h rte_fbarray.h

GENERIC_INC := rte_atomic.h rte_byteorder.h rte_cycles.h rte_prefetch.h
GENERIC_INC += rte_spinlock.h rte_memcpy.h rte_cpuflags.h rte_rwlock.h
diff --git a/lib/librte_eal/common/eal_common_fbarray.c b/lib/librte_eal/common/eal_common_fbarray.c
new file mode 100644
index 0000000..f65875d
--- /dev/null
+++ b/lib/librte_eal/common/eal_common_fbarray.c
@@ -0,0 +1,859 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <stdint.h>
+#include <errno.h>
+#include <sys/file.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_errno.h>
+#include <rte_spinlock.h>
+#include <rte_tailq.h>
+
+#include "eal_filesystem.h"
+#include "eal_private.h"
+
+#include "rte_fbarray.h"
+
+#define MASK_SHIFT 6ULL
+#define MASK_ALIGN (1 << MASK_SHIFT)
+#define MASK_LEN_TO_IDX(x) ((x) >> MASK_SHIFT)
+#define MASK_LEN_TO_MOD(x) ((x) - RTE_ALIGN_FLOOR(x, MASK_ALIGN))
+#define MASK_GET_IDX(idx, mod) ((idx << MASK_SHIFT) + mod)
+
+/*
+ * This is a mask that is always stored at the end of array, to provide fast
+ * way of finding free/used spots without looping through each element.
+ */
+
+struct used_mask {
+ int n_masks;
+ uint64_t data[];
+};
+
+static size_t
+calc_mask_size(int len)
+{
+ /* mask must be multiple of MASK_ALIGN, even though length of array
+ * itself may not be aligned on that boundary.
+ */
+ len = RTE_ALIGN_CEIL(len, MASK_ALIGN);
+ return sizeof(struct used_mask) +
+ sizeof(uint64_t) * MASK_LEN_TO_IDX(len);
+}
+
+static size_t
+calc_data_size(size_t page_sz, int elt_sz, int len)
+{
+ size_t data_sz = elt_sz * len;
+ size_t msk_sz = calc_mask_size(len);
+ return RTE_ALIGN_CEIL(data_sz + msk_sz, page_sz);
+}
+
+static struct used_mask *
+get_used_mask(void *data, int elt_sz, int len)
+{
+ return (struct used_mask *) RTE_PTR_ADD(data, elt_sz * len);
+}
+
+static int
+resize_and_map(int fd, void *addr, size_t len)
+{
+ char path[PATH_MAX];
+ void *map_addr;
+
+ if (ftruncate(fd, len)) {
+ RTE_LOG(ERR, EAL, "Cannot truncate %s\n", path);
+ /* pass errno up the chain */
+ rte_errno = errno;
+ return -1;
+ }
+
+ map_addr = mmap(addr, len, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_FIXED, fd, 0);
+ if (map_addr != addr) {
+ RTE_LOG(ERR, EAL, "mmap() failed: %s\n", strerror(errno));
+ /* pass errno up the chain */
+ rte_errno = errno;
+ return -1;
+ }
+ return 0;
+}
+
+static int
+find_next_n(const struct rte_fbarray *arr, int start, int n, bool used)
+{
+ const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz,
+ arr->len);
+ int msk_idx, lookahead_idx, first, first_mod;
+ int last, last_mod, last_msk;
+ uint64_t ignore_msk;
+
+ /*
+ * mask only has granularity of MASK_ALIGN, but start may not be aligned
+ * on that boundary, so construct a special mask to exclude anything we
+ * don't want to see to avoid confusing ctz.
+ */
+ first = MASK_LEN_TO_IDX(start);
+ first_mod = MASK_LEN_TO_MOD(start);
+ ignore_msk = ~((1ULL << first_mod) - 1);
+
+ /* array length may not be aligned, so calculate ignore mask for last
+ * mask index.
+ */
+ last = MASK_LEN_TO_IDX(arr->len);
+ last_mod = MASK_LEN_TO_MOD(arr->len);
+ last_msk = ~(-(1ULL) << last_mod);
+
+ for (msk_idx = first; msk_idx < msk->n_masks; msk_idx++) {
+ uint64_t cur_msk, lookahead_msk;
+ int run_start, clz, left;
+ bool found = false;
+ /*
+ * The process of getting n consecutive bits for arbitrary n is
+ * a bit involved, but here it is in a nutshell:
+ *
+ * 1. let n be the number of consecutive bits we're looking for
+ * 2. check if n can fit in one mask, and if so, do n-1
+ * rshift-ands to see if there is an appropriate run inside
+ * our current mask
+ * 2a. if we found a run, bail out early
+ * 2b. if we didn't find a run, proceed
+ * 3. invert the mask and count leading zeroes (that is, count
+ * how many consecutive set bits we had starting from the
+ * end of current mask) as k
+ * 3a. if k is 0, continue to next mask
+ * 3b. if k is not 0, we have a potential run
+ * 4. to satisfy our requirements, next mask must have n-k
+ * consecutive set bits right at the start, so we will do
+ * (n-k-1) rshift-ands and check if first bit is set.
+ *
+ * Step 4 will need to be repeated if (n-k) > MASK_ALIGN until
+ * we either run out of masks, lose the run, or find what we
+ * were looking for.
+ */
+ cur_msk = msk->data[msk_idx];
+ left = n;
+
+ /* if we're looking for free spaces, invert the mask */
+ if (!used)
+ cur_msk = ~cur_msk;
+
+ /* combine current ignore mask with last index ignore mask */
+ if (msk_idx == last)
+ ignore_msk |= last_msk;
+
+ /* if we have an ignore mask, ignore once */
+ if (ignore_msk) {
+ cur_msk &= ignore_msk;
+ ignore_msk = 0;
+ }
+
+ /* if n can fit in within a single mask, do a search */
+ if (n <= MASK_ALIGN) {
+ uint64_t tmp_msk = cur_msk;
+ int s_idx;
+ for (s_idx = 0; s_idx < n - 1; s_idx++)
+ tmp_msk &= tmp_msk >> 1ULL;
+ /* we found what we were looking for */
+ if (tmp_msk != 0) {
+ run_start = __builtin_ctzll(tmp_msk);
+ return MASK_GET_IDX(msk_idx, run_start);
+ }
+ }
+
+ /*
+ * we didn't find our run within the mask, or n > MASK_ALIGN,
+ * so we're going for plan B.
+ */
+
+ /* count leading zeroes on inverted mask */
+ clz = __builtin_clzll(~cur_msk);
+
+ /* if there aren't any runs at the end either, just continue */
+ if (clz == 0)
+ continue;
+
+ /* we have a partial run at the end, so try looking ahead */
+ run_start = MASK_ALIGN - clz;
+ left -= clz;
+
+ for (lookahead_idx = msk_idx + 1; lookahead_idx < msk->n_masks;
+ lookahead_idx++) {
+ int s_idx, need;
+ lookahead_msk = msk->data[lookahead_idx];
+
+ /* if we're looking for free space, invert the mask */
+ if (!used)
+ lookahead_msk = ~lookahead_msk;
+
+ /* figure out how many consecutive bits we need here */
+ need = RTE_MIN(left, MASK_ALIGN);
+
+ for (s_idx = 0; s_idx < need - 1; s_idx++)
+ lookahead_msk &= lookahead_msk >> 1ULL;
+
+ /* if first bit is not set, we've lost the run */
+ if ((lookahead_msk & 1) == 0) {
+ /*
+ * we've scanned this far, so we know there are
+ * no runs in the space we've lookahead-scanned
+ * as well, so skip that on next iteration.
+ */
+ ignore_msk = ~((1ULL << need) - 1);
+ msk_idx = lookahead_idx;
+ break;
+ }
+
+ left -= need;
+
+ /* check if we've found what we were looking for */
+ if (left == 0) {
+ found = true;
+ break;
+ }
+ }
+
+ /* we didn't find anything, so continue */
+ if (!found)
+ continue;
+
+ return MASK_GET_IDX(msk_idx, run_start);
+ }
+ /* we didn't find anything */
+ rte_errno = used ? -ENOENT : -ENOSPC;
+ return -1;
+}
+
+static int
+find_next(const struct rte_fbarray *arr, int start, bool used)
+{
+ const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz,
+ arr->len);
+ int idx, first, first_mod;
+ int last, last_mod, last_msk;
+ uint64_t ignore_msk;
+
+ /*
+ * mask only has granularity of MASK_ALIGN, but start may not be aligned
+ * on that boundary, so construct a special mask to exclude anything we
+ * don't want to see to avoid confusing ctz.
+ */
+ first = MASK_LEN_TO_IDX(start);
+ first_mod = MASK_LEN_TO_MOD(start);
+ ignore_msk = ~((1ULL << first_mod) - 1ULL);
+
+ /* array length may not be aligned, so calculate ignore mask for last
+ * mask index.
+ */
+ last = MASK_LEN_TO_IDX(arr->len);
+ last_mod = MASK_LEN_TO_MOD(arr->len);
+ last_msk = ~(-(1ULL) << last_mod);
+
+ for (idx = first; idx < msk->n_masks; idx++) {
+ uint64_t cur = msk->data[idx];
+ int found;
+
+ /* if we're looking for free entries, invert mask */
+ if (!used)
+ cur = ~cur;
+
+ if (idx == last)
+ cur &= last_msk;
+
+ /* ignore everything before start on first iteration */
+ if (idx == first)
+ cur &= ignore_msk;
+
+ /* check if we have any entries */
+ if (cur == 0)
+ continue;
+
+ /*
+ * find first set bit - that will correspond to whatever it is
+ * that we're looking for.
+ */
+ found = __builtin_ctzll(cur);
+ return MASK_GET_IDX(idx, found);
+ }
+ /* we didn't find anything */
+ rte_errno = used ? -ENOENT : -ENOSPC;
+ return -1;
+}
+
+static int
+find_contig(const struct rte_fbarray *arr, int start, bool used)
+{
+ const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz,
+ arr->len);
+ int idx, first, first_mod;
+ int last, last_mod, last_msk;
+ int need_len, result = 0;
+
+ /* array length may not be aligned, so calculate ignore mask for last
+ * mask index.
+ */
+ last = MASK_LEN_TO_IDX(arr->len);
+ last_mod = MASK_LEN_TO_MOD(arr->len);
+ last_msk = ~(-(1ULL) << last_mod);
+
+ first = MASK_LEN_TO_IDX(start);
+ first_mod = MASK_LEN_TO_MOD(start);
+ for (idx = first; idx < msk->n_masks; idx++, result += need_len) {
+ uint64_t cur = msk->data[idx];
+ int run_len;
+
+ need_len = MASK_ALIGN;
+
+ /* if we're looking for free entries, invert mask */
+ if (!used)
+ cur = ~cur;
+
+ /* if this is last mask, ignore everything after last bit */
+ if (idx == last)
+ cur &= last_msk;
+
+ /* ignore everything before start on first iteration */
+ if (idx == first) {
+ cur >>= first_mod;
+ /* at the start, we don't need the full mask len */
+ need_len -= first_mod;
+ }
+
+ /* we will be looking for zeroes, so invert the mask */
+ cur = ~cur;
+
+ /* if mask is zero, we have a complete run */
+ if (cur == 0)
+ continue;
+
+ /*
+ * see if current run ends before mask end.
+ */
+ run_len = __builtin_ctzll(cur);
+
+ /* add however many zeroes we've had in the last run and quit */
+ if (run_len < need_len) {
+ result += run_len;
+ break;
+ }
+ }
+ return result;
+}
+
+static int
+set_used(struct rte_fbarray *arr, int idx, bool used)
+{
+ struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, arr->len);
+ uint64_t msk_bit = 1ULL << MASK_LEN_TO_MOD(idx);
+ int msk_idx = MASK_LEN_TO_IDX(idx);
+ bool already_used;
+ int ret = -1;
+
+ if (arr == NULL || idx < 0 || idx >= arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ ret = 0;
+
+ /* prevent array from changing under us */
+ rte_rwlock_write_lock(&arr->rwlock);
+
+ already_used = (msk->data[msk_idx] & msk_bit) != 0;
+
+ /* nothing to be done */
+ if (used == already_used)
+ goto out;
+
+ if (used) {
+ msk->data[msk_idx] |= msk_bit;
+ arr->count++;
+ } else {
+ msk->data[msk_idx] &= ~msk_bit;
+ arr->count--;
+ }
+out:
+ rte_rwlock_write_unlock(&arr->rwlock);
+
+ return ret;
+}
+
+static int
+fully_validate(const char *name, unsigned int elt_sz, unsigned int len)
+{
+ if (name == NULL || elt_sz == 0 || len == 0) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ if (strnlen(name, RTE_FBARRAY_NAME_LEN) == RTE_FBARRAY_NAME_LEN) {
+ rte_errno = ENAMETOOLONG;
+ return -1;
+ }
+ return 0;
+}
+
+int __rte_experimental
+rte_fbarray_init(struct rte_fbarray *arr, const char *name, int len, int elt_sz)
+{
+ size_t page_sz, mmap_len;
+ char path[PATH_MAX];
+ struct used_mask *msk;
+ void *data = NULL;
+ int fd = -1;
+
+ if (arr == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ if (fully_validate(name, elt_sz, len))
+ return -1;
+
+ page_sz = sysconf(_SC_PAGESIZE);
+
+ /* calculate our memory limits */
+ mmap_len = calc_data_size(page_sz, elt_sz, len);
+
+ data = eal_get_virtual_area(NULL, &mmap_len, page_sz, 0, 0);
+ if (data == NULL)
+ goto fail;
+
+ eal_get_fbarray_path(path, sizeof(path), name);
+
+ /*
+ * Each fbarray is unique to process namespace, i.e. the filename
+ * depends on process prefix. Try to take out a lock and see if we
+ * succeed. If we don't, someone else is using it already.
+ */
+ fd = open(path, O_CREAT | O_RDWR, 0600);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): couldn't open %s: %s\n", __func__,
+ path, strerror(errno));
+ rte_errno = errno;
+ goto fail;
+ } else if (flock(fd, LOCK_EX | LOCK_NB)) {
+ RTE_LOG(DEBUG, EAL, "%s(): couldn't lock %s: %s\n", __func__,
+ path, strerror(errno));
+ rte_errno = EBUSY;
+ goto fail;
+ }
+
+ /* take out a non-exclusive lock, so that other processes could still
+ * attach to it, but no other process could reinitialize it.
+ */
+ if (flock(fd, LOCK_SH | LOCK_NB)) {
+ rte_errno = errno;
+ goto fail;
+ }
+
+ if (resize_and_map(fd, data, mmap_len))
+ goto fail;
+
+ /* we've mmap'ed the file, we can now close the fd */
+ close(fd);
+
+ /* initialize the data */
+ memset(data, 0, mmap_len);
+
+ /* populate data structure */
+ snprintf(arr->name, sizeof(arr->name), "%s", name);
+ arr->data = data;
+ arr->len = len;
+ arr->elt_sz = elt_sz;
+ arr->count = 0;
+
+ msk = get_used_mask(data, elt_sz, len);
+ msk->n_masks = MASK_LEN_TO_IDX(RTE_ALIGN_CEIL(len, MASK_ALIGN));
+
+ rte_rwlock_init(&arr->rwlock);
+
+ return 0;
+fail:
+ if (data)
+ munmap(data, mmap_len);
+ if (fd >= 0)
+ close(fd);
+ return -1;
+}
+
+int __rte_experimental
+rte_fbarray_attach(struct rte_fbarray *arr)
+{
+ size_t page_sz, mmap_len;
+ char path[PATH_MAX];
+ void *data = NULL;
+ int fd = -1;
+
+ if (arr == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /*
+ * we don't need to synchronize attach as two values we need (element
+ * size and array length) are constant for the duration of life of
+ * the array, so the parts we care about will not race.
+ */
+
+ if (fully_validate(arr->name, arr->elt_sz, arr->len))
+ return -1;
+
+ page_sz = sysconf(_SC_PAGESIZE);
+
+ mmap_len = calc_data_size(page_sz, arr->elt_sz, arr->len);
+
+ data = eal_get_virtual_area(arr->data, &mmap_len, page_sz, 0, 0);
+ if (data == NULL)
+ goto fail;
+
+ eal_get_fbarray_path(path, sizeof(path), arr->name);
+
+ fd = open(path, O_RDWR);
+ if (fd < 0) {
+ rte_errno = errno;
+ goto fail;
+ }
+
+ /* lock the file, to let others know we're using it */
+ if (flock(fd, LOCK_SH | LOCK_NB)) {
+ rte_errno = errno;
+ goto fail;
+ }
+
+ if (resize_and_map(fd, data, mmap_len))
+ goto fail;
+
+ close(fd);
+
+ /* we're done */
+
+ return 0;
+fail:
+ if (data)
+ munmap(data, mmap_len);
+ if (fd >= 0)
+ close(fd);
+ return -1;
+}
+
+int __rte_experimental
+rte_fbarray_detach(struct rte_fbarray *arr)
+{
+ if (arr == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /*
+ * we don't need to synchronize detach as two values we need (element
+ * size and total capacity) are constant for the duration of life of
+ * the array, so the parts we care about will not race. if the user is
+ * detaching while doing something else in the same process, we can't
+ * really do anything about it, things will blow up either way.
+ */
+
+ size_t page_sz = sysconf(_SC_PAGESIZE);
+
+ /* this may already be unmapped (e.g. a repeated call from a previously
+ * failed destroy()), but that is on the user; we can't (easily) know
+ * whether this is still mapped.
+ */
+ munmap(arr->data, calc_data_size(page_sz, arr->elt_sz, arr->len));
+
+ return 0;
+}
+
+int __rte_experimental
+rte_fbarray_destroy(struct rte_fbarray *arr)
+{
+ int fd, ret;
+ char path[PATH_MAX];
+
+ ret = rte_fbarray_detach(arr);
+ if (ret)
+ return ret;
+
+ /* try deleting the file */
+ eal_get_fbarray_path(path, sizeof(path), arr->name);
+
+ fd = open(path, O_RDONLY);
+ if (flock(fd, LOCK_EX | LOCK_NB)) {
+ RTE_LOG(DEBUG, EAL, "Cannot destroy fbarray - another process is using it\n");
+ rte_errno = EBUSY;
+ ret = -1;
+ } else {
+ ret = 0;
+ unlink(path);
+ memset(arr, 0, sizeof(*arr));
+ }
+ close(fd);
+
+ return ret;
+}
+
+void * __rte_experimental
+rte_fbarray_get(const struct rte_fbarray *arr, int idx)
+{
+ void *ret = NULL;
+ if (arr == NULL || idx < 0) {
+ rte_errno = EINVAL;
+ return NULL;
+ }
+
+ if (idx >= arr->len) {
+ rte_errno = EINVAL;
+ return NULL;
+ }
+
+ ret = RTE_PTR_ADD(arr->data, idx * arr->elt_sz);
+
+ return ret;
+}
+
+int __rte_experimental
+rte_fbarray_set_used(struct rte_fbarray *arr, int idx)
+{
+ return set_used(arr, idx, true);
+}
+
+int __rte_experimental
+rte_fbarray_set_free(struct rte_fbarray *arr, int idx)
+{
+ return set_used(arr, idx, false);
+}
+
+int __rte_experimental
+rte_fbarray_is_used(struct rte_fbarray *arr, int idx)
+{
+ struct used_mask *msk;
+ int msk_idx;
+ uint64_t msk_bit;
+ int ret = -1;
+
+ if (arr == NULL || idx < 0 || idx >= arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ msk = get_used_mask(arr->data, arr->elt_sz, arr->len);
+ msk_idx = MASK_LEN_TO_IDX(idx);
+ msk_bit = 1ULL << MASK_LEN_TO_MOD(idx);
+
+ ret = (msk->data[msk_idx] & msk_bit) != 0;
+
+ rte_rwlock_read_unlock(&arr->rwlock);
+
+ return ret;
+}
+
+int __rte_experimental
+rte_fbarray_find_next_free(struct rte_fbarray *arr, int start)
+{
+ int ret = -1;
+
+ if (arr == NULL || start < 0 || start >= arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ if (arr->len == arr->count) {
+ rte_errno = ENOSPC;
+ goto out;
+ }
+
+ ret = find_next(arr, start, false);
+out:
+ rte_rwlock_read_unlock(&arr->rwlock);
+ return ret;
+}
+
+int __rte_experimental
+rte_fbarray_find_next_used(struct rte_fbarray *arr, int start)
+{
+ int ret = -1;
+
+ if (arr == NULL || start < 0 || start >= arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ if (arr->count == 0) {
+ rte_errno = ENOENT;
+ goto out;
+ }
+
+ ret = find_next(arr, start, true);
+out:
+ rte_rwlock_read_unlock(&arr->rwlock);
+ return ret;
+}
+
+int __rte_experimental
+rte_fbarray_find_next_n_free(struct rte_fbarray *arr, int start, int n)
+{
+ int ret = -1;
+
+ if (arr == NULL || start < 0 || start >= arr->len ||
+ n < 0 || n > arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ if (arr->len == arr->count || arr->len - arr->count < n) {
+ rte_errno = ENOSPC;
+ goto out;
+ }
+
+ ret = find_next_n(arr, start, n, false);
+out:
+ rte_rwlock_read_unlock(&arr->rwlock);
+ return ret;
+}
+
+int __rte_experimental
+rte_fbarray_find_next_n_used(struct rte_fbarray *arr, int start, int n)
+{
+ int ret = -1;
+
+ if (arr == NULL || start < 0 || start >= arr->len ||
+ n < 0 || n > arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ if (arr->count < n) {
+ rte_errno = ENOENT;
+ goto out;
+ }
+
+ ret = find_next_n(arr, start, n, true);
+out:
+ rte_rwlock_read_unlock(&arr->rwlock);
+ return ret;
+}
+
+int __rte_experimental
+rte_fbarray_find_contig_free(struct rte_fbarray *arr, int start)
+{
+ int ret = -1;
+
+ if (arr == NULL || start < 0 || start >= arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ if (arr->len == arr->count) {
+ rte_errno = ENOSPC;
+ goto out;
+ }
+
+ if (arr->count == 0) {
+ ret = arr->len - start;
+ goto out;
+ }
+
+ ret = find_contig(arr, start, false);
+out:
+ rte_rwlock_read_unlock(&arr->rwlock);
+ return ret;
+}
+
+int __rte_experimental
+rte_fbarray_find_contig_used(struct rte_fbarray *arr, int start)
+{
+ int ret = -1;
+
+ if (arr == NULL || start < 0 || start >= arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ ret = find_contig(arr, start, true);
+
+ rte_rwlock_read_unlock(&arr->rwlock);
+ return ret;
+}
+
+int __rte_experimental
+rte_fbarray_find_idx(const struct rte_fbarray *arr, const void *elt)
+{
+ void *end;
+ int ret = -1;
+
+ /*
+ * no need to synchronize as it doesn't matter if underlying data
+ * changes - we're doing pointer arithmetic here.
+ */
+
+ if (arr == NULL || elt == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ end = RTE_PTR_ADD(arr->data, arr->elt_sz * arr->len);
+ if (elt < arr->data || elt >= end) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ ret = RTE_PTR_DIFF(elt, arr->data) / arr->elt_sz;
+
+ return ret;
+}
+
+void __rte_experimental
+rte_fbarray_dump_metadata(struct rte_fbarray *arr, FILE *f)
+{
+ struct used_mask *msk;
+ int i;
+
+ if (arr == NULL || f == NULL) {
+ rte_errno = EINVAL;
+ return;
+ }
+
+ if (fully_validate(arr->name, arr->elt_sz, arr->len)) {
+ fprintf(f, "Invalid file-backed array\n");
+ return;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ fprintf(f, "File-backed array: %s\n", arr->name);
+ fprintf(f, "size: %i occupied: %i elt_sz: %i\n",
+ arr->len, arr->count, arr->elt_sz);
+
+ msk = get_used_mask(arr->data, arr->elt_sz, arr->len);
+
+ for (i = 0; i < msk->n_masks; i++)
+ fprintf(f, "msk idx %i: 0x%016" PRIx64 "\n", i, msk->data[i]);
+out:
+ rte_rwlock_read_unlock(&arr->rwlock);
+}
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index 4708dd5..1c6048b 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -13,6 +13,7 @@

/** Path of rte config file. */
#define RUNTIME_CONFIG_FMT "%s/.%s_config"
+#define FBARRAY_FMT "%s/%s_%s"

#include <stdint.h>
#include <limits.h>
@@ -55,6 +56,18 @@ eal_mp_socket_path(void)
return buffer;
}

+static inline const char *
+eal_get_fbarray_path(char *buffer, size_t buflen, const char *name) {
+ const char *directory = "/tmp";
+ const char *home_dir = getenv("HOME");
+
+ if (getuid() != 0 && home_dir != NULL)
+ directory = home_dir;
+ snprintf(buffer, buflen - 1, FBARRAY_FMT, directory,
+ internal_config.hugefile_prefix, name);
+ return buffer;
+}
+
/** Path of hugepage info file. */
#define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info"

diff --git a/lib/librte_eal/common/include/rte_fbarray.h b/lib/librte_eal/common/include/rte_fbarray.h
new file mode 100644
index 0000000..c45ac0b
--- /dev/null
+++ b/lib/librte_eal/common/include/rte_fbarray.h
@@ -0,0 +1,353 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#ifndef RTE_FBARRAY_H
+#define RTE_FBARRAY_H
+
+/**
+ * @file
+ *
+ * File-backed shared indexed array for DPDK.
+ *
+ * Basic workflow is expected to be the following:
+ * 1) Allocate array either using ``rte_fbarray_init()`` or
+ * ``rte_fbarray_attach()`` (depending on whether it's shared between
+ * multiple DPDK processes)
+ * 2) find free spots using ``rte_fbarray_find_next_free()``
+ * 3) get pointer to data in the free spot using ``rte_fbarray_get()``, and
+ * copy data into the pointer (element size is fixed)
+ * 4) mark entry as used using ``rte_fbarray_set_used()``
+ *
+ * Calls to ``rte_fbarray_init()`` and ``rte_fbarray_destroy()`` will have
+ * consequences for all processes, while calls to ``rte_fbarray_attach()`` and
+ * ``rte_fbarray_detach()`` will only have consequences within a single process.
+ * Therefore, it is safe to call ``rte_fbarray_attach()`` or
+ * ``rte_fbarray_detach()`` while another process is using ``rte_fbarray``,
+ * provided no other thread within the same process will try to use
+ * ``rte_fbarray`` before attaching or after detaching. It is not safe to call
+ * ``rte_fbarray_init()`` or ``rte_fbarray_destroy()`` while another thread or
+ * another process is using ``rte_fbarray``.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#include <rte_compat.h>
+#include <rte_rwlock.h>
+
+#define RTE_FBARRAY_NAME_LEN 64
+
+struct rte_fbarray {
+ char name[RTE_FBARRAY_NAME_LEN]; /**< name associated with an array */
+ int count; /**< number of entries stored */
+ int len; /**< current length of the array */
+ int elt_sz; /**< size of each element */
+ void *data; /**< data pointer */
+ rte_rwlock_t rwlock; /**< multiprocess lock */
+};
+
+/**
+ * Set up ``rte_fbarray`` structure and allocate underlying resources.
+ *
+ * Call this function to correctly set up ``rte_fbarray`` and allocate
+ * underlying files that will be backing the data in the current process. Note
+ * that in order to use and share ``rte_fbarray`` between multiple processes,
+ * data pointed to by ``arr`` pointer must itself be allocated in shared memory.
+ *
+ * @param arr
+ * Valid pointer to allocated ``rte_fbarray`` structure.
+ *
+ * @param name
+ * Unique name to be assigned to this array.
+ *
+ * @param len
+ * Number of elements initially available in the array.
+ *
+ * @param elt_sz
+ * Size of each element.
+ *
+ * @return
+ * - 0 on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int __rte_experimental
+rte_fbarray_init(struct rte_fbarray *arr, const char *name, int len,
+ int elt_sz);
+
+
+/**
+ * Attach to a file backing an already allocated and correctly set up
+ * ``rte_fbarray`` structure.
+ *
+ * Call this function to attach to file that will be backing the data in the
+ * current process. The structure must have been previously correctly set up
+ * with a call to ``rte_fbarray_init()``. Calls to ``rte_fbarray_attach()`` are
+ * usually meant to be performed in a multiprocessing scenario, with data
+ * pointed to by ``arr`` pointer allocated in shared memory.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up rte_fbarray structure.
+ *
+ * @return
+ * - 0 on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int __rte_experimental
+rte_fbarray_attach(struct rte_fbarray *arr);
+
+
+/**
+ * Deallocate resources for an already allocated and correctly set up
+ * ``rte_fbarray`` structure, and remove the underlying file.
+ *
+ * Call this function to deallocate all resources associated with an
+ * ``rte_fbarray`` structure within the current process. This will also
+ * zero-fill data pointed to by ``arr`` pointer and remove the underlying file
+ * backing the data, so it is expected that by the time this function is called,
+ * all other processes have detached from this ``rte_fbarray``.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @return
+ * - 0 on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int __rte_experimental
+rte_fbarray_destroy(struct rte_fbarray *arr);
+
+
+/**
+ * Deallocate resources for an already allocated and correctly set up
+ * ``rte_fbarray`` structure.
+ *
+ * Call this function to deallocate all resources associated with an
+ * ``rte_fbarray`` structure within current process.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @return
+ * - 0 on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int __rte_experimental
+rte_fbarray_detach(struct rte_fbarray *arr);
+
+
+/**
+ * Get pointer to element residing at specified index.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param idx
+ * Index of an element to get a pointer to.
+ *
+ * @return
+ * - non-NULL pointer on success.
+ * - NULL on failure, with ``rte_errno`` indicating reason for failure.
+ */
+void * __rte_experimental
+rte_fbarray_get(const struct rte_fbarray *arr, int idx);
+
+
+/**
+ * Find index of a specified element within the array.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param elt
+ * Pointer to the element whose index should be found.
+ *
+ * @return
+ * - non-negative integer on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int __rte_experimental
+rte_fbarray_find_idx(const struct rte_fbarray *arr, const void *elt);
+
+
+/**
+ * Mark specified element as used.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param idx
+ * Element index to mark as used.
+ *
+ * @return
+ * - 0 on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int __rte_experimental
+rte_fbarray_set_used(struct rte_fbarray *arr, int idx);
+
+
+/**
+ * Mark specified element as free.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param idx
+ * Element index to mark as free.
+ *
+ * @return
+ * - 0 on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int __rte_experimental
+rte_fbarray_set_free(struct rte_fbarray *arr, int idx);
+
+
+/**
+ * Check whether element at specified index is marked as used.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param idx
+ * Element index to check as used.
+ *
+ * @return
+ * - 1 if element is used.
+ * - 0 if element is unused.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int __rte_experimental
+rte_fbarray_is_used(struct rte_fbarray *arr, int idx);
+
+
+/**
+ * Find index of next free element, starting at specified index.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param start
+ * Element index to start search from.
+ *
+ * @return
+ * - non-negative integer on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int __rte_experimental
+rte_fbarray_find_next_free(struct rte_fbarray *arr, int start);
+
+
+/**
+ * Find index of next used element, starting at specified index.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param start
+ * Element index to start search from.
+ *
+ * @return
+ * - non-negative integer on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int __rte_experimental
+rte_fbarray_find_next_used(struct rte_fbarray *arr, int start);
+
+
+/**
+ * Find index of next chunk of ``n`` free elements, starting at specified index.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param start
+ * Element index to start search from.
+ *
+ * @param n
+ * Number of free elements to look for.
+ *
+ * @return
+ * - non-negative integer on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int __rte_experimental
+rte_fbarray_find_next_n_free(struct rte_fbarray *arr, int start, int n);
+
+
+/**
+ * Find index of next chunk of ``n`` used elements, starting at specified index.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param start
+ * Element index to start search from.
+ *
+ * @param n
+ * Number of used elements to look for.
+ *
+ * @return
+ * - non-negative integer on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int __rte_experimental
+rte_fbarray_find_next_n_used(struct rte_fbarray *arr, int start, int n);
+
+
+/**
+ * Find how many more free entries there are, starting at specified index.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param start
+ * Element index to start search from.
+ *
+ * @return
+ * - non-negative integer on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int __rte_experimental
+rte_fbarray_find_contig_free(struct rte_fbarray *arr, int start);
+
+
+/**
+ * Find how many more used entries there are, starting at specified index.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param start
+ * Element index to start search from.
+ *
+ * @return
+ * - non-negative integer on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int __rte_experimental
+rte_fbarray_find_contig_used(struct rte_fbarray *arr, int start);
+
+
+/**
+ * Dump ``rte_fbarray`` metadata.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param f
+ * File object to dump information into.
+ */
+void __rte_experimental
+rte_fbarray_dump_metadata(struct rte_fbarray *arr, FILE *f);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // RTE_FBARRAY_H
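
To illustrate the workflow documented at the top of this header (init, find a
free slot, get a pointer, mark it used), here is a hypothetical usage sketch;
the element type, array name and length are made up, and error handling is
abbreviated:

struct my_elem { int val; };    /* hypothetical element type */
struct rte_fbarray arr;         /* must live in shared memory to be shared */
struct my_elem *e;
int idx;

/* create an array of 256 fixed-size elements backed by a file */
if (rte_fbarray_init(&arr, "my_array", 256, sizeof(struct my_elem)) < 0)
        return -1;

/* find a free slot, fill it, then mark it as used */
idx = rte_fbarray_find_next_free(&arr, 0);
if (idx < 0)
        return -1;
e = rte_fbarray_get(&arr, idx);
e->val = 42;
rte_fbarray_set_used(&arr, idx);
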
diff --git a/lib/librte_eal/common/meson.build b/lib/librte_eal/common/meson.build
index 82b8910..7d02191 100644
--- a/lib/librte_eal/common/meson.build
+++ b/lib/librte_eal/common/meson.build
@@ -11,6 +11,7 @@ common_sources = files(
'eal_common_devargs.c',
'eal_common_dev.c',
'eal_common_errno.c',
+ 'eal_common_fbarray.c',
'eal_common_hexdump.c',
'eal_common_launch.c',
'eal_common_lcore.c',
@@ -51,6 +52,7 @@ common_headers = files(
'include/rte_eal_memconfig.h',
'include/rte_eal_interrupts.h',
'include/rte_errno.h',
+ 'include/rte_fbarray.h',
'include/rte_hexdump.h',
'include/rte_interrupts.h',
'include/rte_keepalive.h',
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index b9c7727..c407a43 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -61,6 +61,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_dev.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_options.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_thread.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_proc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_fbarray.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index fe4a9c9..3a12112 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -221,6 +221,22 @@ EXPERIMENTAL {
rte_eal_hotplug_add;
rte_eal_hotplug_remove;
rte_eal_mbuf_user_pool_ops;
+ rte_fbarray_attach;
+ rte_fbarray_destroy;
+ rte_fbarray_detach;
+ rte_fbarray_dump_metadata;
+ rte_fbarray_find_idx;
+ rte_fbarray_find_next_free;
+ rte_fbarray_find_next_used;
+ rte_fbarray_find_next_n_free;
+ rte_fbarray_find_next_n_used;
+ rte_fbarray_find_contig_free;
+ rte_fbarray_find_contig_used;
+ rte_fbarray_get;
+ rte_fbarray_init;
+ rte_fbarray_is_used;
+ rte_fbarray_set_free;
+ rte_fbarray_set_used;
rte_log_register_type_and_pick_level;
rte_malloc_dump_heaps;
rte_mem_iova2virt;
--
2.7.4
Anatoly Burakov
2018-04-03 23:22:07 UTC
Permalink
In preparation for implementing multiprocess support, we are adding
a version number to memseg lists. We will not need any locks, because
memory hotplug will have a global lock (so any time the memory map, and
thus the version number, might change, we will already be holding a lock).

There are two ways of implementing multiprocess support for memory
hotplug: either all information about mapped memory is shared
between processes, and secondary processes simply attempt to
map/unmap memory based on requests from the primary, or secondary
processes store their own maps and only check if they are in sync
with the primary process' maps.

This implementation will opt for the latter option: primary process
shared mappings will be authoritative, and each secondary process
will use its own internal view of mapped memory, and will attempt
to synchronize on these mappings using versioning.

Under this model, only the primary process will decide which pages get
mapped, and secondary processes will only copy the primary's page
maps and get notified of the changes via an IPC mechanism (coming
in later commits).
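
In rough terms, the per-list check a secondary process performs boils down to
the following sketch (illustrative only; the actual logic is in sync_walk()
and sync_existing() in the diff below):

/* secondary process, called under the global memory hotplug lock */
if (local_msl->version != primary_msl->version) {
        /* replay the primary's allocations, then its deallocations */
        if (sync_status(primary_msl, local_msl, hi, msl_idx, true) < 0 ||
            sync_status(primary_msl, local_msl, hi, msl_idx, false) < 0)
                return -1;
        local_msl->version = primary_msl->version;
}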

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3:
- Removed per-list locks as we're using global hotplug lock

lib/librte_eal/bsdapp/eal/eal_memalloc.c | 7 +
lib/librte_eal/common/eal_memalloc.h | 4 +
lib/librte_eal/common/include/rte_eal_memconfig.h | 1 +
lib/librte_eal/linuxapp/eal/eal_memalloc.c | 250 ++++++++++++++++++++++
4 files changed, 262 insertions(+)

diff --git a/lib/librte_eal/bsdapp/eal/eal_memalloc.c b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
index e7bcd2b..461732f 100644
--- a/lib/librte_eal/bsdapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
@@ -39,3 +39,10 @@ eal_memalloc_free_seg_bulk(struct rte_memseg **ms __rte_unused,
RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
return -1;
}
+
+int
+eal_memalloc_sync_with_primary(void)
+{
+ RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
+ return -1;
+}
diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
index 2413c6c..4a7b45c 100644
--- a/lib/librte_eal/common/eal_memalloc.h
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -52,4 +52,8 @@ bool
eal_memalloc_is_contig(const struct rte_memseg_list *msl, void *start,
size_t len);

+/* synchronize local memory map to primary process */
+int
+eal_memalloc_sync_with_primary(void);
+
#endif // EAL_MEMALLOC_H
diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
index 88cde8c..a781793 100644
--- a/lib/librte_eal/common/include/rte_eal_memconfig.h
+++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
@@ -32,6 +32,7 @@ struct rte_memseg_list {
};
int socket_id; /**< Socket ID for all memsegs in this list. */
uint64_t page_sz; /**< Page size for all memsegs in this list. */
+ volatile uint32_t version; /**< version number for multiprocess sync. */
struct rte_fbarray memseg_arr;
};

diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index 545ac49..ce242b1 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -64,6 +64,9 @@ static struct msl_entry_list msl_entry_list =
TAILQ_HEAD_INITIALIZER(msl_entry_list);
static rte_spinlock_t tailq_lock = RTE_SPINLOCK_INITIALIZER;

+/** local copy of a memory map, used to synchronize memory hotplug in MP */
+static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];
+
static sigjmp_buf huge_jmpenv;

static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
@@ -647,6 +650,8 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
}
out:
wa->segs_allocated = i;
+ if (i > 0)
+ cur_msl->version++;
return 1;
}

@@ -676,7 +681,10 @@ free_seg_walk(const struct rte_memseg_list *msl, void *arg)
/* msl is const */
found_msl = &mcfg->memsegs[msl_idx];

+ found_msl->version++;
+
rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);
+
if (free_seg(wa->ms, wa->hi, msl_idx, seg_idx))
return -1;

@@ -809,3 +817,245 @@ eal_memalloc_free_seg(struct rte_memseg *ms)

return eal_memalloc_free_seg_bulk(&ms, 1);
}
+
+static int
+sync_chunk(struct rte_memseg_list *primary_msl,
+ struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+ unsigned int msl_idx, bool used, int start, int end)
+{
+ struct rte_fbarray *l_arr, *p_arr;
+ int i, ret, chunk_len, diff_len;
+
+ l_arr = &local_msl->memseg_arr;
+ p_arr = &primary_msl->memseg_arr;
+
+ /* we need to aggregate allocations/deallocations into bigger chunks,
+ * as we don't want to spam the user with per-page callbacks.
+ *
+ * to avoid any potential issues, we also want to trigger
+ * deallocation callbacks *before* we actually deallocate
+ * memory, so that the user application could wrap up its use
+ * before it goes away.
+ */
+
+ chunk_len = end - start;
+
+ /* find how many contiguous pages we can map/unmap for this chunk */
+ diff_len = used ?
+ rte_fbarray_find_contig_free(l_arr, start) :
+ rte_fbarray_find_contig_used(l_arr, start);
+
+ /* has to be at least one page */
+ if (diff_len < 1)
+ return -1;
+
+ diff_len = RTE_MIN(chunk_len, diff_len);
+
+ for (i = 0; i < diff_len; i++) {
+ struct rte_memseg *p_ms, *l_ms;
+ int seg_idx = start + i;
+
+ l_ms = rte_fbarray_get(l_arr, seg_idx);
+ p_ms = rte_fbarray_get(p_arr, seg_idx);
+
+ if (l_ms == NULL || p_ms == NULL)
+ return -1;
+
+ if (used) {
+ ret = alloc_seg(l_ms, p_ms->addr,
+ p_ms->socket_id, hi,
+ msl_idx, seg_idx);
+ if (ret < 0)
+ return -1;
+ rte_fbarray_set_used(l_arr, seg_idx);
+ } else {
+ ret = free_seg(l_ms, hi, msl_idx, seg_idx);
+ rte_fbarray_set_free(l_arr, seg_idx);
+ if (ret < 0)
+ return -1;
+ }
+ }
+
+ /* calculate how much we can advance until next chunk */
+ diff_len = used ?
+ rte_fbarray_find_contig_used(l_arr, start) :
+ rte_fbarray_find_contig_free(l_arr, start);
+ ret = RTE_MIN(chunk_len, diff_len);
+
+ return ret;
+}
+
+static int
+sync_status(struct rte_memseg_list *primary_msl,
+ struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+ unsigned int msl_idx, bool used)
+{
+ struct rte_fbarray *l_arr, *p_arr;
+ int p_idx, l_chunk_len, p_chunk_len, ret;
+ int start, end;
+
+ /* this is a little bit tricky, but the basic idea is - walk both lists
+ * and spot any places where there are discrepancies. walking both lists
+ * and noting discrepancies in a single go is a hard problem, so we do
+ * it in two passes - first we spot any places where allocated segments
+ * mismatch (i.e. ensure that everything that's allocated in the primary
+ * is also allocated in the secondary), and then we do it by looking at
+ * free segments instead.
+ *
+ * we also need to aggregate changes into chunks, as we have to call
+ * callbacks per allocation, not per page.
+ */
+ l_arr = &local_msl->memseg_arr;
+ p_arr = &primary_msl->memseg_arr;
+
+ if (used)
+ p_idx = rte_fbarray_find_next_used(p_arr, 0);
+ else
+ p_idx = rte_fbarray_find_next_free(p_arr, 0);
+
+ while (p_idx >= 0) {
+ int next_chunk_search_idx;
+
+ if (used) {
+ p_chunk_len = rte_fbarray_find_contig_used(p_arr,
+ p_idx);
+ l_chunk_len = rte_fbarray_find_contig_used(l_arr,
+ p_idx);
+ } else {
+ p_chunk_len = rte_fbarray_find_contig_free(p_arr,
+ p_idx);
+ l_chunk_len = rte_fbarray_find_contig_free(l_arr,
+ p_idx);
+ }
+ /* best case scenario - no differences (or bigger, which will be
+ * fixed during next iteration), look for next chunk
+ */
+ if (l_chunk_len >= p_chunk_len) {
+ next_chunk_search_idx = p_idx + p_chunk_len;
+ goto next_chunk;
+ }
+
+ /* if both chunks start at the same point, skip parts we know
+ * are identical, and sync the rest. each call to sync_chunk
+ * will only sync contiguous segments, so we need to call this
+ * until we are sure there are no more differences in this
+ * chunk.
+ */
+ start = p_idx + l_chunk_len;
+ end = p_idx + p_chunk_len;
+ do {
+ ret = sync_chunk(primary_msl, local_msl, hi, msl_idx,
+ used, start, end);
+ start += ret;
+ } while (start < end && ret >= 0);
+ /* if ret is negative, something went wrong */
+ if (ret < 0)
+ return -1;
+
+ next_chunk_search_idx = p_idx + p_chunk_len;
+next_chunk:
+ /* skip to end of this chunk */
+ if (used) {
+ p_idx = rte_fbarray_find_next_used(p_arr,
+ next_chunk_search_idx);
+ } else {
+ p_idx = rte_fbarray_find_next_free(p_arr,
+ next_chunk_search_idx);
+ }
+ }
+ return 0;
+}
+
+static int
+sync_existing(struct rte_memseg_list *primary_msl,
+ struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+ unsigned int msl_idx)
+{
+ int ret;
+
+ /* ensure all allocated space is the same in both lists */
+ ret = sync_status(primary_msl, local_msl, hi, msl_idx, true);
+ if (ret < 0)
+ return -1;
+
+ /* ensure all unallocated space is the same in both lists */
+ ret = sync_status(primary_msl, local_msl, hi, msl_idx, false);
+ if (ret < 0)
+ return -1;
+
+ /* update version number */
+ local_msl->version = primary_msl->version;
+
+ return 0;
+}
+
+static int
+sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *primary_msl, *local_msl;
+ struct hugepage_info *hi = NULL;
+ unsigned int i;
+ int msl_idx;
+ bool new_msl = false;
+
+ msl_idx = msl - mcfg->memsegs;
+ primary_msl = &mcfg->memsegs[msl_idx];
+ local_msl = &local_memsegs[msl_idx];
+
+ /* check if secondary has this memseg list set up */
+ if (local_msl->base_va == NULL) {
+ char name[PATH_MAX];
+ int ret;
+ new_msl = true;
+
+ /* create distinct fbarrays for each secondary */
+ snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i",
+ primary_msl->memseg_arr.name, getpid());
+
+ ret = rte_fbarray_init(&local_msl->memseg_arr, name,
+ primary_msl->memseg_arr.len,
+ primary_msl->memseg_arr.elt_sz);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n");
+ return -1;
+ }
+
+ local_msl->base_va = primary_msl->base_va;
+ }
+
+ for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) {
+ uint64_t cur_sz =
+ internal_config.hugepage_info[i].hugepage_sz;
+ uint64_t msl_sz = primary_msl->page_sz;
+ if (msl_sz == cur_sz) {
+ hi = &internal_config.hugepage_info[i];
+ break;
+ }
+ }
+ if (!hi) {
+ RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
+ return -1;
+ }
+
+ /* if versions don't match or if we have just allocated a new
+ * memseg list, synchronize everything
+ */
+ if ((new_msl || local_msl->version != primary_msl->version) &&
+ sync_existing(primary_msl, local_msl, hi, msl_idx))
+ return -1;
+ return 0;
+}
+
+
+int
+eal_memalloc_sync_with_primary(void)
+{
+ /* nothing to be done in primary */
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ return 0;
+
+ if (rte_memseg_list_walk(sync_walk, NULL))
+ return -1;
+ return 0;
+}
--
2.7.4
Anatoly Burakov
2018-04-03 23:22:11 UTC
Permalink
Secondary initialization will just sync the memory map with the
primary process.

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3:
- Improved handling of EAL hugepage info

lib/librte_eal/common/eal_common_memory.c | 1 +
lib/librte_eal/linuxapp/eal/eal_memory.c | 18 +++++++++++++++---
2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index 0a6d678..8db085e 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -20,6 +20,7 @@
#include <rte_errno.h>
#include <rte_log.h>

+#include "eal_memalloc.h"
#include "eal_private.h"
#include "eal_internal_cfg.h"

diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index efa1202..7ec7129 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -1776,6 +1776,18 @@ eal_legacy_hugepage_attach(void)
return -1;
}

+static int
+eal_hugepage_attach(void)
+{
+ if (eal_memalloc_sync_with_primary()) {
+ RTE_LOG(ERR, EAL, "Could not map memory from primary process\n");
+ if (aslr_enabled() > 0)
+ RTE_LOG(ERR, EAL, "It is recommended to disable ASLR in the kernel and retry running both primary and secondary processes\n");
+ return -1;
+ }
+ return 0;
+}
+
int
rte_eal_hugepage_init(void)
{
@@ -1787,9 +1799,9 @@ rte_eal_hugepage_init(void)
int
rte_eal_hugepage_attach(void)
{
- if (internal_config.legacy_mem)
- return eal_legacy_hugepage_attach();
- return -1;
+ return internal_config.legacy_mem ?
+ eal_legacy_hugepage_attach() :
+ eal_hugepage_attach();
}

int
--
2.7.4
Anatoly Burakov
2018-04-03 23:22:10 UTC
Permalink
Since we are going to need to map hugepages in both primary and
secondary processes, we need to know where we should look for
hugetlbfs mountpoints. So, share that information with secondary
processes, and have them map it in on init.
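
Conceptually, the change amounts to the following (sketch only; the real
helpers are create_shared_memory() and open_shared_memory() in the diff
below):

/* primary, at the end of eal_hugepage_info_init(): publish the table */
tmp_hpi = create_shared_memory(eal_hugepage_info_path(),
                sizeof(internal_config.hugepage_info));
memcpy(tmp_hpi, internal_config.hugepage_info,
                sizeof(internal_config.hugepage_info));

/* secondary, in eal_hugepage_info_read(): map the table and copy it in */
tmp_hpi = open_shared_memory(eal_hugepage_info_path(),
                sizeof(internal_config.hugepage_info));
memcpy(internal_config.hugepage_info, tmp_hpi,
                sizeof(internal_config.hugepage_info));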

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/bsdapp/eal/eal.c | 19 ++--
lib/librte_eal/bsdapp/eal/eal_hugepage_info.c | 56 +++++++++--
lib/librte_eal/bsdapp/eal/eal_memory.c | 21 +---
lib/librte_eal/common/eal_common_options.c | 5 +-
lib/librte_eal/common/eal_filesystem.h | 17 ++++
lib/librte_eal/common/eal_hugepages.h | 10 +-
lib/librte_eal/common/eal_internal_cfg.h | 2 +-
lib/librte_eal/linuxapp/eal/eal.c | 18 ++--
lib/librte_eal/linuxapp/eal/eal_hugepage_info.c | 121 ++++++++++++++++++++----
lib/librte_eal/linuxapp/eal/eal_memory.c | 15 +--
10 files changed, 217 insertions(+), 67 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 54330e1..727adc5 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -289,7 +289,7 @@ eal_get_hugepage_mem_size(void)

for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
struct hugepage_info *hpi = &internal_config.hugepage_info[i];
- if (hpi->hugedir != NULL) {
+ if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) {
for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
size += hpi->hugepage_sz * hpi->num_pages[j];
}
@@ -561,12 +561,17 @@ rte_eal_init(int argc, char **argv)
/* autodetect the iova mapping mode (default is iova_pa) */
rte_eal_get_configuration()->iova_mode = rte_bus_get_iommu_class();

- if (internal_config.no_hugetlbfs == 0 &&
- eal_hugepage_info_init() < 0) {
- rte_eal_init_alert("Cannot get hugepage information.");
- rte_errno = EACCES;
- rte_atomic32_clear(&run_once);
- return -1;
+ if (internal_config.no_hugetlbfs == 0) {
+ /* rte_config isn't initialized yet */
+ ret = internal_config.process_type == RTE_PROC_PRIMARY ?
+ eal_hugepage_info_init() :
+ eal_hugepage_info_read();
+ if (ret < 0) {
+ rte_eal_init_alert("Cannot get hugepage information.");
+ rte_errno = EACCES;
+ rte_atomic32_clear(&run_once);
+ return -1;
+ }
}

if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
diff --git a/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c b/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c
index ba44da0..38d143c 100644
--- a/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c
@@ -19,10 +19,10 @@
* Used in this file to store the hugepage file map on disk
*/
static void *
-create_shared_memory(const char *filename, const size_t mem_size)
+map_shared_memory(const char *filename, const size_t mem_size, int flags)
{
void *retval;
- int fd = open(filename, O_CREAT | O_RDWR, 0666);
+ int fd = open(filename, flags, 0666);
if (fd < 0)
return NULL;
if (ftruncate(fd, mem_size) < 0) {
@@ -34,6 +34,18 @@ create_shared_memory(const char *filename, const size_t mem_size)
return retval;
}

+static void *
+open_shared_memory(const char *filename, const size_t mem_size)
+{
+ return map_shared_memory(filename, mem_size, O_RDWR);
+}
+
+static void *
+create_shared_memory(const char *filename, const size_t mem_size)
+{
+ return map_shared_memory(filename, mem_size, O_RDWR | O_CREAT);
+}
+
/*
* No hugepage support on freebsd, but we dummy it, using contigmem driver
*/
@@ -46,13 +58,10 @@ eal_hugepage_info_init(void)
/* re-use the linux "internal config" structure for our memory data */
struct hugepage_info *hpi = &internal_config.hugepage_info[0];
struct hugepage_info *tmp_hpi;
+ unsigned int i;

internal_config.num_hugepage_sizes = 1;

- /* nothing more to be done for secondary */
- if (rte_eal_process_type() == RTE_PROC_SECONDARY)
- return 0;
-
sysctl_size = sizeof(num_buffers);
error = sysctlbyname("hw.contigmem.num_buffers", &num_buffers,
&sysctl_size, NULL, 0);
@@ -87,7 +96,7 @@ eal_hugepage_info_init(void)
RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dKB\n",
num_buffers, (int)(buffer_size>>10));

- hpi->hugedir = CONTIGMEM_DEV;
+ snprintf(hpi->hugedir, sizeof(hpi->hugedir), "%s", CONTIGMEM_DEV);
hpi->hugepage_sz = buffer_size;
hpi->num_pages[0] = num_buffers;
hpi->lock_descriptor = fd;
@@ -101,6 +110,14 @@ eal_hugepage_info_init(void)

memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info));

+ /* we've copied file descriptors along with everything else, but they
+ * will be invalid in secondary process, so overwrite them
+ */
+ for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) {
+ struct hugepage_info *tmp = &tmp_hpi[i];
+ tmp->lock_descriptor = -1;
+ }
+
if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {
RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
return -1;
@@ -108,3 +125,28 @@ eal_hugepage_info_init(void)

return 0;
}
+
+/* copy stuff from shared info into internal config */
+int
+eal_hugepage_info_read(void)
+{
+ struct hugepage_info *hpi = &internal_config.hugepage_info[0];
+ struct hugepage_info *tmp_hpi;
+
+ internal_config.num_hugepage_sizes = 1;
+
+ tmp_hpi = open_shared_memory(eal_hugepage_info_path(),
+ sizeof(internal_config.hugepage_info));
+ if (tmp_hpi == NULL) {
+ RTE_LOG(ERR, EAL, "Failed to open shared memory!\n");
+ return -1;
+ }
+
+ memcpy(hpi, tmp_hpi, sizeof(internal_config.hugepage_info));
+
+ if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {
+ RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
+ return -1;
+ }
+ return 0;
+}
diff --git a/lib/librte_eal/bsdapp/eal/eal_memory.c b/lib/librte_eal/bsdapp/eal/eal_memory.c
index 6692b3d..3064605 100644
--- a/lib/librte_eal/bsdapp/eal/eal_memory.c
+++ b/lib/librte_eal/bsdapp/eal/eal_memory.c
@@ -240,23 +240,10 @@ int
rte_eal_hugepage_attach(void)
{
const struct hugepage_info *hpi;
- int fd_hugepage_info, fd_hugepage = -1;
+ int fd_hugepage = -1;
unsigned int i;

- /* Obtain a file descriptor for hugepage_info */
- fd_hugepage_info = open(eal_hugepage_info_path(), O_RDONLY);
- if (fd_hugepage_info < 0) {
- RTE_LOG(ERR, EAL, "Could not open %s\n", eal_hugepage_info_path());
- return -1;
- }
-
- /* Map the shared hugepage_info into the process address spaces */
- hpi = mmap(NULL, sizeof(internal_config.hugepage_info),
- PROT_READ, MAP_PRIVATE, fd_hugepage_info, 0);
- if (hpi == MAP_FAILED) {
- RTE_LOG(ERR, EAL, "Could not mmap %s\n", eal_hugepage_info_path());
- goto error;
- }
+ hpi = &internal_config.hugepage_info[0];

for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
const struct hugepage_info *cur_hpi = &hpi[i];
@@ -286,13 +273,9 @@ rte_eal_hugepage_attach(void)
}

/* hugepage_info is no longer required */
- munmap((void *)(uintptr_t)hpi, sizeof(internal_config.hugepage_info));
- close(fd_hugepage_info);
return 0;

error:
- if (fd_hugepage_info >= 0)
- close(fd_hugepage_info);
if (fd_hugepage >= 0)
close(fd_hugepage);
return -1;
diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 5b5da5f..04a4476 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -179,8 +179,11 @@ eal_reset_internal_config(struct internal_config *internal_cfg)
for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
internal_cfg->socket_mem[i] = 0;
/* zero out hugedir descriptors */
- for (i = 0; i < MAX_HUGEPAGE_SIZES; i++)
+ for (i = 0; i < MAX_HUGEPAGE_SIZES; i++) {
+ memset(&internal_cfg->hugepage_info[i], 0,
+ sizeof(internal_cfg->hugepage_info[0]));
internal_cfg->hugepage_info[i].lock_descriptor = -1;
+ }
internal_cfg->base_virtaddr = 0;

internal_cfg->syslog_facility = LOG_DAEMON;
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index 1c6048b..ad059ef 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -85,6 +85,23 @@ eal_hugepage_info_path(void)
return buffer;
}

+/** Path of hugepage info file. */
+#define HUGEPAGE_FILE_FMT "%s/.%s_hugepage_file"
+
+static inline const char *
+eal_hugepage_file_path(void)
+{
+ static char buffer[PATH_MAX]; /* static so auto-zeroed */
+ const char *directory = default_config_dir;
+ const char *home_dir = getenv("HOME");
+
+ if (getuid() != 0 && home_dir != NULL)
+ directory = home_dir;
+ snprintf(buffer, sizeof(buffer) - 1, HUGEPAGE_FILE_FMT, directory,
+ internal_config.hugefile_prefix);
+ return buffer;
+}
+
/** String format for hugepage map files. */
#define HUGEFILE_FMT "%s/%smap_%d"
#define TEMP_HUGEFILE_FMT "%s/%smap_temp_%d"
diff --git a/lib/librte_eal/common/eal_hugepages.h b/lib/librte_eal/common/eal_hugepages.h
index ad1b0b6..4582f19 100644
--- a/lib/librte_eal/common/eal_hugepages.h
+++ b/lib/librte_eal/common/eal_hugepages.h
@@ -26,9 +26,15 @@ struct hugepage_file {
};

/**
- * Read the information from linux on what hugepages are available
- * for the EAL to use
+ * Read the information on what hugepages are available for the EAL to use,
+ * clearing out any unused ones.
*/
int eal_hugepage_info_init(void);

+/**
+ * Read whatever information primary process has shared about hugepages into
+ * secondary process.
+ */
+int eal_hugepage_info_read(void);
+
#endif /* EAL_HUGEPAGES_H */
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 9d33cf4..c4cbf3a 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -21,7 +21,7 @@
*/
struct hugepage_info {
uint64_t hugepage_sz; /**< size of a huge page */
- const char *hugedir; /**< dir where hugetlbfs is mounted */
+ char hugedir[PATH_MAX]; /**< dir where hugetlbfs is mounted */
uint32_t num_pages[RTE_MAX_NUMA_NODES];
/**< number of hugepages of that size on each socket */
int lock_descriptor; /**< file descriptor for hugepage dir */
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 2c12811..e7c6dcf 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -807,13 +807,17 @@ rte_eal_init(int argc, char **argv)
"KNI module inserted\n");
}

- if (internal_config.no_hugetlbfs == 0 &&
- internal_config.process_type != RTE_PROC_SECONDARY &&
- eal_hugepage_info_init() < 0) {
- rte_eal_init_alert("Cannot get hugepage information.");
- rte_errno = EACCES;
- rte_atomic32_clear(&run_once);
- return -1;
+ if (internal_config.no_hugetlbfs == 0) {
+ /* rte_config isn't initialized yet */
+ ret = internal_config.process_type == RTE_PROC_PRIMARY ?
+ eal_hugepage_info_init() :
+ eal_hugepage_info_read();
+ if (ret < 0) {
+ rte_eal_init_alert("Cannot get hugepage information.");
+ rte_errno = EACCES;
+ rte_atomic32_clear(&run_once);
+ return -1;
+ }
}

if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 2e0819f..fb4b667 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -14,6 +14,7 @@
#include <stdarg.h>
#include <unistd.h>
#include <errno.h>
+#include <sys/mman.h>
#include <sys/queue.h>
#include <sys/stat.h>

@@ -33,6 +34,39 @@
static const char sys_dir_path[] = "/sys/kernel/mm/hugepages";
static const char sys_pages_numa_dir_path[] = "/sys/devices/system/node";

+/*
+ * Uses mmap to create a shared memory area for storage of data
+ * Used in this file to store the hugepage file map on disk
+ */
+static void *
+map_shared_memory(const char *filename, const size_t mem_size, int flags)
+{
+ void *retval;
+ int fd = open(filename, flags, 0666);
+ if (fd < 0)
+ return NULL;
+ if (ftruncate(fd, mem_size) < 0) {
+ close(fd);
+ return NULL;
+ }
+ retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+ close(fd);
+ return retval;
+}
+
+static void *
+open_shared_memory(const char *filename, const size_t mem_size)
+{
+ return map_shared_memory(filename, mem_size, O_RDWR);
+}
+
+static void *
+create_shared_memory(const char *filename, const size_t mem_size)
+{
+ return map_shared_memory(filename, mem_size, O_RDWR | O_CREAT);
+}
+
/* this function is only called from eal_hugepage_info_init which itself
* is only called from a primary process */
static uint32_t
@@ -299,15 +333,9 @@ compare_hpi(const void *a, const void *b)
return hpi_b->hugepage_sz - hpi_a->hugepage_sz;
}

-/*
- * when we initialize the hugepage info, everything goes
- * to socket 0 by default. it will later get sorted by memory
- * initialization procedure.
- */
-int
-eal_hugepage_info_init(void)
-{
- const char dirent_start_text[] = "hugepages-";
+static int
+hugepage_info_init(void)
+{ const char dirent_start_text[] = "hugepages-";
const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
unsigned int i, total_pages, num_sizes = 0;
DIR *dir;
@@ -323,6 +351,7 @@ eal_hugepage_info_init(void)

for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) {
struct hugepage_info *hpi;
+ const char *hugedir;

if (strncmp(dirent->d_name, dirent_start_text,
dirent_start_len) != 0)
@@ -334,10 +363,10 @@ eal_hugepage_info_init(void)
hpi = &internal_config.hugepage_info[num_sizes];
hpi->hugepage_sz =
rte_str_to_size(&dirent->d_name[dirent_start_len]);
- hpi->hugedir = get_hugepage_dir(hpi->hugepage_sz);
+ hugedir = get_hugepage_dir(hpi->hugepage_sz);

/* first, check if we have a mountpoint */
- if (hpi->hugedir == NULL) {
+ if (hugedir == NULL) {
uint32_t num_pages;

num_pages = get_num_hugepages(dirent->d_name);
@@ -349,6 +378,7 @@ eal_hugepage_info_init(void)
num_pages, hpi->hugepage_sz);
continue;
}
+ snprintf(hpi->hugedir, sizeof(hpi->hugedir), "%s", hugedir);

/* try to obtain a writelock */
hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY);
@@ -411,13 +441,11 @@ eal_hugepage_info_init(void)
for (i = 0; i < num_sizes; i++) {
/* pages may no longer all be on socket 0, so check all */
unsigned int j, num_pages = 0;
+ struct hugepage_info *hpi = &internal_config.hugepage_info[i];

- for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
- struct hugepage_info *hpi =
- &internal_config.hugepage_info[i];
+ for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
num_pages += hpi->num_pages[j];
- }
- if (internal_config.hugepage_info[i].hugedir != NULL &&
+ if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0 &&
num_pages > 0)
return 0;
}
@@ -425,3 +453,64 @@ eal_hugepage_info_init(void)
/* no valid hugepage mounts available, return error */
return -1;
}
+
+/*
+ * when we initialize the hugepage info, everything goes
+ * to socket 0 by default. it will later get sorted by memory
+ * initialization procedure.
+ */
+int
+eal_hugepage_info_init(void)
+{
+ struct hugepage_info *hpi, *tmp_hpi;
+ unsigned int i;
+
+ if (hugepage_info_init() < 0)
+ return -1;
+
+ hpi = &internal_config.hugepage_info[0];
+
+ tmp_hpi = create_shared_memory(eal_hugepage_info_path(),
+ sizeof(internal_config.hugepage_info));
+ if (tmp_hpi == NULL) {
+ RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
+ return -1;
+ }
+
+ memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info));
+
+ /* we've copied file descriptors along with everything else, but they
+ * will be invalid in secondary process, so overwrite them
+ */
+ for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) {
+ struct hugepage_info *tmp = &tmp_hpi[i];
+ tmp->lock_descriptor = -1;
+ }
+
+ if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {
+ RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
+ return -1;
+ }
+ return 0;
+}
+
+int eal_hugepage_info_read(void)
+{
+ struct hugepage_info *hpi = &internal_config.hugepage_info[0];
+ struct hugepage_info *tmp_hpi;
+
+ tmp_hpi = open_shared_memory(eal_hugepage_info_path(),
+ sizeof(internal_config.hugepage_info));
+ if (tmp_hpi == NULL) {
+ RTE_LOG(ERR, EAL, "Failed to open shared memory!\n");
+ return -1;
+ }
+
+ memcpy(hpi, tmp_hpi, sizeof(internal_config.hugepage_info));
+
+ if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {
+ RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
+ return -1;
+ }
+ return 0;
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index c51d598..efa1202 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -1060,7 +1060,7 @@ get_socket_mem_size(int socket)

for (i = 0; i < internal_config.num_hugepage_sizes; i++){
struct hugepage_info *hpi = &internal_config.hugepage_info[i];
- if (hpi->hugedir != NULL)
+ if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0)
size += hpi->hugepage_sz * hpi->num_pages[socket];
}

@@ -1160,7 +1160,8 @@ calc_num_pages_per_socket(uint64_t * memory,
for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
/* skips if the memory on specific socket wasn't requested */
for (i = 0; i < num_hp_info && memory[socket] != 0; i++){
- hp_used[i].hugedir = hp_info[i].hugedir;
+ snprintf(hp_used[i].hugedir, sizeof(hp_used[i].hugedir),
+ "%s", hp_info[i].hugedir);
hp_used[i].num_pages[socket] = RTE_MIN(
memory[socket] / hp_info[i].hugepage_sz,
hp_info[i].num_pages[socket]);
@@ -1235,7 +1236,7 @@ eal_get_hugepage_mem_size(void)

for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
struct hugepage_info *hpi = &internal_config.hugepage_info[i];
- if (hpi->hugedir != NULL) {
+ if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) {
for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
size += hpi->hugepage_sz * hpi->num_pages[j];
}
@@ -1508,7 +1509,7 @@ eal_legacy_hugepage_init(void)
}

/* create shared memory */
- hugepage = create_shared_memory(eal_hugepage_info_path(),
+ hugepage = create_shared_memory(eal_hugepage_file_path(),
nr_hugefiles * sizeof(struct hugepage_file));

if (hugepage == NULL) {
@@ -1693,16 +1694,16 @@ eal_legacy_hugepage_attach(void)

test_phys_addrs_available();

- fd_hugepage = open(eal_hugepage_info_path(), O_RDONLY);
+ fd_hugepage = open(eal_hugepage_file_path(), O_RDONLY);
if (fd_hugepage < 0) {
- RTE_LOG(ERR, EAL, "Could not open %s\n", eal_hugepage_info_path());
+ RTE_LOG(ERR, EAL, "Could not open %s\n", eal_hugepage_file_path());
goto error;
}

size = getFileSize(fd_hugepage);
hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0);
if (hp == MAP_FAILED) {
- RTE_LOG(ERR, EAL, "Could not mmap %s\n", eal_hugepage_info_path());
+ RTE_LOG(ERR, EAL, "Could not mmap %s\n", eal_hugepage_file_path());
goto error;
}
--
2.7.4
Anatoly Burakov
2018-04-03 23:22:03 UTC
Permalink
Nothing uses this code yet. The bulk of it is copied from old
memory allocation code (linuxapp eal_memory.c). We provide an
EAL-internal API to allocate either one page or multiple pages,
guaranteeing that we'll get contiguous VA for all of the pages
that we requested.
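
For example, with the prototypes added to eal_memalloc.h below, a caller could
request four VA-contiguous 2M pages on socket 0 roughly like this (hypothetical
caller, sketch only):

struct rte_memseg *ms[4];

/* 'exact' asks for all-or-nothing rather than best-effort allocation */
if (eal_memalloc_alloc_seg_bulk(ms, 4, RTE_PGSIZE_2M, 0, true) < 0)
        return -1;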

Not supported on FreeBSD.

Locking is done via fcntl() because that way, when it comes to
taking out write locks or unlocking on deallocation, we don't
have to keep original fd's around. Plus, using fcntl() gives us the
ability to lock parts of a file, which is useful for the single-file
segments coming down the line.
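
For reference, locking part of a file with fcntl() looks roughly like the
following (illustrative only, not the exact code in this patch; 'fd',
'seg_idx' and 'page_sz' are assumed to be provided by the caller):

struct flock lck;

memset(&lck, 0, sizeof(lck));
lck.l_type = F_WRLCK;              /* or F_RDLCK / F_UNLCK */
lck.l_whence = SEEK_SET;
lck.l_start = seg_idx * page_sz;   /* lock only this segment's byte range */
lck.l_len = page_sz;
if (fcntl(fd, F_SETLK, &lck) == -1)
        return -1;                 /* range is already locked by another process */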

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3:
- Compile fixes for various platforms
- Split single file segments stuff into separate commit

v3:
- Split single file segments into separate patch
- Added missing FreeBSD implementation
- Removed rte_panic when unable to free page

v3:
- Added single file segments functionality in this
commit, instead of later commits

lib/librte_eal/bsdapp/eal/Makefile | 1 +
lib/librte_eal/bsdapp/eal/eal_memalloc.c | 26 ++
lib/librte_eal/bsdapp/eal/meson.build | 1 +
lib/librte_eal/common/eal_memalloc.h | 31 +++
lib/librte_eal/linuxapp/eal/Makefile | 2 +
lib/librte_eal/linuxapp/eal/eal_memalloc.c | 429 +++++++++++++++++++++++++++++
lib/librte_eal/linuxapp/eal/meson.build | 1 +
7 files changed, 491 insertions(+)
create mode 100644 lib/librte_eal/bsdapp/eal/eal_memalloc.c
create mode 100644 lib/librte_eal/common/eal_memalloc.h
create mode 100644 lib/librte_eal/linuxapp/eal/eal_memalloc.c

diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile
index 1b43d77..19f9322 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -29,6 +29,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_memory.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_hugepage_info.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_thread.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_debug.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_memalloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_lcore.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_timer.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_interrupts.c
diff --git a/lib/librte_eal/bsdapp/eal/eal_memalloc.c b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
new file mode 100644
index 0000000..8c30670
--- /dev/null
+++ b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#include <inttypes.h>
+
+#include <rte_log.h>
+#include <rte_memory.h>
+
+#include "eal_memalloc.h"
+
+int
+eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms __rte_unused,
+ int __rte_unused n_segs, size_t __rte_unused page_sz,
+ int __rte_unused socket, bool __rte_unused exact)
+{
+ RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
+ return -1;
+}
+
+struct rte_memseg *
+eal_memalloc_alloc_seg(size_t __rte_unused page_sz, int __rte_unused socket)
+{
+ RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
+ return NULL;
+}
diff --git a/lib/librte_eal/bsdapp/eal/meson.build b/lib/librte_eal/bsdapp/eal/meson.build
index e83fc91..4b40223 100644
--- a/lib/librte_eal/bsdapp/eal/meson.build
+++ b/lib/librte_eal/bsdapp/eal/meson.build
@@ -8,6 +8,7 @@ env_sources = files('eal_alarm.c',
'eal_hugepage_info.c',
'eal_interrupts.c',
'eal_lcore.c',
+ 'eal_memalloc.c',
'eal_thread.c',
'eal_timer.c',
'eal.c',
diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
new file mode 100644
index 0000000..f628514
--- /dev/null
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#ifndef EAL_MEMALLOC_H
+#define EAL_MEMALLOC_H
+
+#include <stdbool.h>
+
+#include <rte_memory.h>
+
+/*
+ * Allocate segment of specified page size.
+ */
+struct rte_memseg *
+eal_memalloc_alloc_seg(size_t page_sz, int socket);
+
+/*
+ * Allocate `n_segs` segments.
+ *
+ * Note: `ms` can be NULL.
+ *
+ * Note: it is possible to request best-effort allocation by setting `exact` to
+ * `false`, in which case allocator will return however many pages it managed to
+ * allocate successfully.
+ */
+int
+eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
+ int socket, bool exact);
+
+#endif // EAL_MEMALLOC_H
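
As a usage-level illustration of this internal API (hypothetical
caller, not part of the patch), a best-effort request for several
VA-contiguous pages might look like this:

#include <stdbool.h>
#include <stdlib.h>

#include <rte_memory.h>

#include "eal_memalloc.h"

/* Try to grow by up to n_pages pages of the given size on a socket.
 * With exact == false the allocator returns however many segments it
 * managed to allocate; they are VA-contiguous, starting at ms[0]->addr.
 */
static int
grow_by_pages(size_t page_sz, int socket, int n_pages)
{
	struct rte_memseg **ms;
	int allocd;

	ms = malloc(sizeof(*ms) * n_pages);
	if (ms == NULL)
		return -1;

	allocd = eal_memalloc_alloc_seg_bulk(ms, n_pages, page_sz,
			socket, false);
	if (allocd <= 0) {
		free(ms);
		return -1;
	}

	/* ... use ms[0]->addr .. ms[allocd - 1] here ... */

	free(ms);
	return allocd;
}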
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index c407a43..af6b9be 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -36,6 +36,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_thread.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_log.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio_mp_sync.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memalloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_debug.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_lcore.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_timer.c
@@ -82,6 +83,7 @@ CFLAGS_eal_interrupts.o := -D_GNU_SOURCE
CFLAGS_eal_vfio_mp_sync.o := -D_GNU_SOURCE
CFLAGS_eal_timer.o := -D_GNU_SOURCE
CFLAGS_eal_lcore.o := -D_GNU_SOURCE
+CFLAGS_eal_memalloc.o := -D_GNU_SOURCE
CFLAGS_eal_thread.o := -D_GNU_SOURCE
CFLAGS_eal_log.o := -D_GNU_SOURCE
CFLAGS_eal_common_log.o := -D_GNU_SOURCE
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
new file mode 100644
index 0000000..45ea0ad
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -0,0 +1,429 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#define _FILE_OFFSET_BITS 64
+#include <errno.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/queue.h>
+#include <sys/file.h>
+#include <unistd.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+#include <numa.h>
+#include <numaif.h>
+#endif
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_eal_memconfig.h>
+#include <rte_eal.h>
+#include <rte_memory.h>
+#include <rte_spinlock.h>
+
+#include "eal_filesystem.h"
+#include "eal_internal_cfg.h"
+#include "eal_memalloc.h"
+
+static sigjmp_buf huge_jmpenv;
+
+static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
+{
+ siglongjmp(huge_jmpenv, 1);
+}
+
+/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
+ * non-static local variable in the stack frame calling sigsetjmp might be
+ * clobbered by a call to longjmp.
+ */
+static int __rte_unused huge_wrap_sigsetjmp(void)
+{
+ return sigsetjmp(huge_jmpenv, 1);
+}
+
+static struct sigaction huge_action_old;
+static int huge_need_recover;
+
+static void __rte_unused
+huge_register_sigbus(void)
+{
+ sigset_t mask;
+ struct sigaction action;
+
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGBUS);
+ action.sa_flags = 0;
+ action.sa_mask = mask;
+ action.sa_handler = huge_sigbus_handler;
+
+ huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
+}
+
+static void __rte_unused
+huge_recover_sigbus(void)
+{
+ if (huge_need_recover) {
+ sigaction(SIGBUS, &huge_action_old, NULL);
+ huge_need_recover = 0;
+ }
+}
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+static bool
+check_numa(void)
+{
+ bool ret = true;
+ /* Check if kernel supports NUMA. */
+ if (numa_available() != 0) {
+ RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
+ ret = false;
+ }
+ return ret;
+}
+
+static void
+prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id)
+{
+ RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
+ if (get_mempolicy(oldpolicy, oldmask->maskp,
+ oldmask->size + 1, 0, 0) < 0) {
+ RTE_LOG(ERR, EAL,
+ "Failed to get current mempolicy: %s. "
+ "Assuming MPOL_DEFAULT.\n", strerror(errno));
+ *oldpolicy = MPOL_DEFAULT;
+ }
+ RTE_LOG(DEBUG, EAL,
+ "Setting policy MPOL_PREFERRED for socket %d\n",
+ socket_id);
+ numa_set_preferred(socket_id);
+}
+
+static void
+resotre_numa(int *oldpolicy, struct bitmask *oldmask)
+{
+ RTE_LOG(DEBUG, EAL,
+ "Restoring previous memory policy: %d\n", *oldpolicy);
+ if (*oldpolicy == MPOL_DEFAULT) {
+ numa_set_localalloc();
+ } else if (set_mempolicy(*oldpolicy, oldmask->maskp,
+ oldmask->size + 1) < 0) {
+ RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
+ strerror(errno));
+ numa_set_localalloc();
+ }
+ numa_free_cpumask(oldmask);
+}
+#endif
+
+static int
+get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
+ unsigned int list_idx, unsigned int seg_idx)
+{
+ int fd;
+ eal_get_hugefile_path(path, buflen, hi->hugedir,
+ list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
+ fd = open(path, O_CREAT | O_RDWR, 0600);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
+ strerror(errno));
+ return -1;
+ }
+ return fd;
+}
+
+/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
+static int lock(int fd, uint64_t offset, uint64_t len, int type)
+{
+ struct flock lck;
+ int ret;
+
+ memset(&lck, 0, sizeof(lck));
+
+ lck.l_type = type;
+ lck.l_whence = SEEK_SET;
+ lck.l_start = offset;
+ lck.l_len = len;
+
+ ret = fcntl(fd, F_SETLK, &lck);
+
+ if (ret && (errno == EAGAIN || errno == EACCES)) {
+ /* locked by another process, not an error */
+ return 0;
+ } else if (ret) {
+ RTE_LOG(ERR, EAL, "%s(): error calling fcntl(): %s\n",
+ __func__, strerror(errno));
+ /* we've encountered an unexpected error */
+ return -1;
+ }
+ return 1;
+}
+
+static int
+alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
+ struct hugepage_info *hi, unsigned int list_idx,
+ unsigned int seg_idx)
+{
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ int cur_socket_id = 0;
+#endif
+ uint64_t map_offset;
+ char path[PATH_MAX];
+ int ret = 0;
+ int fd;
+ size_t alloc_sz;
+
+ fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+ if (fd < 0)
+ return -1;
+
+ alloc_sz = hi->hugepage_sz;
+
+ map_offset = 0;
+ if (ftruncate(fd, alloc_sz) < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+ __func__, strerror(errno));
+ goto resized;
+ }
+ /* we've allocated a page - take out a read lock. we're using fcntl()
+ * locks rather than flock() here because doing that gives us one huge
+ * advantage - fcntl() locks are per-process, not per-file descriptor,
+ * which means that we don't have to keep the original fd's around to
+ * keep a lock on the file.
+ *
+ * this is useful, because when it comes to unmapping pages, we will
+ * have to take out a write lock (to figure out if another process still
+ * has this page mapped), and to do it with flock() we'll have to use
+ * original fd, as lock is associated with that particular fd. with
+ * fcntl(), this is not necessary - we can open a new fd and use fcntl()
+ * on that.
+ */
+ ret = lock(fd, map_offset, alloc_sz, F_RDLCK);
+
+ /* this should not fail */
+ if (ret != 1) {
+ RTE_LOG(ERR, EAL, "%s(): error locking file: %s\n",
+ __func__,
+ strerror(errno));
+ goto resized;
+ }
+
+ /*
+ * map the segment, and populate page tables, the kernel fills this
+ * segment with zeros if it's a new page.
+ */
+ void *va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, map_offset);
+ close(fd);
+
+ if (va == MAP_FAILED) {
+ RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
+ strerror(errno));
+ goto resized;
+ }
+ if (va != addr) {
+ RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__);
+ goto mapped;
+ }
+
+ rte_iova_t iova = rte_mem_virt2iova(addr);
+ if (iova == RTE_BAD_PHYS_ADDR) {
+ RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
+ __func__);
+ goto mapped;
+ }
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ move_pages(getpid(), 1, &addr, NULL, &cur_socket_id, 0);
+
+ if (cur_socket_id != socket_id) {
+ RTE_LOG(DEBUG, EAL,
+ "%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
+ __func__, socket_id, cur_socket_id);
+ goto mapped;
+ }
+#endif
+
+ /* In linux, hugetlb limitations, like cgroup, are
+ * enforced at fault time instead of mmap(), even
+ * with the option of MAP_POPULATE. Kernel will send
+ * a SIGBUS signal. To avoid being killed, save the stack
+ * environment here; if SIGBUS happens, we can jump
+ * back here.
+ */
+ if (huge_wrap_sigsetjmp()) {
+ RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n",
+ (unsigned int)(alloc_sz >> 20));
+ goto mapped;
+ }
+ *(int *)addr = *(int *)addr;
+
+ ms->addr = addr;
+ ms->hugepage_sz = alloc_sz;
+ ms->len = alloc_sz;
+ ms->nchannel = rte_memory_get_nchannel();
+ ms->nrank = rte_memory_get_nrank();
+ ms->iova = iova;
+ ms->socket_id = socket_id;
+
+ return 0;
+
+mapped:
+ munmap(addr, alloc_sz);
+resized:
+ close(fd);
+ unlink(path);
+ return -1;
+}
+
+struct alloc_walk_param {
+ struct hugepage_info *hi;
+ struct rte_memseg **ms;
+ size_t page_sz;
+ unsigned int segs_allocated;
+ unsigned int n_segs;
+ int socket;
+ bool exact;
+};
+static int
+alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct alloc_walk_param *wa = arg;
+ struct rte_memseg_list *cur_msl;
+ size_t page_sz;
+ int cur_idx;
+ unsigned int msl_idx, need, i;
+
+ if (msl->page_sz != wa->page_sz)
+ return 0;
+ if (msl->socket_id != wa->socket)
+ return 0;
+
+ page_sz = (size_t)msl->page_sz;
+
+ msl_idx = msl - mcfg->memsegs;
+ cur_msl = &mcfg->memsegs[msl_idx];
+
+ need = wa->n_segs;
+
+ /* try finding space in memseg list */
+ cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, need);
+ if (cur_idx < 0)
+ return 0;
+
+ for (i = 0; i < need; i++, cur_idx++) {
+ struct rte_memseg *cur;
+ void *map_addr;
+
+ cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
+ map_addr = RTE_PTR_ADD(cur_msl->base_va,
+ cur_idx * page_sz);
+
+ if (alloc_seg(cur, map_addr, wa->socket, wa->hi,
+ msl_idx, cur_idx)) {
+ RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n",
+ need, i);
+
+ /* if exact number wasn't requested, stop */
+ if (!wa->exact)
+ goto out;
+ return -1;
+ }
+ if (wa->ms)
+ wa->ms[i] = cur;
+
+ rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx);
+ }
+out:
+ wa->segs_allocated = i;
+ return 1;
+
+}
+
+int
+eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
+ int socket, bool exact)
+{
+ int i, ret = -1;
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ bool have_numa = false;
+ int oldpolicy;
+ struct bitmask *oldmask;
+#endif
+ struct alloc_walk_param wa;
+ struct hugepage_info *hi = NULL;
+
+ memset(&wa, 0, sizeof(wa));
+
+ /* dynamic allocation not supported in legacy mode */
+ if (internal_config.legacy_mem)
+ return -1;
+
+ for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) {
+ if (page_sz ==
+ internal_config.hugepage_info[i].hugepage_sz) {
+ hi = &internal_config.hugepage_info[i];
+ break;
+ }
+ }
+ if (!hi) {
+ RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n",
+ __func__);
+ return -1;
+ }
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ if (check_numa()) {
+ oldmask = numa_allocate_nodemask();
+ prepare_numa(&oldpolicy, oldmask, socket);
+ have_numa = true;
+ }
+#endif
+
+ wa.exact = exact;
+ wa.hi = hi;
+ wa.ms = ms;
+ wa.n_segs = n_segs;
+ wa.page_sz = page_sz;
+ wa.socket = socket;
+ wa.segs_allocated = 0;
+
+ ret = rte_memseg_list_walk(alloc_seg_walk, &wa);
+ if (ret == 0) {
+ RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
+ __func__);
+ ret = -1;
+ } else if (ret > 0) {
+ ret = (int)wa.segs_allocated;
+ }
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ if (have_numa)
+ resotre_numa(&oldpolicy, oldmask);
+#endif
+ return ret;
+}
+
+struct rte_memseg *
+eal_memalloc_alloc_seg(size_t page_sz, int socket)
+{
+ struct rte_memseg *ms;
+ if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0)
+ return NULL;
+ /* return pointer to newly allocated memseg */
+ return ms;
+}
diff --git a/lib/librte_eal/linuxapp/eal/meson.build b/lib/librte_eal/linuxapp/eal/meson.build
index 03974ff..5254c6c 100644
--- a/lib/librte_eal/linuxapp/eal/meson.build
+++ b/lib/librte_eal/linuxapp/eal/meson.build
@@ -10,6 +10,7 @@ env_sources = files('eal_alarm.c',
'eal_debug.c',
'eal_hugepage_info.c',
'eal_interrupts.c',
+ 'eal_memalloc.c',
'eal_lcore.c',
'eal_log.c',
'eal_thread.c',
--
2.7.4
Anatoly Burakov
2018-04-03 23:22:15 UTC
Permalink
Callbacks will be triggered just after allocation and just
before deallocation, to ensure that the memory address space
referenced in the callback is always valid by the time the
callback is called.
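
To make the ordering concrete, here is a rough sketch (illustration
only, using plain mmap()/munmap() as stand-ins for the real page
allocator) of when the notifications fire relative to the mapping
operations:

#include <stddef.h>
#include <sys/mman.h>

#include <rte_memory.h>

#include "eal_memalloc.h"

static void *
grow(size_t len)
{
	void *va = mmap(NULL, len, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (va == MAP_FAILED)
		return NULL;
	/* notify only after the area is mapped, so that the address
	 * range is already valid inside the callbacks */
	eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, va, len);
	return va;
}

static void
shrink(void *va, size_t len)
{
	/* notify before unmapping, so that the address range is
	 * still valid inside the callbacks */
	eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, va, len);
	munmap(va, len);
}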

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/malloc_heap.c | 21 +++++++++++++++++++++
lib/librte_eal/linuxapp/eal/eal_memalloc.c | 30 ++++++++++++++++++++++++++++++
lib/librte_eal/linuxapp/eal/eal_vfio.c | 15 +++++++++++++--
3 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index be39250..18c7b69 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -241,6 +241,7 @@ try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz,
void *map_addr;
size_t alloc_sz;
int n_segs;
+ bool callback_triggered = false;

alloc_sz = RTE_ALIGN_CEIL(align + elt_size +
MALLOC_ELEM_TRAILER_LEN, pg_sz);
@@ -262,12 +263,22 @@ try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz,

map_addr = ms[0]->addr;

+ /* notify user about changes in memory map */
+ eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, map_addr, alloc_sz);
+
/* notify other processes that this has happened */
if (request_sync()) {
/* we couldn't ensure all processes have mapped memory,
* so free it back and notify everyone that it's been
* freed back.
+ *
+ * technically, we could've avoided adding memory addresses to
+ * the map, but that would've led to inconsistent behavior
+ * between primary and secondary processes, as those get
+ * callbacks during sync. therefore, force primary process to
+ * do alloc-and-rollback syncs as well.
*/
+ callback_triggered = true;
goto free_elem;
}
heap->total_size += alloc_sz;
@@ -280,6 +291,10 @@ try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz,
return 0;

free_elem:
+ if (callback_triggered)
+ eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
+ map_addr, alloc_sz);
+
rollback_expand_heap(ms, n_segs, elem, map_addr, alloc_sz);

request_sync();
@@ -642,6 +657,10 @@ malloc_heap_free(struct malloc_elem *elem)
heap->total_size -= aligned_len;

if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ /* notify user about changes in memory map */
+ eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
+ aligned_start, aligned_len);
+
/* don't care if any of this fails */
malloc_heap_free_pages(aligned_start, aligned_len);

@@ -666,6 +685,8 @@ malloc_heap_free(struct malloc_elem *elem)
* already removed from the heap, so it is, for all intents and
* purposes, hidden from the rest of DPDK even if some other
* process (including this one) may have these pages mapped.
+ *
+ * notifications about deallocated memory happen during sync.
*/
request_to_primary(&req);
}
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index 8202a1b..87c1bdb 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -877,6 +877,21 @@ sync_chunk(struct rte_memseg_list *primary_msl,

diff_len = RTE_MIN(chunk_len, diff_len);

+ /* if we are freeing memory, notify the application */
+ if (!used) {
+ struct rte_memseg *ms;
+ void *start_va;
+ size_t len, page_sz;
+
+ ms = rte_fbarray_get(l_arr, start);
+ start_va = ms->addr;
+ page_sz = (size_t)primary_msl->page_sz;
+ len = page_sz * diff_len;
+
+ eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
+ start_va, len);
+ }
+
for (i = 0; i < diff_len; i++) {
struct rte_memseg *p_ms, *l_ms;
int seg_idx = start + i;
@@ -902,6 +917,21 @@ sync_chunk(struct rte_memseg_list *primary_msl,
}
}

+ /* if we just allocated memory, notify the application */
+ if (used) {
+ struct rte_memseg *ms;
+ void *start_va;
+ size_t len, page_sz;
+
+ ms = rte_fbarray_get(l_arr, start);
+ start_va = ms->addr;
+ page_sz = (size_t)primary_msl->page_sz;
+ len = page_sz * diff_len;
+
+ eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
+ start_va, len);
+ }
+
/* calculate how much we can advance until next chunk */
diff_len = used ?
rte_fbarray_find_contig_used(l_arr, start) :
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index cbfe3c9..8b5f8fd 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -931,6 +931,7 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
create.levels = 1;

if (do_map) {
+ void *addr;
/* re-create window and remap the entire memory */
if (iova > create.window_size) {
if (vfio_spapr_create_new_dma_window(vfio_container_fd,
@@ -946,9 +947,19 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
}
/* now that we've remapped all of the memory that was present
* before, map the segment that we were requested to map.
+ *
+ * however, if we were called by the callback, the memory we
+ * were called with was already in the memseg list, so previous
+ * mapping should've mapped that segment already.
+ *
+ * virt2memseg_list is a relatively cheap check, so use that. if
+ * memory is within any memseg list, it's a memseg, so it's
+ * already mapped.
*/
- if (vfio_spapr_dma_do_map(vfio_container_fd,
- vaddr, iova, len, 1) < 0) {
+ addr = (void *)(uintptr_t)vaddr;
+ if (rte_mem_virt2memseg_list(addr) == NULL &&
+ vfio_spapr_dma_do_map(vfio_container_fd,
+ vaddr, iova, len, 1) < 0) {
RTE_LOG(ERR, EAL, "Could not map segment\n");
return -1;
}
--
2.7.4
Anatoly Burakov
2018-04-03 23:22:14 UTC
Permalink
Each process will have its own callbacks. Callbacks will indicate
whether it's an allocation or a deallocation that has happened, and
will also provide the start VA address and length of the affected
block.

Since memory hotplug isn't supported on FreeBSD or in legacy mem
mode, it will not be possible to register callbacks in either case.
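
A minimal usage sketch of the new public API (the callback body and
its name are hypothetical):

#include <stddef.h>
#include <stdio.h>

#include <rte_errno.h>
#include <rte_memory.h>

/* Hypothetical callback: log every change to the DPDK memory map. */
static void
mem_event_cb(enum rte_mem_event event, const void *addr, size_t len)
{
	printf("mem event: %s addr=%p len=%zu\n",
		event == RTE_MEM_EVENT_ALLOC ? "alloc" : "free",
		addr, len);
}

static int
register_mem_event_cb(void)
{
	/* fails with rte_errno == ENOTSUP in legacy mem mode and on
	 * FreeBSD, where memory hotplug is not supported */
	if (rte_mem_event_callback_register("example_cb",
			mem_event_cb) < 0) {
		printf("cannot register callback, rte_errno %d\n",
			rte_errno);
		return -1;
	}
	return 0;
}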

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3:
- Made API experimental
- Compile fixes

lib/librte_eal/common/eal_common_memalloc.c | 133 ++++++++++++++++++++++++++++
lib/librte_eal/common/eal_common_memory.c | 28 ++++++
lib/librte_eal/common/eal_memalloc.h | 11 +++
lib/librte_eal/common/include/rte_memory.h | 49 ++++++++++
lib/librte_eal/rte_eal_version.map | 2 +
5 files changed, 223 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_memalloc.c b/lib/librte_eal/common/eal_common_memalloc.c
index 607ec3f..2d2d46f 100644
--- a/lib/librte_eal/common/eal_common_memalloc.c
+++ b/lib/librte_eal/common/eal_common_memalloc.c
@@ -2,16 +2,46 @@
* Copyright(c) 2017-2018 Intel Corporation
*/

+#include <string.h>
+
+#include <rte_errno.h>
#include <rte_lcore.h>
#include <rte_fbarray.h>
#include <rte_memzone.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>
+#include <rte_rwlock.h>

#include "eal_private.h"
#include "eal_internal_cfg.h"
#include "eal_memalloc.h"

+struct mem_event_callback_entry {
+ TAILQ_ENTRY(mem_event_callback_entry) next;
+ char name[RTE_MEM_EVENT_CALLBACK_NAME_LEN];
+ rte_mem_event_callback_t clb;
+};
+
+/** Double linked list of actions. */
+TAILQ_HEAD(mem_event_callback_entry_list, mem_event_callback_entry);
+
+static struct mem_event_callback_entry_list mem_event_callback_list =
+ TAILQ_HEAD_INITIALIZER(mem_event_callback_list);
+
+static rte_rwlock_t mem_event_rwlock = RTE_RWLOCK_INITIALIZER;
+
+static struct mem_event_callback_entry *
+find_mem_event_callback(const char *name)
+{
+ struct mem_event_callback_entry *r;
+
+ TAILQ_FOREACH(r, &mem_event_callback_list, next) {
+ if (!strcmp(r->name, name))
+ break;
+ }
+ return r;
+}
+
bool
eal_memalloc_is_contig(const struct rte_memseg_list *msl, void *start,
size_t len)
@@ -88,3 +118,106 @@ eal_memalloc_is_contig(const struct rte_memseg_list *msl, void *start,
}
return true;
}
+
+int
+eal_memalloc_mem_event_callback_register(const char *name,
+ rte_mem_event_callback_t clb)
+{
+ struct mem_event_callback_entry *entry;
+ int ret, len;
+ if (name == NULL || clb == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ len = strnlen(name, RTE_MEM_EVENT_CALLBACK_NAME_LEN);
+ if (len == 0) {
+ rte_errno = EINVAL;
+ return -1;
+ } else if (len == RTE_MEM_EVENT_CALLBACK_NAME_LEN) {
+ rte_errno = ENAMETOOLONG;
+ return -1;
+ }
+ rte_rwlock_write_lock(&mem_event_rwlock);
+
+ entry = find_mem_event_callback(name);
+ if (entry != NULL) {
+ rte_errno = EEXIST;
+ ret = -1;
+ goto unlock;
+ }
+
+ entry = malloc(sizeof(*entry));
+ if (entry == NULL) {
+ rte_errno = ENOMEM;
+ ret = -1;
+ goto unlock;
+ }
+
+ /* callback successfully created and is valid, add it to the list */
+ entry->clb = clb;
+ snprintf(entry->name, RTE_MEM_EVENT_CALLBACK_NAME_LEN, "%s", name);
+ TAILQ_INSERT_TAIL(&mem_event_callback_list, entry, next);
+
+ ret = 0;
+
+ RTE_LOG(DEBUG, EAL, "Mem event callback '%s' registered\n", name);
+
+unlock:
+ rte_rwlock_write_unlock(&mem_event_rwlock);
+ return ret;
+}
+
+int
+eal_memalloc_mem_event_callback_unregister(const char *name)
+{
+ struct mem_event_callback_entry *entry;
+ int ret, len;
+
+ if (name == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ len = strnlen(name, RTE_MEM_EVENT_CALLBACK_NAME_LEN);
+ if (len == 0) {
+ rte_errno = EINVAL;
+ return -1;
+ } else if (len == RTE_MEM_EVENT_CALLBACK_NAME_LEN) {
+ rte_errno = ENAMETOOLONG;
+ return -1;
+ }
+ rte_rwlock_write_lock(&mem_event_rwlock);
+
+ entry = find_mem_event_callback(name);
+ if (entry == NULL) {
+ rte_errno = ENOENT;
+ ret = -1;
+ goto unlock;
+ }
+ TAILQ_REMOVE(&mem_event_callback_list, entry, next);
+ free(entry);
+
+ ret = 0;
+
+ RTE_LOG(DEBUG, EAL, "Mem event callback '%s' unregistered\n", name);
+
+unlock:
+ rte_rwlock_write_unlock(&mem_event_rwlock);
+ return ret;
+}
+
+void
+eal_memalloc_mem_event_notify(enum rte_mem_event event, const void *start,
+ size_t len)
+{
+ struct mem_event_callback_entry *entry;
+
+ rte_rwlock_read_lock(&mem_event_rwlock);
+
+ TAILQ_FOREACH(entry, &mem_event_callback_list, next) {
+ RTE_LOG(DEBUG, EAL, "Calling mem event callback %s",
+ entry->name);
+ entry->clb(event, start, len);
+ }
+
+ rte_rwlock_read_unlock(&mem_event_rwlock);
+}
diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index 0c4c1f5..e3ce69c 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -604,6 +604,34 @@ dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
return 0;
}

+/*
+ * Defining here because declared in rte_memory.h, but the actual implementation
+ * is in eal_common_memalloc.c, like all other memalloc internals.
+ */
+int __rte_experimental
+rte_mem_event_callback_register(const char *name, rte_mem_event_callback_t clb)
+{
+ /* FreeBSD boots with legacy mem enabled by default */
+ if (internal_config.legacy_mem) {
+ RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
+ return eal_memalloc_mem_event_callback_register(name, clb);
+}
+
+int __rte_experimental
+rte_mem_event_callback_unregister(const char *name)
+{
+ /* FreeBSD boots with legacy mem enabled by default */
+ if (internal_config.legacy_mem) {
+ RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
+ return eal_memalloc_mem_event_callback_unregister(name);
+}
+
/* Dump the physical memory layout on console */
void
rte_dump_physmem_layout(FILE *f)
diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
index 4a7b45c..4d27403 100644
--- a/lib/librte_eal/common/eal_memalloc.h
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -56,4 +56,15 @@ eal_memalloc_is_contig(const struct rte_memseg_list *msl, void *start,
int
eal_memalloc_sync_with_primary(void);

+int
+eal_memalloc_mem_event_callback_register(const char *name,
+ rte_mem_event_callback_t clb);
+
+int
+eal_memalloc_mem_event_callback_unregister(const char *name);
+
+void
+eal_memalloc_mem_event_notify(enum rte_mem_event event, const void *start,
+ size_t len);
+
#endif // EAL_MEMALLOC_H
diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index 55383c4..0de1198 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -290,6 +290,55 @@ unsigned rte_memory_get_nrank(void);
*/
int rte_eal_using_phys_addrs(void);

+
+/**
+ * Enum indicating which kind of memory event has happened. Used by callbacks to
+ * distinguish between memory allocations and deallocations.
+ */
+enum rte_mem_event {
+ RTE_MEM_EVENT_ALLOC = 0, /**< Allocation event. */
+ RTE_MEM_EVENT_FREE, /**< Deallocation event. */
+};
+#define RTE_MEM_EVENT_CALLBACK_NAME_LEN 64
+/**< maximum length of callback name */
+
+/**
+ * Function typedef used to register callbacks for memory events.
+ */
+typedef void (*rte_mem_event_callback_t)(enum rte_mem_event event_type,
+ const void *addr, size_t len);
+
+/**
+ * Function used to register callbacks for memory events.
+ *
+ * @param name
+ * Name associated with specified callback to be added to the list.
+ *
+ * @param clb
+ * Callback function pointer.
+ *
+ * @return
+ * 0 on successful callback register
+ * -1 on unsuccessful callback register, with rte_errno value indicating
+ * reason for failure.
+ */
+int __rte_experimental
+rte_mem_event_callback_register(const char *name, rte_mem_event_callback_t clb);
+
+/**
+ * Function used to unregister callbacks for memory events.
+ *
+ * @param name
+ * Name associated with specified callback to be removed from the list.
+ *
+ * @return
+ * 0 on successful callback unregister
+ * -1 on unsuccessful callback unregister, with rte_errno value indicating
+ * reason for failure.
+ */
+int __rte_experimental
+rte_mem_event_callback_unregister(const char *name);
+
#ifdef __cplusplus
}
#endif
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index df5802d..0460edb 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -238,6 +238,8 @@ EXPERIMENTAL {
rte_fbarray_set_used;
rte_log_register_type_and_pick_level;
rte_malloc_dump_heaps;
+ rte_mem_event_callback_register;
+ rte_mem_event_callback_unregister;
rte_mem_iova2virt;
rte_mem_virt2memseg;
rte_mem_virt2memseg_list;
--
2.7.4
Anatoly Burakov
2018-04-03 23:22:12 UTC
Permalink
This set of changes enables rte_malloc to allocate and free memory
as needed. Currently, it is disabled because legacy mem mode is
enabled unconditionally.

The way it works is, first malloc checks if there is enough memory
already allocated to satisfy the user's request. If there isn't, we
try to allocate more memory. The reverse happens with free - we free
an element, check its size (including free element merging due to
adjacency) and see if it's bigger than the hugepage size and whether
its start and end span a hugepage or more. Then we remove the area
from the malloc heap (adjusting element lengths where appropriate)
and deallocate the pages.

For legacy mode, runtime alloc/free of pages is disabled.

It is worth noting that memseg lists are sorted by page size, and
that we try our best to satisfy the user's request. That is, if the
user requests an element from 2MB page memory, we will check if we
can satisfy that request from existing memory; if not, we try to
allocate more 2MB pages. If that fails and the user also specified
a "size is hint" flag, we then check other page sizes and try to
allocate from there. If that fails too, then, depending on flags,
we may try allocating from other sockets. In other words, we try
our best to give the user what they asked for, but going to other
sockets is a last resort - first we try to allocate more memory on
the same socket.
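
From the caller's point of view the behaviour above is transparent;
a hypothetical usage sketch (not part of the patch):

#include <stddef.h>

#include <rte_lcore.h>
#include <rte_malloc.h>
#include <rte_memory.h>

/* Illustration only: an allocation that does not fit into already
 * reserved memory will now cause more hugepages to be allocated on
 * the requested socket; asking for SOCKET_ID_ANY lets the allocator
 * fall back to other sockets as a last resort.
 */
static void *
alloc_on_local_socket(size_t sz)
{
	int socket = rte_socket_id();
	void *p;

	/* may expand the per-socket malloc heap under the hood */
	p = rte_malloc_socket("example", sz, 0, socket);
	if (p == NULL)
		p = rte_malloc_socket("example", sz, 0, SOCKET_ID_ANY);
	return p;
}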

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3:
- Compile fixes

lib/librte_eal/common/eal_common_memzone.c | 26 +--
lib/librte_eal/common/malloc_elem.c | 86 +++++++
lib/librte_eal/common/malloc_elem.h | 3 +
lib/librte_eal/common/malloc_heap.c | 347 ++++++++++++++++++++++++++++-
lib/librte_eal/common/malloc_heap.h | 4 +-
lib/librte_eal/common/rte_malloc.c | 31 +--
6 files changed, 433 insertions(+), 64 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index aed9331..d522883 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -94,7 +94,7 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
struct rte_mem_config *mcfg;
struct rte_fbarray *arr;
size_t requested_len;
- int socket, i, mz_idx;
+ int mz_idx;

/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
@@ -179,29 +179,9 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
}
}

- if (socket_id == SOCKET_ID_ANY)
- socket = malloc_get_numa_socket();
- else
- socket = socket_id;
-
/* allocate memory on heap */
- void *mz_addr = malloc_heap_alloc(&mcfg->malloc_heaps[socket], NULL,
- requested_len, flags, align, bound, contig);
-
- if ((mz_addr == NULL) && (socket_id == SOCKET_ID_ANY)) {
- /* try other heaps */
- for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
- if (socket == i)
- continue;
-
- mz_addr = malloc_heap_alloc(&mcfg->malloc_heaps[i],
- NULL, requested_len, flags, align,
- bound, contig);
- if (mz_addr != NULL)
- break;
- }
- }
-
+ void *mz_addr = malloc_heap_alloc(NULL, requested_len, socket_id, flags,
+ align, bound, contig);
if (mz_addr == NULL) {
rte_errno = ENOMEM;
return NULL;
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 9db416f..4346532 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -447,6 +447,92 @@ malloc_elem_free(struct malloc_elem *elem)
return elem;
}

+/* assume all checks were already done */
+void
+malloc_elem_hide_region(struct malloc_elem *elem, void *start, size_t len)
+{
+ struct malloc_elem *hide_start, *hide_end, *prev, *next;
+ size_t len_before, len_after;
+
+ hide_start = start;
+ hide_end = RTE_PTR_ADD(start, len);
+
+ prev = elem->prev;
+ next = elem->next;
+
+ len_before = RTE_PTR_DIFF(hide_start, elem);
+ len_after = RTE_PTR_DIFF(next, hide_end);
+
+ if (len_after >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
+ /* split after */
+ split_elem(elem, hide_end);
+
+ malloc_elem_free_list_insert(hide_end);
+ } else if (len_after >= MALLOC_ELEM_HEADER_LEN) {
+ /* shrink current element */
+ elem->size -= len_after;
+ memset(hide_end, 0, sizeof(*hide_end));
+
+ /* copy next element's data to our pad */
+ memcpy(hide_end, next, sizeof(*hide_end));
+
+ /* pad next element */
+ next->state = ELEM_PAD;
+ next->pad = len_after;
+
+ /* next element is busy, would've been merged otherwise */
+ hide_end->pad = len_after;
+ hide_end->size += len_after;
+
+ /* adjust pointers to point to our new pad */
+ if (next->next)
+ next->next->prev = hide_end;
+ elem->next = hide_end;
+ } else if (len_after > 0) {
+ RTE_LOG(ERR, EAL, "Unaligned element, heap is probably corrupt\n");
+ rte_panic("blow up\n");
+ return;
+ }
+
+ if (len_before >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
+ /* split before */
+ split_elem(elem, hide_start);
+
+ prev = elem;
+ elem = hide_start;
+
+ malloc_elem_free_list_insert(prev);
+ } else if (len_before > 0) {
+ /*
+ * unlike with elements after current, here we don't need to
+ * pad elements, but rather just increase the size of previous
+ * element, copy the old header and set up the trailer.
+ */
+ void *trailer = RTE_PTR_ADD(prev,
+ prev->size - MALLOC_ELEM_TRAILER_LEN);
+
+ memcpy(hide_start, elem, sizeof(*elem));
+ hide_start->size = len;
+
+ prev->size += len_before;
+ set_trailer(prev);
+
+ /* update pointers */
+ prev->next = hide_start;
+ if (next)
+ next->prev = hide_start;
+
+ elem = hide_start;
+
+ /* erase old trailer */
+ memset(trailer, 0, MALLOC_ELEM_TRAILER_LEN);
+ /* erase old header */
+ memset(elem, 0, sizeof(*elem));
+ }
+
+ remove_elem(elem);
+}
+
/*
* attempt to resize a malloc_elem by expanding into any free space
* immediately after it in memory.
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 620dd44..8f4aef8 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -154,6 +154,9 @@ int
malloc_elem_resize(struct malloc_elem *elem, size_t size);

void
+malloc_elem_hide_region(struct malloc_elem *elem, void *start, size_t len);
+
+void
malloc_elem_free_list_remove(struct malloc_elem *elem);

/*
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index d798675..5f8c643 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -20,8 +20,10 @@
#include <rte_spinlock.h>
#include <rte_memcpy.h>
#include <rte_atomic.h>
+#include <rte_fbarray.h>

#include "eal_internal_cfg.h"
+#include "eal_memalloc.h"
#include "malloc_elem.h"
#include "malloc_heap.h"

@@ -149,48 +151,371 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
* scan fails. Once the new memseg is added, it re-scans and should return
* the new element after releasing the lock.
*/
-void *
-malloc_heap_alloc(struct malloc_heap *heap,
- const char *type __attribute__((unused)), size_t size, unsigned flags,
- size_t align, size_t bound, bool contig)
+static void *
+heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,
+ unsigned int flags, size_t align, size_t bound, bool contig)
{
struct malloc_elem *elem;

size = RTE_CACHE_LINE_ROUNDUP(size);
align = RTE_CACHE_LINE_ROUNDUP(align);

- rte_spinlock_lock(&heap->lock);
-
elem = find_suitable_element(heap, size, flags, align, bound, contig);
if (elem != NULL) {
elem = malloc_elem_alloc(elem, size, align, bound, contig);
+
/* increase heap's count of allocated elements */
heap->alloc_count++;
}
- rte_spinlock_unlock(&heap->lock);

return elem == NULL ? NULL : (void *)(&elem[1]);
}

+static int
+try_expand_heap(struct malloc_heap *heap, size_t pg_sz, size_t elt_size,
+ int socket, unsigned int flags, size_t align, size_t bound,
+ bool contig)
+{
+ size_t map_len;
+ struct rte_memseg_list *msl;
+ struct rte_memseg **ms;
+ struct malloc_elem *elem;
+ int n_segs, allocd_pages;
+ void *ret, *map_addr;
+
+ align = RTE_MAX(align, MALLOC_ELEM_HEADER_LEN);
+ map_len = RTE_ALIGN_CEIL(align + elt_size + MALLOC_ELEM_TRAILER_LEN,
+ pg_sz);
+
+ n_segs = map_len / pg_sz;
+
+ /* we can't know in advance how many pages we'll need, so malloc */
+ ms = malloc(sizeof(*ms) * n_segs);
+
+ allocd_pages = eal_memalloc_alloc_seg_bulk(ms, n_segs, pg_sz,
+ socket, true);
+
+ /* make sure we've allocated our pages... */
+ if (allocd_pages < 0)
+ goto free_ms;
+
+ map_addr = ms[0]->addr;
+ msl = rte_mem_virt2memseg_list(map_addr);
+
+ /* check if we wanted contiguous memory but didn't get it */
+ if (contig && !eal_memalloc_is_contig(msl, map_addr, map_len)) {
+ RTE_LOG(DEBUG, EAL, "%s(): couldn't allocate physically contiguous space\n",
+ __func__);
+ goto free_pages;
+ }
+
+ /* add newly minted memsegs to malloc heap */
+ elem = malloc_heap_add_memory(heap, msl, map_addr, map_len);
+
+ /* try once more, as now we have allocated new memory */
+ ret = find_suitable_element(heap, elt_size, flags, align, bound,
+ contig);
+
+ if (ret == NULL)
+ goto free_elem;
+
+ RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
+ socket, map_len >> 20ULL);
+
+ free(ms);
+
+ return 0;
+
+free_elem:
+ malloc_elem_free_list_remove(elem);
+ malloc_elem_hide_region(elem, map_addr, map_len);
+ heap->total_size -= map_len;
+
+free_pages:
+ eal_memalloc_free_seg_bulk(ms, n_segs);
+free_ms:
+ free(ms);
+
+ return -1;
+}
+
+static int
+compare_pagesz(const void *a, const void *b)
+{
+ const struct rte_memseg_list * const*mpa = a;
+ const struct rte_memseg_list * const*mpb = b;
+ const struct rte_memseg_list *msla = *mpa;
+ const struct rte_memseg_list *mslb = *mpb;
+ uint64_t pg_sz_a = msla->page_sz;
+ uint64_t pg_sz_b = mslb->page_sz;
+
+ if (pg_sz_a < pg_sz_b)
+ return -1;
+ if (pg_sz_a > pg_sz_b)
+ return 1;
+ return 0;
+}
+
+static int
+alloc_mem_on_socket(size_t size, int socket, unsigned int flags, size_t align,
+ size_t bound, bool contig)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
+ struct rte_memseg_list *requested_msls[RTE_MAX_MEMSEG_LISTS];
+ struct rte_memseg_list *other_msls[RTE_MAX_MEMSEG_LISTS];
+ uint64_t requested_pg_sz[RTE_MAX_MEMSEG_LISTS];
+ uint64_t other_pg_sz[RTE_MAX_MEMSEG_LISTS];
+ uint64_t prev_pg_sz;
+ int i, n_other_msls, n_other_pg_sz, n_requested_msls, n_requested_pg_sz;
+ bool size_hint = (flags & RTE_MEMZONE_SIZE_HINT_ONLY) > 0;
+ unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY;
+ void *ret;
+
+ memset(requested_msls, 0, sizeof(requested_msls));
+ memset(other_msls, 0, sizeof(other_msls));
+ memset(requested_pg_sz, 0, sizeof(requested_pg_sz));
+ memset(other_pg_sz, 0, sizeof(other_pg_sz));
+
+ /*
+ * go through memseg list and take note of all the page sizes available,
+ * and if any of them were specifically requested by the user.
+ */
+ n_requested_msls = 0;
+ n_other_msls = 0;
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+
+ if (msl->socket_id != socket)
+ continue;
+
+ if (msl->base_va == NULL)
+ continue;
+
+ /* if pages of specific size were requested */
+ if (size_flags != 0 && check_hugepage_sz(size_flags,
+ msl->page_sz))
+ requested_msls[n_requested_msls++] = msl;
+ else if (size_flags == 0 || size_hint)
+ other_msls[n_other_msls++] = msl;
+ }
+
+ /* sort the lists, smallest first */
+ qsort(requested_msls, n_requested_msls, sizeof(requested_msls[0]),
+ compare_pagesz);
+ qsort(other_msls, n_other_msls, sizeof(other_msls[0]),
+ compare_pagesz);
+
+ /* now, extract page sizes we are supposed to try */
+ prev_pg_sz = 0;
+ n_requested_pg_sz = 0;
+ for (i = 0; i < n_requested_msls; i++) {
+ uint64_t pg_sz = requested_msls[i]->page_sz;
+
+ if (prev_pg_sz != pg_sz) {
+ requested_pg_sz[n_requested_pg_sz++] = pg_sz;
+ prev_pg_sz = pg_sz;
+ }
+ }
+ prev_pg_sz = 0;
+ n_other_pg_sz = 0;
+ for (i = 0; i < n_other_msls; i++) {
+ uint64_t pg_sz = other_msls[i]->page_sz;
+
+ if (prev_pg_sz != pg_sz) {
+ other_pg_sz[n_other_pg_sz++] = pg_sz;
+ prev_pg_sz = pg_sz;
+ }
+ }
+
+ /* finally, try allocating memory of specified page sizes, starting from
+ * the smallest sizes
+ */
+ for (i = 0; i < n_requested_pg_sz; i++) {
+ uint64_t pg_sz = requested_pg_sz[i];
+
+ /*
+ * do not pass the size hint here, as user expects other page
+ * sizes first, before resorting to best effort allocation.
+ */
+ if (!try_expand_heap(heap, pg_sz, size, socket, size_flags,
+ align, bound, contig))
+ return 0;
+ }
+ if (n_other_pg_sz == 0)
+ return -1;
+
+ /* now, check if we can reserve anything with size hint */
+ ret = find_suitable_element(heap, size, flags, align, bound, contig);
+ if (ret != NULL)
+ return 0;
+
+ /*
+ * we still couldn't reserve memory, so try expanding heap with other
+ * page sizes, if there are any
+ */
+ for (i = 0; i < n_other_pg_sz; i++) {
+ uint64_t pg_sz = other_pg_sz[i];
+
+ if (!try_expand_heap(heap, pg_sz, size, socket, flags,
+ align, bound, contig))
+ return 0;
+ }
+ return -1;
+}
+
+/* this will try lower page sizes first */
+static void *
+heap_alloc_on_socket(const char *type, size_t size, int socket,
+ unsigned int flags, size_t align, size_t bound, bool contig)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
+ unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY;
+ void *ret;
+
+ rte_spinlock_lock(&(heap->lock));
+
+ align = align == 0 ? 1 : align;
+
+ /* for legacy mode, try once and with all flags */
+ if (internal_config.legacy_mem) {
+ ret = heap_alloc(heap, type, size, flags, align, bound, contig);
+ goto alloc_unlock;
+ }
+
+ /*
+ * we do not pass the size hint here, because even if allocation fails,
+ * we may still be able to allocate memory from appropriate page sizes,
+ * we just need to request more memory first.
+ */
+ ret = heap_alloc(heap, type, size, size_flags, align, bound, contig);
+ if (ret != NULL)
+ goto alloc_unlock;
+
+ if (!alloc_mem_on_socket(size, socket, flags, align, bound, contig)) {
+ ret = heap_alloc(heap, type, size, flags, align, bound, contig);
+
+ /* this should have succeeded */
+ if (ret == NULL)
+ RTE_LOG(ERR, EAL, "Error allocating from heap\n");
+ }
+alloc_unlock:
+ rte_spinlock_unlock(&(heap->lock));
+ return ret;
+}
+
+void *
+malloc_heap_alloc(const char *type, size_t size, int socket_arg,
+ unsigned int flags, size_t align, size_t bound, bool contig)
+{
+ int socket, i, cur_socket;
+ void *ret;
+
+ /* return NULL if size is 0 or alignment is not power-of-2 */
+ if (size == 0 || (align && !rte_is_power_of_2(align)))
+ return NULL;
+
+ if (!rte_eal_has_hugepages())
+ socket_arg = SOCKET_ID_ANY;
+
+ if (socket_arg == SOCKET_ID_ANY)
+ socket = malloc_get_numa_socket();
+ else
+ socket = socket_arg;
+
+ /* Check socket parameter */
+ if (socket >= RTE_MAX_NUMA_NODES)
+ return NULL;
+
+ ret = heap_alloc_on_socket(type, size, socket, flags, align, bound,
+ contig);
+ if (ret != NULL || socket_arg != SOCKET_ID_ANY)
+ return ret;
+
+ /* try other heaps */
+ for (i = 0; i < (int) rte_socket_count(); i++) {
+ cur_socket = rte_socket_id_by_idx(i);
+ if (cur_socket == socket)
+ continue;
+ ret = heap_alloc_on_socket(type, size, cur_socket, flags,
+ align, bound, contig);
+ if (ret != NULL)
+ return ret;
+ }
+ return NULL;
+}
+
int
malloc_heap_free(struct malloc_elem *elem)
{
struct malloc_heap *heap;
- struct malloc_elem *ret;
+ void *start, *aligned_start, *end, *aligned_end;
+ size_t len, aligned_len, page_sz;
+ struct rte_memseg_list *msl;
+ int n_segs, seg_idx, max_seg_idx, ret;

if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
return -1;

/* elem may be merged with previous element, so keep heap address */
heap = elem->heap;
+ msl = elem->msl;
+ page_sz = (size_t)msl->page_sz;

rte_spinlock_lock(&(heap->lock));

- ret = malloc_elem_free(elem);
+ /* mark element as free */
+ elem->state = ELEM_FREE;

- rte_spinlock_unlock(&(heap->lock));
+ elem = malloc_elem_free(elem);
+
+ /* anything after this is a bonus */
+ ret = 0;
+
+ /* ...of which we can't avail ourselves if we are in legacy mode */
+ if (internal_config.legacy_mem)
+ goto free_unlock;
+
+ /* check if we can free any memory back to the system */
+ if (elem->size < page_sz)
+ goto free_unlock;

- return ret != NULL ? 0 : -1;
+ /* probably, but let's make sure, as we may not be using up a full page */
+ start = elem;
+ len = elem->size;
+ aligned_start = RTE_PTR_ALIGN_CEIL(start, page_sz);
+ end = RTE_PTR_ADD(elem, len);
+ aligned_end = RTE_PTR_ALIGN_FLOOR(end, page_sz);
+
+ aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start);
+
+ /* can't free anything */
+ if (aligned_len < page_sz)
+ goto free_unlock;
+
+ malloc_elem_free_list_remove(elem);
+
+ malloc_elem_hide_region(elem, (void *) aligned_start, aligned_len);
+
+ /* we don't really care if we fail to deallocate memory */
+ n_segs = aligned_len / page_sz;
+ seg_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) / page_sz;
+ max_seg_idx = seg_idx + n_segs;
+
+ for (; seg_idx < max_seg_idx; seg_idx++) {
+ struct rte_memseg *ms;
+
+ ms = rte_fbarray_get(&msl->memseg_arr, seg_idx);
+ eal_memalloc_free_seg(ms);
+ }
+ heap->total_size -= aligned_len;
+
+ RTE_LOG(DEBUG, EAL, "Heap on socket %d was shrunk by %zdMB\n",
+ msl->socket_id, aligned_len >> 20ULL);
+free_unlock:
+ rte_spinlock_unlock(&(heap->lock));
+ return ret;
}

int
diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h
index c57b59a..03b8014 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -26,8 +26,8 @@ malloc_get_numa_socket(void)
}

void *
-malloc_heap_alloc(struct malloc_heap *heap, const char *type, size_t size,
- unsigned int flags, size_t align, size_t bound, bool contig);
+malloc_heap_alloc(const char *type, size_t size, int socket, unsigned int flags,
+ size_t align, size_t bound, bool contig);

int
malloc_heap_free(struct malloc_elem *elem);
diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index c6d3e57..b51a6d1 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -40,10 +40,6 @@ void *
rte_malloc_socket(const char *type, size_t size, unsigned int align,
int socket_arg)
{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- int socket, i;
- void *ret;
-
/* return NULL if size is 0 or alignment is not power-of-2 */
if (size == 0 || (align && !rte_is_power_of_2(align)))
return NULL;
@@ -51,33 +47,12 @@ rte_malloc_socket(const char *type, size_t size, unsigned int align,
if (!rte_eal_has_hugepages())
socket_arg = SOCKET_ID_ANY;

- if (socket_arg == SOCKET_ID_ANY)
- socket = malloc_get_numa_socket();
- else
- socket = socket_arg;
-
/* Check socket parameter */
- if (socket >= RTE_MAX_NUMA_NODES)
+ if (socket_arg >= RTE_MAX_NUMA_NODES)
return NULL;

- ret = malloc_heap_alloc(&mcfg->malloc_heaps[socket], type,
- size, 0, align == 0 ? 1 : align, 0, false);
- if (ret != NULL || socket_arg != SOCKET_ID_ANY)
- return ret;
-
- /* try other heaps */
- for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
- /* we already tried this one */
- if (i == socket)
- continue;
-
- ret = malloc_heap_alloc(&mcfg->malloc_heaps[i], type,
- size, 0, align == 0 ? 1 : align, 0, false);
- if (ret != NULL)
- return ret;
- }
-
- return NULL;
+ return malloc_heap_alloc(type, size, socket_arg, 0,
+ align == 0 ? 1 : align, 0, false);
}

/*
--
2.7.4
Anatoly Burakov
2018-04-03 23:22:16 UTC
Permalink
Enable callbacks on first device attach, disable callbacks
on last device detach.

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3:
- Moved callbacks to attach/detach as opposed to init

lib/librte_eal/linuxapp/eal/eal_vfio.c | 83 +++++++++++++++++++++++++++++++---
1 file changed, 77 insertions(+), 6 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 8b5f8fd..eb1a024 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -18,6 +18,8 @@

#ifdef VFIO_PRESENT

+#define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb"
+
/* per-process VFIO config */
static struct vfio_config vfio_cfg;

@@ -250,6 +252,38 @@ vfio_group_device_count(int vfio_group_fd)
return vfio_cfg.vfio_groups[i].devices;
}

+static void
+vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len)
+{
+ struct rte_memseg_list *msl;
+ struct rte_memseg *ms;
+ size_t cur_len = 0;
+
+ msl = rte_mem_virt2memseg_list(addr);
+
+ /* for IOVA as VA mode, no need to care for IOVA addresses */
+ if (rte_eal_iova_mode() == RTE_IOVA_VA) {
+ uint64_t vfio_va = (uint64_t)(uintptr_t)addr;
+ if (type == RTE_MEM_EVENT_ALLOC)
+ rte_vfio_dma_map(vfio_va, vfio_va, len);
+ else
+ rte_vfio_dma_unmap(vfio_va, vfio_va, len);
+ return;
+ }
+
+ /* memsegs are contiguous in memory */
+ ms = rte_mem_virt2memseg(addr, msl);
+ while (cur_len < len) {
+ if (type == RTE_MEM_EVENT_ALLOC)
+ rte_vfio_dma_map(ms->addr_64, ms->iova, ms->len);
+ else
+ rte_vfio_dma_unmap(ms->addr_64, ms->iova, ms->len);
+
+ cur_len += ms->len;
+ ++ms;
+ }
+}
+
int
rte_vfio_clear_group(int vfio_group_fd)
{
@@ -312,6 +346,8 @@ int
rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
int *vfio_dev_fd, struct vfio_device_info *device_info)
{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ rte_rwlock_t *mem_lock = &mcfg->memory_hotplug_lock;
struct vfio_group_status group_status = {
.argsz = sizeof(group_status)
};
@@ -399,6 +435,10 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
rte_vfio_clear_group(vfio_group_fd);
return -1;
}
+ /* lock memory hotplug before mapping and release it
+ * after registering callback, to prevent races
+ */
+ rte_rwlock_read_lock(mem_lock);
ret = t->dma_map_func(vfio_cfg.vfio_container_fd);
if (ret) {
RTE_LOG(ERR, EAL,
@@ -406,10 +446,17 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
dev_addr, errno, strerror(errno));
close(vfio_group_fd);
rte_vfio_clear_group(vfio_group_fd);
+ rte_rwlock_read_unlock(mem_lock);
return -1;
}

vfio_cfg.vfio_iommu_type = t;
+
+ /* register callback for mem events */
+ rte_mem_event_callback_register(VFIO_MEM_EVENT_CLB_NAME,
+ vfio_mem_event_callback);
+ /* unlock memory hotplug */
+ rte_rwlock_read_unlock(mem_lock);
}
}

@@ -447,6 +494,8 @@ int
rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
int vfio_dev_fd)
{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ rte_rwlock_t *mem_lock = &mcfg->memory_hotplug_lock;
struct vfio_group_status group_status = {
.argsz = sizeof(group_status)
};
@@ -454,13 +503,20 @@ rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
int iommu_group_no;
int ret;

+ /* we don't want any DMA mapping messages to come while we're detaching
+ * VFIO device, because this might be the last device and we might need
+ * to unregister the callback.
+ */
+ rte_rwlock_read_lock(mem_lock);
+
/* get group number */
ret = vfio_get_group_no(sysfs_base, dev_addr, &iommu_group_no);
if (ret <= 0) {
RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver\n",
dev_addr);
/* This is an error at this point. */
- return -1;
+ ret = -1;
+ goto out;
}

/* get the actual group fd */
@@ -468,7 +524,8 @@ rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
if (vfio_group_fd <= 0) {
RTE_LOG(INFO, EAL, "vfio_get_group_fd failed for %s\n",
dev_addr);
- return -1;
+ ret = -1;
+ goto out;
}

/* At this point we got an active group. Closing it will make the
@@ -480,7 +537,8 @@ rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
if (close(vfio_dev_fd) < 0) {
RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n",
dev_addr);
- return -1;
+ ret = -1;
+ goto out;
}

/* An VFIO group can have several devices attached. Just when there is
@@ -492,17 +550,30 @@ rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
if (close(vfio_group_fd) < 0) {
RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n",
dev_addr);
- return -1;
+ ret = -1;
+ goto out;
}

if (rte_vfio_clear_group(vfio_group_fd) < 0) {
RTE_LOG(INFO, EAL, "Error when clearing group for %s\n",
dev_addr);
- return -1;
+ ret = -1;
+ goto out;
}
}

- return 0;
+ /* if there are no active device groups, unregister the callback to
+ * avoid spurious attempts to map/unmap memory from VFIO.
+ */
+ if (vfio_cfg.vfio_active_groups == 0)
+ rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME);
+
+ /* success */
+ ret = 0;
+
+out:
+ rte_rwlock_read_unlock(mem_lock);
+ return ret;
}

int
--
2.7.4
Anatoly Burakov
2018-04-03 23:22:02 UTC
Permalink
The fbarray API is there, so we might as well use it to track
memzones. Some operations will be sped up by that.

Since we have to allocate an fbarray for memzones, we have to do
it before we initialize the memory subsystem, because memory init
in secondary processes will (later) allocate more fbarrays than the
primary process did, which would make it impossible to attach to
the memzone fbarray if we created it after the fact.

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3:
- Moved earlier in patchset
- Fixed compiled issues
- Removed rte_panic() calls

drivers/net/ena/ena_ethdev.c | 10 +-
lib/librte_eal/bsdapp/eal/eal.c | 14 ++-
lib/librte_eal/common/eal_common_memzone.c | 109 ++++++++++++----------
lib/librte_eal/common/include/rte_eal_memconfig.h | 4 +-
lib/librte_eal/common/malloc_heap.c | 4 +
lib/librte_eal/linuxapp/eal/eal.c | 13 ++-
test/test/test_memzone.c | 9 +-
7 files changed, 99 insertions(+), 64 deletions(-)

diff --git a/drivers/net/ena/ena_ethdev.c b/drivers/net/ena/ena_ethdev.c
index 34b2a8d..f7bfc7a 100644
--- a/drivers/net/ena/ena_ethdev.c
+++ b/drivers/net/ena/ena_ethdev.c
@@ -264,11 +264,15 @@ static const struct eth_dev_ops ena_dev_ops = {
static inline int ena_cpu_to_node(int cpu)
{
struct rte_config *config = rte_eal_get_configuration();
+ struct rte_fbarray *arr = &config->mem_config->memzones;
+ const struct rte_memzone *mz;

- if (likely(cpu < RTE_MAX_MEMZONE))
- return config->mem_config->memzone[cpu].socket_id;
+ if (unlikely(cpu >= RTE_MAX_MEMZONE))
+ return NUMA_NO_NODE;

- return NUMA_NO_NODE;
+ mz = rte_fbarray_get(arr, cpu);
+
+ return mz->socket_id;
}

static inline void ena_rx_mbuf_prepare(struct rte_mbuf *mbuf,
diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index d009cf0..54330e1 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -599,14 +599,24 @@ rte_eal_init(int argc, char **argv)
}
}

+ /* in secondary processes, memory init may allocate additional fbarrays
+ * not present in primary processes, so to avoid any potential issues,
+ * initialize memzones first.
+ */
+ if (rte_eal_memzone_init() < 0) {
+ rte_eal_init_alert("Cannot init memzone\n");
+ rte_errno = ENODEV;
+ return -1;
+ }
+
if (rte_eal_memory_init() < 0) {
rte_eal_init_alert("Cannot init memory\n");
rte_errno = ENOMEM;
return -1;
}

- if (rte_eal_memzone_init() < 0) {
- rte_eal_init_alert("Cannot init memzone\n");
+ if (rte_eal_malloc_heap_init() < 0) {
+ rte_eal_init_alert("Cannot init malloc heap\n");
rte_errno = ENODEV;
return -1;
}
diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index 529b36f..aed9331 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -28,42 +28,31 @@
static inline const struct rte_memzone *
memzone_lookup_thread_unsafe(const char *name)
{
- const struct rte_mem_config *mcfg;
+ struct rte_mem_config *mcfg;
+ struct rte_fbarray *arr;
const struct rte_memzone *mz;
- unsigned i = 0;
+ int i = 0;

/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
+ arr = &mcfg->memzones;

/*
* the algorithm is not optimal (linear), but there are few
* zones and this function should be called at init only
*/
- for (i = 0; i < RTE_MAX_MEMZONE; i++) {
- mz = &mcfg->memzone[i];
- if (mz->addr != NULL && !strncmp(name, mz->name, RTE_MEMZONE_NAMESIZE))
- return &mcfg->memzone[i];
+ i = rte_fbarray_find_next_used(arr, 0);
+ while (i >= 0) {
+ mz = rte_fbarray_get(arr, i);
+ if (mz->addr != NULL &&
+ !strncmp(name, mz->name, RTE_MEMZONE_NAMESIZE))
+ return mz;
+ i = rte_fbarray_find_next_used(arr, i + 1);
}

return NULL;
}

-static inline struct rte_memzone *
-get_next_free_memzone(void)
-{
- struct rte_mem_config *mcfg;
- unsigned i = 0;
-
- /* get pointer to global configuration */
- mcfg = rte_eal_get_configuration()->mem_config;
-
- for (i = 0; i < RTE_MAX_MEMZONE; i++) {
- if (mcfg->memzone[i].addr == NULL)
- return &mcfg->memzone[i];
- }
-
- return NULL;
-}

/* This function will return the greatest free block if a heap has been
* specified. If no heap has been specified, it will return the heap and
@@ -103,14 +92,16 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
{
struct rte_memzone *mz;
struct rte_mem_config *mcfg;
+ struct rte_fbarray *arr;
size_t requested_len;
- int socket, i;
+ int socket, i, mz_idx;

/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
+ arr = &mcfg->memzones;

/* no more room in config */
- if (mcfg->memzone_cnt >= RTE_MAX_MEMZONE) {
+ if (arr->count >= arr->len) {
RTE_LOG(ERR, EAL, "%s(): No more room in config\n", __func__);
rte_errno = ENOSPC;
return NULL;
@@ -219,7 +210,14 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
struct malloc_elem *elem = malloc_elem_from_data(mz_addr);

/* fill the zone in config */
- mz = get_next_free_memzone();
+ mz_idx = rte_fbarray_find_next_free(arr, 0);
+
+ if (mz_idx < 0) {
+ mz = NULL;
+ } else {
+ rte_fbarray_set_used(arr, mz_idx);
+ mz = rte_fbarray_get(arr, mz_idx);
+ }

if (mz == NULL) {
RTE_LOG(ERR, EAL, "%s(): Cannot find free memzone but there is room "
@@ -229,7 +227,6 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
return NULL;
}

- mcfg->memzone_cnt++;
snprintf(mz->name, sizeof(mz->name), "%s", name);
mz->iova = rte_malloc_virt2iova(mz_addr);
mz->addr = mz_addr;
@@ -342,34 +339,38 @@ int
rte_memzone_free(const struct rte_memzone *mz)
{
struct rte_mem_config *mcfg;
+ struct rte_fbarray *arr;
+ struct rte_memzone *found_mz;
int ret = 0;
- void *addr;
+ void *addr = NULL;
unsigned idx;

if (mz == NULL)
return -EINVAL;

mcfg = rte_eal_get_configuration()->mem_config;
+ arr = &mcfg->memzones;

rte_rwlock_write_lock(&mcfg->mlock);

- idx = ((uintptr_t)mz - (uintptr_t)mcfg->memzone);
- idx = idx / sizeof(struct rte_memzone);
+ idx = rte_fbarray_find_idx(arr, mz);
+ found_mz = rte_fbarray_get(arr, idx);

- addr = mcfg->memzone[idx].addr;
- if (addr == NULL)
+ if (found_mz == NULL) {
+ ret = -EINVAL;
+ } else if (found_mz->addr == NULL) {
+ RTE_LOG(ERR, EAL, "Memzone is not allocated\n");
ret = -EINVAL;
- else if (mcfg->memzone_cnt == 0) {
- rte_panic("%s(): memzone address not NULL but memzone_cnt is 0!\n",
- __func__);
} else {
- memset(&mcfg->memzone[idx], 0, sizeof(mcfg->memzone[idx]));
- mcfg->memzone_cnt--;
+ addr = found_mz->addr;
+ memset(found_mz, 0, sizeof(*found_mz));
+ rte_fbarray_set_free(arr, idx);
}

rte_rwlock_write_unlock(&mcfg->mlock);

- rte_free(addr);
+ if (addr != NULL)
+ rte_free(addr);

return ret;
}
@@ -405,7 +406,7 @@ dump_memzone(const struct rte_memzone *mz, void *arg)
size_t page_sz;
FILE *f = arg;

- mz_idx = mz - mcfg->memzone;
+ mz_idx = rte_fbarray_find_idx(&mcfg->memzones, mz);

fprintf(f, "Zone %u: name:<%s>, len:0x%zx, virt:%p, "
"socket_id:%"PRId32", flags:%"PRIx32"\n",
@@ -462,19 +463,23 @@ rte_eal_memzone_init(void)
/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;

- /* secondary processes don't need to initialise anything */
- if (rte_eal_process_type() == RTE_PROC_SECONDARY)
- return 0;
-
rte_rwlock_write_lock(&mcfg->mlock);

- /* delete all zones */
- mcfg->memzone_cnt = 0;
- memset(mcfg->memzone, 0, sizeof(mcfg->memzone));
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
+ rte_fbarray_init(&mcfg->memzones, "memzone",
+ RTE_MAX_MEMZONE, sizeof(struct rte_memzone))) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memzone list\n");
+ return -1;
+ } else if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
+ rte_fbarray_attach(&mcfg->memzones)) {
+ RTE_LOG(ERR, EAL, "Cannot attach to memzone list\n");
+ rte_rwlock_write_unlock(&mcfg->mlock);
+ return -1;
+ }

rte_rwlock_write_unlock(&mcfg->mlock);

- return rte_eal_malloc_heap_init();
+ return 0;
}

/* Walk all reserved memory zones */
@@ -482,14 +487,18 @@ void rte_memzone_walk(void (*func)(const struct rte_memzone *, void *),
void *arg)
{
struct rte_mem_config *mcfg;
- unsigned i;
+ struct rte_fbarray *arr;
+ int i;

mcfg = rte_eal_get_configuration()->mem_config;
+ arr = &mcfg->memzones;

rte_rwlock_read_lock(&mcfg->mlock);
- for (i=0; i<RTE_MAX_MEMZONE; i++) {
- if (mcfg->memzone[i].addr != NULL)
- (*func)(&mcfg->memzone[i], arg);
+ i = rte_fbarray_find_next_used(arr, 0);
+ while (i >= 0) {
+ struct rte_memzone *mz = rte_fbarray_get(arr, i);
+ (*func)(mz, arg);
+ i = rte_fbarray_find_next_used(arr, i + 1);
}
rte_rwlock_read_unlock(&mcfg->mlock);
}
diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
index b745e18..88cde8c 100644
--- a/lib/librte_eal/common/include/rte_eal_memconfig.h
+++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
@@ -58,10 +58,8 @@ struct rte_mem_config {
rte_rwlock_t qlock; /**< used for tailq operation for thread safe. */
rte_rwlock_t mplock; /**< only used by mempool LIB for thread-safe. */

- uint32_t memzone_cnt; /**< Number of allocated memzones */
-
/* memory segments and zones */
- struct rte_memzone memzone[RTE_MAX_MEMZONE]; /**< Memzone descriptors. */
+ struct rte_fbarray memzones; /**< Memzone descriptors. */

struct rte_memseg_list memsegs[RTE_MAX_MEMSEG_LISTS];
/**< list of dynamic arrays holding memsegs */
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 0ef2c45..d798675 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -278,6 +278,10 @@ rte_eal_malloc_heap_init(void)
if (mcfg == NULL)
return -1;

+ /* secondary process does not need to initialize anything */
+ if (rte_eal_process_type() != RTE_PROC_PRIMARY)
+ return 0;
+
/* add all IOVA-contiguous areas to the heap */
return rte_memseg_contig_walk(malloc_add_seg, NULL);
}
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index ffcbd71..9832551 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -858,6 +858,15 @@ rte_eal_init(int argc, char **argv)
return -1;
}
#endif
+ /* in secondary processes, memory init may allocate additional fbarrays
+ * not present in primary processes, so to avoid any potential issues,
+ * initialize memzones first.
+ */
+ if (rte_eal_memzone_init() < 0) {
+ rte_eal_init_alert("Cannot init memzone\n");
+ rte_errno = ENODEV;
+ return -1;
+ }

if (rte_eal_memory_init() < 0) {
rte_eal_init_alert("Cannot init memory\n");
@@ -868,8 +877,8 @@ rte_eal_init(int argc, char **argv)
/* the directories are locked during eal_hugepage_info_init */
eal_hugedirs_unlock();

- if (rte_eal_memzone_init() < 0) {
- rte_eal_init_alert("Cannot init memzone\n");
+ if (rte_eal_malloc_heap_init() < 0) {
+ rte_eal_init_alert("Cannot init malloc heap\n");
rte_errno = ENODEV;
return -1;
}
diff --git a/test/test/test_memzone.c b/test/test/test_memzone.c
index 0046f04..efcf732 100644
--- a/test/test/test_memzone.c
+++ b/test/test/test_memzone.c
@@ -909,7 +909,7 @@ test_memzone_basic(void)
const struct rte_memzone *mz;
int memzone_cnt_after, memzone_cnt_expected;
int memzone_cnt_before =
- rte_eal_get_configuration()->mem_config->memzone_cnt;
+ rte_eal_get_configuration()->mem_config->memzones.count;

memzone1 = rte_memzone_reserve(TEST_MEMZONE_NAME("testzone1"), 100,
SOCKET_ID_ANY, 0);
@@ -933,7 +933,7 @@ test_memzone_basic(void)
(memzone3 != NULL) + (memzone4 != NULL);

memzone_cnt_after =
- rte_eal_get_configuration()->mem_config->memzone_cnt;
+ rte_eal_get_configuration()->mem_config->memzones.count;

if (memzone_cnt_after != memzone_cnt_expected)
return -1;
@@ -1012,7 +1012,7 @@ test_memzone_basic(void)
}

memzone_cnt_after =
- rte_eal_get_configuration()->mem_config->memzone_cnt;
+ rte_eal_get_configuration()->mem_config->memzones.count;
if (memzone_cnt_after != memzone_cnt_before)
return -1;

@@ -1033,7 +1033,8 @@ static int
test_memzone(void)
{
/* take note of how many memzones were allocated before running */
- int memzone_cnt = rte_eal_get_configuration()->mem_config->memzone_cnt;
+ int memzone_cnt =
+ rte_eal_get_configuration()->mem_config->memzones.count;

printf("test basic memzone API\n");
if (test_memzone_basic() < 0)
--
2.7.4
Shreyansh Jain
2018-04-05 14:23:08 UTC
Post by Anatoly Burakov
It's there, so we might as well use it. Some operations will be
sped up by that.
Since we have to allocate an fbarray for memzones, we have to do
it before we initialize memory subsystem, because that, in
secondary processes, will (later) allocate more fbarrays than the
primary process, which will result in inability to attach to
memzone fbarray if we do it after the fact.
---
- Moved earlier in patchset
- Fixed compile issues
- Removed rte_panic() calls
[...]
Post by Anatoly Burakov
diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index 529b36f..aed9331 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -28,42 +28,31 @@
static inline const struct rte_memzone *
memzone_lookup_thread_unsafe(const char *name)
{
- const struct rte_mem_config *mcfg;
+ struct rte_mem_config *mcfg;
+ struct rte_fbarray *arr;
const struct rte_memzone *mz;
- unsigned i = 0;
+ int i = 0;
/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
+ arr = &mcfg->memzones;
/*
* the algorithm is not optimal (linear), but there are few
* zones and this function should be called at init only
*/
- for (i = 0; i < RTE_MAX_MEMZONE; i++) {
- mz = &mcfg->memzone[i];
- if (mz->addr != NULL && !strncmp(name, mz->name, RTE_MEMZONE_NAMESIZE))
- return &mcfg->memzone[i];
+ i = rte_fbarray_find_next_used(arr, 0);
+ while (i >= 0) {
+ mz = rte_fbarray_get(arr, i++);
^^^^^^^^
As discussed offline, this needs to be changed: the double
increment of 'i' skips entries during the lookup.
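
A minimal sketch of the loop without the double increment could
look like this (illustrative only, not necessarily the final fix):

    i = rte_fbarray_find_next_used(arr, 0);
    while (i >= 0) {
            mz = rte_fbarray_get(arr, i);
            if (mz->addr != NULL &&
                            !strncmp(name, mz->name, RTE_MEMZONE_NAMESIZE))
                    return mz;
            i = rte_fbarray_find_next_used(arr, i + 1);
    }
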
Post by Anatoly Burakov
+ if (mz->addr != NULL &&
+ !strncmp(name, mz->name, RTE_MEMZONE_NAMESIZE))
+ return mz;
+ i = rte_fbarray_find_next_used(arr, i + 1);
}
return NULL;
}
[..]

-
Shreyansh
Anatoly Burakov
2018-04-03 23:22:06 UTC
For now, memory is always contiguous because legacy mem mode is
enabled unconditionally, but this function will be helpful down
the line when we implement support for allocating physically
non-contiguous memory. We can no longer guarantee physically
contiguous memory unless we're in legacy or IOVA_AS_VA mode, but
we can certainly try and see if we succeed.

In addition, this would be useful for e.g. PMDs that may allocate
chunks smaller than the page size which must not cross a page
boundary; in that case, we will be able to accommodate the request.
This function will also support non-hugepage memory.
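
The page-boundary case above reduces to a simple check; roughly the
same early-exit test the new function performs (a sketch under the
assumption that start/len describe the allocation, not an excerpt):

    /* needs <stdbool.h> and rte_common.h for the RTE_PTR_* helpers */
    /* does [start, start + len) fit within a single page of size pgsz? */
    static bool
    fits_in_one_page(void *start, size_t len, size_t pgsz)
    {
            void *end = RTE_PTR_ADD(start, len);
            void *pg_start = RTE_PTR_ALIGN_FLOOR(start, pgsz);
            void *pg_end = RTE_PTR_ALIGN_CEIL(end, pgsz);

            /* a single aligned page cannot cross a page boundary */
            return RTE_PTR_DIFF(pg_end, pg_start) == pgsz;
    }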

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3:
- Moved this earlier in the patchset
- Add support for non-hugepage memory
- Support non-page-sized segments
- Fix handling of IOVA as VA mode

lib/librte_eal/bsdapp/eal/Makefile | 1 +
lib/librte_eal/common/eal_common_memalloc.c | 90 +++++++++++++++++++++++++++++
lib/librte_eal/common/eal_memalloc.h | 10 ++++
lib/librte_eal/common/malloc_elem.c | 40 +------------
lib/librte_eal/common/meson.build | 1 +
lib/librte_eal/linuxapp/eal/Makefile | 1 +
6 files changed, 106 insertions(+), 37 deletions(-)
create mode 100644 lib/librte_eal/common/eal_common_memalloc.c

diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile
index 19f9322..907e30d 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -41,6 +41,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_timer.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_memzone.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_log.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_launch.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_memalloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_memory.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_tailqs.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_errno.c
diff --git a/lib/librte_eal/common/eal_common_memalloc.c b/lib/librte_eal/common/eal_common_memalloc.c
new file mode 100644
index 0000000..607ec3f
--- /dev/null
+++ b/lib/librte_eal/common/eal_common_memalloc.c
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#include <rte_lcore.h>
+#include <rte_fbarray.h>
+#include <rte_memzone.h>
+#include <rte_memory.h>
+#include <rte_eal_memconfig.h>
+
+#include "eal_private.h"
+#include "eal_internal_cfg.h"
+#include "eal_memalloc.h"
+
+bool
+eal_memalloc_is_contig(const struct rte_memseg_list *msl, void *start,
+ size_t len)
+{
+ void *end, *aligned_start, *aligned_end;
+ size_t pgsz = (size_t)msl->page_sz;
+ const struct rte_memseg *ms;
+
+ /* for IOVA_VA, it's always contiguous */
+ if (rte_eal_iova_mode() == RTE_IOVA_VA)
+ return true;
+
+ /* for legacy memory, it's always contiguous */
+ if (internal_config.legacy_mem)
+ return true;
+
+ end = RTE_PTR_ADD(start, len);
+
+ /* for nohuge, we check pagemap, otherwise check memseg */
+ if (!rte_eal_has_hugepages()) {
+ rte_iova_t cur, expected;
+
+ aligned_start = RTE_PTR_ALIGN_FLOOR(start, pgsz);
+ aligned_end = RTE_PTR_ALIGN_CEIL(end, pgsz);
+
+ /* if start and end are on the same page, bail out early */
+ if (RTE_PTR_DIFF(aligned_end, aligned_start) == pgsz)
+ return true;
+
+ /* skip first iteration */
+ cur = rte_mem_virt2iova(aligned_start);
+ expected = cur + pgsz;
+ aligned_start = RTE_PTR_ADD(aligned_start, pgsz);
+
+ while (aligned_start < aligned_end) {
+ cur = rte_mem_virt2iova(aligned_start);
+ if (cur != expected)
+ return false;
+ aligned_start = RTE_PTR_ADD(aligned_start, pgsz);
+ expected += pgsz;
+ }
+ } else {
+ int start_seg, end_seg, cur_seg;
+ rte_iova_t cur, expected;
+
+ aligned_start = RTE_PTR_ALIGN_FLOOR(start, pgsz);
+ aligned_end = RTE_PTR_ALIGN_CEIL(end, pgsz);
+
+ start_seg = RTE_PTR_DIFF(aligned_start, msl->base_va) /
+ pgsz;
+ end_seg = RTE_PTR_DIFF(aligned_end, msl->base_va) /
+ pgsz;
+
+ /* if start and end are on the same page, bail out early */
+ if (RTE_PTR_DIFF(aligned_end, aligned_start) == pgsz)
+ return true;
+
+ /* skip first iteration */
+ ms = rte_fbarray_get(&msl->memseg_arr, start_seg);
+ cur = ms->iova;
+ expected = cur + pgsz;
+
+ /* if we can't access IOVA addresses, assume non-contiguous */
+ if (cur == RTE_BAD_IOVA)
+ return false;
+
+ for (cur_seg = start_seg + 1; cur_seg < end_seg;
+ cur_seg++, expected += pgsz) {
+ ms = rte_fbarray_get(&msl->memseg_arr, cur_seg);
+
+ if (ms->iova != expected)
+ return false;
+ }
+ }
+ return true;
+}
diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
index 6017345..2413c6c 100644
--- a/lib/librte_eal/common/eal_memalloc.h
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -8,6 +8,7 @@
#include <stdbool.h>

#include <rte_memory.h>
+#include <rte_eal_memconfig.h>

/*
* Allocate segment of specified page size.
@@ -42,4 +43,13 @@ eal_memalloc_free_seg(struct rte_memseg *ms);
int
eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs);

+
+/*
+ * Check if memory pointed to by `start` and of `length` that resides in
+ * memseg list `msl` is IOVA-contiguous.
+ */
+bool
+eal_memalloc_is_contig(const struct rte_memseg_list *msl, void *start,
+ size_t len);
+
#endif // EAL_MEMALLOC_H
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 685aac4..9db416f 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -18,6 +18,7 @@
#include <rte_common.h>
#include <rte_spinlock.h>

+#include "eal_memalloc.h"
#include "malloc_elem.h"
#include "malloc_heap.h"

@@ -100,45 +101,10 @@ malloc_elem_insert(struct malloc_elem *elem)
* so we just check the page addresses.
*/
static bool
-elem_check_phys_contig(const struct rte_memseg_list *msl __rte_unused,
+elem_check_phys_contig(const struct rte_memseg_list *msl,
void *start, size_t size)
{
- rte_iova_t cur, expected;
- void *start_page, *end_page, *cur_page;
- size_t pagesz;
-
- /* for hugepage memory or IOVA as VA, it's always contiguous */
- if (rte_eal_has_hugepages() || rte_eal_iova_mode() == RTE_IOVA_VA)
- return true;
-
- /* otherwise, check if start and end are within the same page */
- pagesz = getpagesize();
-
- start_page = RTE_PTR_ALIGN_FLOOR(start, pagesz);
- end_page = RTE_PTR_ALIGN_FLOOR(RTE_PTR_ADD(start, size - 1), pagesz);
-
- if (start_page == end_page)
- return true;
-
- /* if they are from different pages, check if they are contiguous */
-
- /* if we can't access physical addresses, assume non-contiguous */
- if (!rte_eal_using_phys_addrs())
- return false;
-
- /* skip first iteration */
- cur = rte_mem_virt2iova(start_page);
- expected = cur + pagesz;
- cur_page = RTE_PTR_ADD(start_page, pagesz);
-
- while (cur_page <= end_page) {
- cur = rte_mem_virt2iova(cur_page);
- if (cur != expected)
- return false;
- cur_page = RTE_PTR_ADD(cur_page, pagesz);
- expected += pagesz;
- }
- return true;
+ return eal_memalloc_is_contig(msl, start, size);
}

/*
diff --git a/lib/librte_eal/common/meson.build b/lib/librte_eal/common/meson.build
index 7d02191..a1ada24 100644
--- a/lib/librte_eal/common/meson.build
+++ b/lib/librte_eal/common/meson.build
@@ -16,6 +16,7 @@ common_sources = files(
'eal_common_launch.c',
'eal_common_lcore.c',
'eal_common_log.c',
+ 'eal_common_memalloc.c',
'eal_common_memory.c',
'eal_common_memzone.c',
'eal_common_options.c',
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index af6b9be..5380ba8 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -49,6 +49,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_timer.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memzone.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_log.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_launch.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memalloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memory.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_tailqs.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_errno.c
--
2.7.4
Anatoly Burakov
2018-04-03 23:22:09 UTC
Add a new (non-legacy) memory init path for EAL. It uses the
new memory hotplug facilities.

If no -m or --socket-mem switches are specified, the new init
will not allocate anything, whereas if those switches are passed,
the appropriate number of pages will be requested, just like for
legacy init.

Allocated pages will be physically discontiguous (or rather, they're
not guaranteed to be physically contiguous - they may still be so by
accident) unless RTE_IOVA_VA mode is used.
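
In rough terms, the new init path turns each --socket-mem request
into a page count and hands it to the allocator; a simplified
sketch of that step (the real code below goes through
calc_num_pages_per_socket() to balance the requests first):

    /* simplified: request pages of one size on one socket */
    unsigned int n_pages = internal_config.socket_mem[socket_id] /
                    hpi->hugepage_sz;

    if (n_pages != 0 &&
                    eal_memalloc_alloc_seg_bulk(NULL, n_pages,
                            hpi->hugepage_sz, socket_id, true) < 0)
            return -1;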

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/linuxapp/eal/eal_memory.c | 62 ++++++++++++++++++++++++++++++--
1 file changed, 59 insertions(+), 3 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index d38fb68..c51d598 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -40,6 +40,7 @@
#include <rte_string_fns.h>

#include "eal_private.h"
+#include "eal_memalloc.h"
#include "eal_internal_cfg.h"
#include "eal_filesystem.h"
#include "eal_hugepages.h"
@@ -1600,6 +1601,61 @@ eal_legacy_hugepage_init(void)
return -1;
}

+static int
+eal_hugepage_init(void)
+{
+ struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
+ uint64_t memory[RTE_MAX_NUMA_NODES];
+ int hp_sz_idx, socket_id;
+
+ test_phys_addrs_available();
+
+ memset(used_hp, 0, sizeof(used_hp));
+
+ for (hp_sz_idx = 0;
+ hp_sz_idx < (int) internal_config.num_hugepage_sizes;
+ hp_sz_idx++) {
+ /* also initialize used_hp hugepage sizes in used_hp */
+ struct hugepage_info *hpi;
+ hpi = &internal_config.hugepage_info[hp_sz_idx];
+ used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz;
+ }
+
+ /* make a copy of socket_mem, needed for balanced allocation. */
+ for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++)
+ memory[hp_sz_idx] = internal_config.socket_mem[hp_sz_idx];
+
+ /* calculate final number of pages */
+ if (calc_num_pages_per_socket(memory,
+ internal_config.hugepage_info, used_hp,
+ internal_config.num_hugepage_sizes) < 0)
+ return -1;
+
+ for (hp_sz_idx = 0;
+ hp_sz_idx < (int)internal_config.num_hugepage_sizes;
+ hp_sz_idx++) {
+ for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES;
+ socket_id++) {
+ struct hugepage_info *hpi = &used_hp[hp_sz_idx];
+ unsigned int num_pages = hpi->num_pages[socket_id];
+ int num_pages_alloc;
+
+ if (num_pages == 0)
+ continue;
+
+ RTE_LOG(DEBUG, EAL, "Allocating %u pages of size %" PRIu64 "M on socket %i\n",
+ num_pages, hpi->hugepage_sz >> 20, socket_id);
+
+ num_pages_alloc = eal_memalloc_alloc_seg_bulk(NULL,
+ num_pages, hpi->hugepage_sz,
+ socket_id, true);
+ if (num_pages_alloc < 0)
+ return -1;
+ }
+ }
+ return 0;
+}
+
/*
* uses fstat to report the size of a file on disk
*/
@@ -1722,9 +1778,9 @@ eal_legacy_hugepage_attach(void)
int
rte_eal_hugepage_init(void)
{
- if (internal_config.legacy_mem)
- return eal_legacy_hugepage_init();
- return -1;
+ return internal_config.legacy_mem ?
+ eal_legacy_hugepage_init() :
+ eal_hugepage_init();
}

int
--
2.7.4
Anatoly Burakov
2018-04-03 23:22:19 UTC
Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/malloc_heap.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 18c7b69..f8daf84 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -196,6 +196,15 @@ alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
int allocd_pages;
void *ret, *map_addr;

+ alloc_sz = (size_t)pg_sz * n_segs;
+
+ /* first, check if we're allowed to allocate this memory */
+ if (eal_memalloc_mem_alloc_validate(socket,
+ heap->total_size + alloc_sz) < 0) {
+ RTE_LOG(DEBUG, EAL, "User has disallowed allocation\n");
+ return NULL;
+ }
+
allocd_pages = eal_memalloc_alloc_seg_bulk(ms, n_segs, pg_sz,
socket, true);

@@ -205,7 +214,6 @@ alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,

map_addr = ms[0]->addr;
msl = rte_mem_virt2memseg_list(map_addr);
- alloc_sz = (size_t)msl->page_sz * allocd_pages;

/* check if we wanted contiguous memory but didn't get it */
if (contig && !eal_memalloc_is_contig(msl, map_addr, alloc_sz)) {
--
2.7.4
Anatoly Burakov
2018-04-03 23:22:01 UTC
Before, we were aggregating multiple pages into one memseg, so the
number of memsegs was small. Now, each page gets its own memseg,
so the list of memsegs is huge. To accommodate the new memseg list
size and to keep the under-the-hood workings sane, the memseg list
is now not just a single list, but multiple lists. To be precise,
each hugepage size available on the system gets one or more memseg
lists, per socket.

In order to support dynamic memory allocation, we reserve all
memory in advance (unless we're in 32-bit legacy mode, in which
case we do not preallocate memory). That is, we do an anonymous
mmap() of the entire maximum size of memory per hugepage size, per
socket (limited to either RTE_MAX_MEMSEG_PER_TYPE pages or
RTE_MAX_MEM_MB_PER_TYPE megabytes worth of memory, whichever is
smaller), split over multiple lists (each limited to either
RTE_MAX_MEMSEG_PER_LIST memsegs or RTE_MAX_MEM_MB_PER_LIST
megabytes, whichever is smaller). There is also a global limit of
CONFIG_RTE_MAX_MEM_MB megabytes, which is mainly used on 32-bit
targets to limit the amount of preallocated memory, but it can
also be used to place an upper limit on the total amount of VA
memory that a DPDK application can allocate.

So, for each hugepage size, we get (by default) up to 128G worth
of memory, per socket, split into chunks of up to 32G in size.
The address space is claimed at the start, in eal_common_memory.c.
The actual page allocation code is in eal_memalloc.c (Linux-only),
and largely consists of copied EAL memory init code.
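
With the default values added to config/common_base in this patch,
the caps work out as follows for 1G pages (a worked example only,
not extra configuration):

    uint64_t page_sz = RTE_PGSIZE_1G;
    /* per list: min(8192 * 1G, 32768 MB) = 32 GB */
    uint64_t per_list = RTE_MIN(page_sz * RTE_MAX_MEMSEG_PER_LIST,
                    (uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20);
    /* per type (per socket): min(32768 * 1G, 131072 MB) = 128 GB */
    uint64_t per_type = RTE_MIN(page_sz * RTE_MAX_MEMSEG_PER_TYPE,
                    (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20);
    /* i.e. up to 128 GB per socket for 1G pages, split into
     * four 32 GB lists, as described above */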

Pages in the list are also indexed by address. That is, in order
to figure out where a page belongs, one can simply look at the
base address of its memseg list. Similarly, figuring out the IOVA
address of a memzone is a matter of finding the right memseg list,
taking the offset from its base address, and dividing by the page
size to get the appropriate memseg.
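
The address-based indexing described above is just pointer
arithmetic; a sketch of it (the patch implements this in
virt2memseg() further down):

    /* given a memseg list msl known to contain addr: */
    int ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->page_sz;
    const struct rte_memseg *ms =
                    rte_fbarray_get(&msl->memseg_arr, ms_idx);

    /* IOVA of an arbitrary virtual address inside that memseg */
    rte_iova_t iova = ms->iova + RTE_PTR_DIFF(addr, ms->addr);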

This commit also removes the rte_eal_get_physmem_layout() call,
according to deprecation notice [1], and removes that deprecation
notice as well.

On 32-bit targets, due to limited VA space, DPDK will no longer
spread memory across different sockets like before. Instead, it
will (by default) allocate all of the memory on the socket where
the master lcore is. To override this behavior, --socket-mem must
be used.

The rest of the changes are really ripple effects from the memseg
change - heap changes, compile fixes, and rewrites to support
fbarray-backed memseg lists. Due to the earlier switch to _walk()
functions, most of the changes are simple fixes; however, some of
the _walk() calls were switched to memseg list walks where it made
sense to do so.

Additionally, we are switching locks from flock() to fcntl().
Down the line, we will be introducing a single-file segments
option, and flock() cannot lock parts of a file. Therefore, we
will use fcntl() locks for legacy mem as well, in case someone is
unfortunate enough to accidentally start a legacy mem primary
process alongside an already running non-legacy mem primary
process.

[1] http://dpdk.org/dev/patchwork/patch/34002/
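
The flock()/fcntl() distinction above is that flock() always locks
the whole file, while fcntl() can lock an arbitrary byte range; a
minimal sketch of the kind of range lock this enables (illustrative
only, not the patch's actual locking code):

    #include <fcntl.h>

    /* take a non-blocking write lock on [offset, offset + len) */
    static int
    lock_range(int fd, off_t offset, off_t len)
    {
            struct flock fl = {
                    .l_type = F_WRLCK,
                    .l_whence = SEEK_SET,
                    .l_start = offset,
                    .l_len = len,
            };

            return fcntl(fd, F_SETLK, &fl);
    }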

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3:
- New and improved legacy mode, without (too much) crazy hacks
- 32-bit support
- FreeBSD support
- Compile fixes for all platforms

config/common_base | 15 +-
config/defconfig_i686-native-linuxapp-gcc | 3 +
config/defconfig_i686-native-linuxapp-icc | 3 +
config/defconfig_x86_x32-native-linuxapp-gcc | 3 +
config/rte_config.h | 7 +-
doc/guides/rel_notes/deprecation.rst | 9 -
drivers/bus/fslmc/fslmc_vfio.c | 10 +-
drivers/bus/fslmc/portal/dpaa2_hw_pvt.h | 2 +-
drivers/bus/pci/linux/pci.c | 8 +-
drivers/crypto/dpaa_sec/dpaa_sec.c | 2 +-
drivers/net/mlx4/mlx4_mr.c | 4 +-
drivers/net/mlx5/mlx5.c | 3 +-
drivers/net/mlx5/mlx5_mr.c | 4 +-
drivers/net/virtio/virtio_user/vhost_kernel.c | 4 +-
lib/librte_eal/bsdapp/eal/eal.c | 12 +-
lib/librte_eal/bsdapp/eal/eal_hugepage_info.c | 17 +-
lib/librte_eal/bsdapp/eal/eal_memory.c | 207 ++++-
lib/librte_eal/common/eal_common_memory.c | 581 ++++++++++++--
lib/librte_eal/common/eal_common_memzone.c | 48 +-
lib/librte_eal/common/eal_hugepages.h | 1 -
lib/librte_eal/common/eal_internal_cfg.h | 2 +-
lib/librte_eal/common/include/rte_eal_memconfig.h | 22 +-
lib/librte_eal/common/include/rte_memory.h | 56 +-
lib/librte_eal/common/include/rte_memzone.h | 1 -
lib/librte_eal/common/malloc_elem.c | 12 +-
lib/librte_eal/common/malloc_elem.h | 6 +-
lib/librte_eal/common/malloc_heap.c | 62 +-
lib/librte_eal/common/rte_malloc.c | 22 +-
lib/librte_eal/linuxapp/eal/eal.c | 15 +-
lib/librte_eal/linuxapp/eal/eal_hugepage_info.c | 25 +-
lib/librte_eal/linuxapp/eal/eal_memory.c | 913 +++++++++++++++-------
lib/librte_eal/linuxapp/eal/eal_vfio.c | 9 +-
lib/librte_eal/rte_eal_version.map | 3 +-
lib/librte_mempool/rte_mempool.c | 9 +-
test/test/test_malloc.c | 30 +-
test/test/test_memory.c | 10 +-
test/test/test_memzone.c | 12 +-
37 files changed, 1563 insertions(+), 589 deletions(-)

diff --git a/config/common_base b/config/common_base
index 7abf7c6..0ca1a06 100644
--- a/config/common_base
+++ b/config/common_base
@@ -61,7 +61,20 @@ CONFIG_RTE_CACHE_LINE_SIZE=64
CONFIG_RTE_LIBRTE_EAL=y
CONFIG_RTE_MAX_LCORE=128
CONFIG_RTE_MAX_NUMA_NODES=8
-CONFIG_RTE_MAX_MEMSEG=256
+CONFIG_RTE_MAX_MEMSEG_LISTS=64
+# each memseg list will be limited to either RTE_MAX_MEMSEG_PER_LIST pages
+# or RTE_MAX_MEM_MB_PER_LIST megabytes worth of memory, whichever is smaller
+CONFIG_RTE_MAX_MEMSEG_PER_LIST=8192
+CONFIG_RTE_MAX_MEM_MB_PER_LIST=32768
+# a "type" is a combination of page size and NUMA node. total number of memseg
+# lists per type will be limited to either RTE_MAX_MEMSEG_PER_TYPE pages (split
+# over multiple lists of RTE_MAX_MEMSEG_PER_LIST pages), or
+# RTE_MAX_MEM_MB_PER_TYPE megabytes of memory (split over multiple lists of
+# RTE_MAX_MEM_MB_PER_LIST), whichever is smaller
+CONFIG_RTE_MAX_MEMSEG_PER_TYPE=32768
+CONFIG_RTE_MAX_MEM_MB_PER_TYPE=131072
+# global maximum usable amount of VA, in megabytes
+CONFIG_RTE_MAX_MEM_MB=524288
CONFIG_RTE_MAX_MEMZONE=2560
CONFIG_RTE_MAX_TAILQ=32
CONFIG_RTE_ENABLE_ASSERT=n
diff --git a/config/defconfig_i686-native-linuxapp-gcc b/config/defconfig_i686-native-linuxapp-gcc
index a42ba4f..1178fe3 100644
--- a/config/defconfig_i686-native-linuxapp-gcc
+++ b/config/defconfig_i686-native-linuxapp-gcc
@@ -46,3 +46,6 @@ CONFIG_RTE_LIBRTE_PMD_ZUC=n
# AVP PMD is not supported on 32-bit
#
CONFIG_RTE_LIBRTE_AVP_PMD=n
+
+# 32-bit doesn't break up memory in lists, but does have VA allocation limit
+CONFIG_RTE_MAX_MEM_MB=2048
diff --git a/config/defconfig_i686-native-linuxapp-icc b/config/defconfig_i686-native-linuxapp-icc
index 144ba0a..f096e22 100644
--- a/config/defconfig_i686-native-linuxapp-icc
+++ b/config/defconfig_i686-native-linuxapp-icc
@@ -51,3 +51,6 @@ CONFIG_RTE_LIBRTE_PMD_ZUC=n
# AVP PMD is not supported on 32-bit
#
CONFIG_RTE_LIBRTE_AVP_PMD=n
+
+# 32-bit doesn't break up memory in lists, but does have VA allocation limit
+CONFIG_RTE_MAX_MEM_MB=2048
diff --git a/config/defconfig_x86_x32-native-linuxapp-gcc b/config/defconfig_x86_x32-native-linuxapp-gcc
index b6206a5..57d000d 100644
--- a/config/defconfig_x86_x32-native-linuxapp-gcc
+++ b/config/defconfig_x86_x32-native-linuxapp-gcc
@@ -26,3 +26,6 @@ CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
# AVP PMD is not supported on 32-bit
#
CONFIG_RTE_LIBRTE_AVP_PMD=n
+
+# 32-bit doesn't break up memory in lists, but does have VA allocation limit
+CONFIG_RTE_MAX_MEM_MB=2048
diff --git a/config/rte_config.h b/config/rte_config.h
index 72c0aa2..e42be1c 100644
--- a/config/rte_config.h
+++ b/config/rte_config.h
@@ -21,7 +21,12 @@
/****** library defines ********/

/* EAL defines */
-#define RTE_MAX_MEMSEG 512
+#define RTE_MAX_MEMSEG_LISTS 128
+#define RTE_MAX_MEMSEG_PER_LIST 8192
+#define RTE_MAX_MEM_MB_PER_LIST 32768
+#define RTE_MAX_MEMSEG_PER_TYPE 32768
+#define RTE_MAX_MEM_MB_PER_TYPE 65536
+#define RTE_MAX_MEM_MB 524288
#define RTE_MAX_MEMZONE 2560
#define RTE_MAX_TAILQ 32
#define RTE_LOG_LEVEL RTE_LOG_INFO
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index ec70b5f..c9f2703 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -38,15 +38,6 @@ Deprecation Notices
success and failure, respectively. This will change to 1 and 0 for true and
false, respectively, to make use of the function more intuitive.

-* eal: due to internal data layout reorganization, there will be changes to
- several structures and functions as a result of coming changes to support
- memory hotplug in v18.05.
- ``rte_eal_get_physmem_layout`` will be deprecated and removed in subsequent
- releases.
- ``rte_mem_config`` contents will change due to switch to memseg lists.
- ``rte_memzone`` member ``memseg_id`` will no longer serve any useful purpose
- and will be removed.
-
* eal: a new set of mbuf mempool ops name APIs for user, platform and best
mempool names have been defined in ``rte_mbuf`` in v18.02. The uses of
``rte_eal_mbuf_default_mempool_ops`` shall be replaced by
diff --git a/drivers/bus/fslmc/fslmc_vfio.c b/drivers/bus/fslmc/fslmc_vfio.c
index ccdbeff..31831e3 100644
--- a/drivers/bus/fslmc/fslmc_vfio.c
+++ b/drivers/bus/fslmc/fslmc_vfio.c
@@ -194,7 +194,8 @@ static int vfio_map_irq_region(struct fslmc_vfio_group *group)
}

static int
-fslmc_vfio_map(const struct rte_memseg *ms, void *arg)
+fslmc_vfio_map(const struct rte_memseg_list *msl __rte_unused,
+ const struct rte_memseg *ms, void *arg)
{
int *n_segs = arg;
struct fslmc_vfio_group *group;
@@ -236,18 +237,11 @@ fslmc_vfio_map(const struct rte_memseg *ms, void *arg)

int rte_fslmc_vfio_dmamap(void)
{
- const struct rte_memseg *memseg;
int i = 0;

if (is_dma_done)
return 0;

- memseg = rte_eal_get_physmem_layout();
- if (memseg == NULL) {
- FSLMC_VFIO_LOG(ERR, "Cannot get physical layout.");
- return -ENODEV;
- }
-
if (rte_memseg_walk(fslmc_vfio_map, &i) < 0)
return -1;

diff --git a/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h b/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
index 45fd41e..72aae43 100644
--- a/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
+++ b/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
@@ -274,7 +274,7 @@ static phys_addr_t dpaa2_mem_vtop(uint64_t vaddr)
if (dpaa2_virt_mode)
return vaddr;

- memseg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr);
+ memseg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
if (memseg)
return memseg->phys_addr + RTE_PTR_DIFF(vaddr, memseg->addr);
return (size_t)NULL;
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index 6dda054..4630a80 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -117,9 +117,10 @@ rte_pci_unmap_device(struct rte_pci_device *dev)
}

static int
-find_max_end_va(const struct rte_memseg *ms, void *arg)
+find_max_end_va(const struct rte_memseg_list *msl, void *arg)
{
- void *end_va = RTE_PTR_ADD(ms->addr, ms->len);
+ size_t sz = msl->memseg_arr.len * msl->page_sz;
+ void *end_va = RTE_PTR_ADD(msl->base_va, sz);
void **max_va = arg;

if (*max_va < end_va)
@@ -132,10 +133,11 @@ pci_find_max_end_va(void)
{
void *va = NULL;

- rte_memseg_walk(find_max_end_va, &va);
+ rte_memseg_list_walk(find_max_end_va, &va);
return va;
}

+
/* parse one line of the "resource" sysfs file (note that the 'line'
* string is modified)
*/
diff --git a/drivers/crypto/dpaa_sec/dpaa_sec.c b/drivers/crypto/dpaa_sec/dpaa_sec.c
index a14e669..b685220 100644
--- a/drivers/crypto/dpaa_sec/dpaa_sec.c
+++ b/drivers/crypto/dpaa_sec/dpaa_sec.c
@@ -95,7 +95,7 @@ dpaa_mem_vtop(void *vaddr)
{
const struct rte_memseg *ms;

- ms = rte_mem_virt2memseg(vaddr);
+ ms = rte_mem_virt2memseg(vaddr, NULL);
if (ms)
return ms->iova + RTE_PTR_DIFF(vaddr, ms->addr);
return (size_t)NULL;
diff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c
index 47dd542..2ba609e 100644
--- a/drivers/net/mlx4/mlx4_mr.c
+++ b/drivers/net/mlx4/mlx4_mr.c
@@ -142,10 +142,10 @@ mlx4_mr_get(struct priv *priv, struct rte_mempool *mp)
(void *)mp, (void *)start, (void *)end,
(size_t)(end - start));
/* Round start and end to page boundary if found in memory segments. */
- ms = rte_mem_virt2memseg((void *)start);
+ ms = rte_mem_virt2memseg((void *)start, NULL);
if (ms != NULL)
start = RTE_ALIGN_FLOOR(start, ms->hugepage_sz);
- ms = rte_mem_virt2memseg((void *)end);
+ ms = rte_mem_virt2memseg((void *)end, NULL);
if (ms != NULL)
end = RTE_ALIGN_CEIL(end, ms->hugepage_sz);

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 1724b65..e228356 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -478,7 +478,8 @@ static struct rte_pci_driver mlx5_driver;
static void *uar_base;

static int
-find_lower_va_bound(const struct rte_memseg *ms, void *arg)
+find_lower_va_bound(const struct rte_memseg_list *msl __rte_unused,
+ const struct rte_memseg *ms, void *arg)
{
void **addr = arg;

diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index d8c04dc..6638185 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -263,10 +263,10 @@ mlx5_mr_new(struct rte_eth_dev *dev, struct rte_mempool *mp)
mr->end = end;

/* Round start and end to page boundary if found in memory segments. */
- ms = rte_mem_virt2memseg((void *)start);
+ ms = rte_mem_virt2memseg((void *)start, NULL);
if (ms != NULL)
start = RTE_ALIGN_FLOOR(start, ms->hugepage_sz);
- ms = rte_mem_virt2memseg((void *)end);
+ ms = rte_mem_virt2memseg((void *)end, NULL);
if (ms != NULL)
end = RTE_ALIGN_CEIL(end, ms->hugepage_sz);

diff --git a/drivers/net/virtio/virtio_user/vhost_kernel.c b/drivers/net/virtio/virtio_user/vhost_kernel.c
index 93d7efe..b244409 100644
--- a/drivers/net/virtio/virtio_user/vhost_kernel.c
+++ b/drivers/net/virtio/virtio_user/vhost_kernel.c
@@ -75,7 +75,8 @@ struct walk_arg {
uint32_t region_nr;
};
static int
-add_memory_region(const struct rte_memseg *ms, size_t len, void *arg)
+add_memory_region(const struct rte_memseg_list *msl __rte_unused,
+ const struct rte_memseg *ms, size_t len, void *arg)
{
struct walk_arg *wa = arg;
struct vhost_memory_region *mr;
@@ -95,7 +96,6 @@ add_memory_region(const struct rte_memseg *ms, size_t len, void *arg)
return 0;
}

-
/* By default, vhost kernel module allows 64 regions, but DPDK allows
* 256 segments. As a relief, below function merges those virtually
* adjacent memsegs into one region.
diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index f44b904..d009cf0 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -64,8 +64,8 @@ static int mem_cfg_fd = -1;
static struct flock wr_lock = {
.l_type = F_WRLCK,
.l_whence = SEEK_SET,
- .l_start = offsetof(struct rte_mem_config, memseg),
- .l_len = sizeof(early_mem_config.memseg),
+ .l_start = offsetof(struct rte_mem_config, memsegs),
+ .l_len = sizeof(early_mem_config.memsegs),
};

/* Address of global and public configuration */
@@ -430,11 +430,11 @@ eal_parse_args(int argc, char **argv)
}

static int
-check_socket(const struct rte_memseg *ms, void *arg)
+check_socket(const struct rte_memseg_list *msl, void *arg)
{
int *socket_id = arg;

- if (ms->socket_id == *socket_id)
+ if (msl->socket_id == *socket_id && msl->memseg_arr.count != 0)
return 1;

return 0;
@@ -447,10 +447,11 @@ eal_check_mem_on_local_socket(void)

socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);

- if (rte_memseg_walk(check_socket, &socket_id) == 0)
+ if (rte_memseg_list_walk(check_socket, &socket_id) == 0)
RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n");
}

+
static int
sync_func(__attribute__((unused)) void *arg)
{
@@ -561,7 +562,6 @@ rte_eal_init(int argc, char **argv)
rte_eal_get_configuration()->iova_mode = rte_bus_get_iommu_class();

if (internal_config.no_hugetlbfs == 0 &&
- internal_config.process_type != RTE_PROC_SECONDARY &&
eal_hugepage_info_init() < 0) {
rte_eal_init_alert("Cannot get hugepage information.");
rte_errno = EACCES;
diff --git a/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c b/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c
index be2dbf0..ba44da0 100644
--- a/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c
@@ -47,12 +47,18 @@ eal_hugepage_info_init(void)
struct hugepage_info *hpi = &internal_config.hugepage_info[0];
struct hugepage_info *tmp_hpi;

+ internal_config.num_hugepage_sizes = 1;
+
+ /* nothing more to be done for secondary */
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY)
+ return 0;
+
sysctl_size = sizeof(num_buffers);
error = sysctlbyname("hw.contigmem.num_buffers", &num_buffers,
&sysctl_size, NULL, 0);

if (error != 0) {
- RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.num_buffers");
+ RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.num_buffers\n");
return -1;
}

@@ -61,7 +67,7 @@ eal_hugepage_info_init(void)
&sysctl_size, NULL, 0);

if (error != 0) {
- RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.buffer_size");
+ RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.buffer_size\n");
return -1;
}

@@ -81,22 +87,21 @@ eal_hugepage_info_init(void)
RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dKB\n",
num_buffers, (int)(buffer_size>>10));

- internal_config.num_hugepage_sizes = 1;
hpi->hugedir = CONTIGMEM_DEV;
hpi->hugepage_sz = buffer_size;
hpi->num_pages[0] = num_buffers;
hpi->lock_descriptor = fd;

tmp_hpi = create_shared_memory(eal_hugepage_info_path(),
- sizeof(struct hugepage_info));
+ sizeof(internal_config.hugepage_info));
if (tmp_hpi == NULL ) {
RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
return -1;
}

- memcpy(tmp_hpi, hpi, sizeof(struct hugepage_info));
+ memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info));

- if ( munmap(tmp_hpi, sizeof(struct hugepage_info)) < 0) {
+ if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {
RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
return -1;
}
diff --git a/lib/librte_eal/bsdapp/eal/eal_memory.c b/lib/librte_eal/bsdapp/eal/eal_memory.c
index bdfb882..6692b3d 100644
--- a/lib/librte_eal/bsdapp/eal/eal_memory.c
+++ b/lib/librte_eal/bsdapp/eal/eal_memory.c
@@ -6,6 +6,8 @@
#include <sys/types.h>
#include <sys/sysctl.h>
#include <inttypes.h>
+#include <errno.h>
+#include <string.h>
#include <fcntl.h>

#include <rte_eal.h>
@@ -41,37 +43,135 @@ rte_eal_hugepage_init(void)
struct rte_mem_config *mcfg;
uint64_t total_mem = 0;
void *addr;
- unsigned i, j, seg_idx = 0;
+ unsigned int i, j, seg_idx = 0;

/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;

/* for debug purposes, hugetlbfs can be disabled */
if (internal_config.no_hugetlbfs) {
- addr = malloc(internal_config.memory);
- mcfg->memseg[0].iova = (rte_iova_t)(uintptr_t)addr;
- mcfg->memseg[0].addr = addr;
- mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
- mcfg->memseg[0].len = internal_config.memory;
- mcfg->memseg[0].socket_id = 0;
+ struct rte_memseg_list *msl;
+ struct rte_fbarray *arr;
+ struct rte_memseg *ms;
+ uint64_t page_sz;
+ int n_segs, cur_seg;
+
+ /* create a memseg list */
+ msl = &mcfg->memsegs[0];
+
+ page_sz = RTE_PGSIZE_4K;
+ n_segs = internal_config.memory / page_sz;
+
+ if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs,
+ sizeof(struct rte_memseg))) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n");
+ return -1;
+ }
+
+ addr = mmap(NULL, internal_config.memory,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+ if (addr == MAP_FAILED) {
+ RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
+ strerror(errno));
+ return -1;
+ }
+ msl->base_va = addr;
+ msl->page_sz = page_sz;
+ msl->socket_id = 0;
+
+ /* populate memsegs. each memseg is 1 page long */
+ for (cur_seg = 0; cur_seg < n_segs; cur_seg++) {
+ arr = &mcfg->memsegs[cur_seg].memseg_arr;
+
+ ms = rte_fbarray_get(arr, cur_seg);
+ if (rte_eal_iova_mode() == RTE_IOVA_VA)
+ ms->iova = (uintptr_t)addr;
+ else
+ ms->iova = RTE_BAD_IOVA;
+ ms->addr = addr;
+ ms->hugepage_sz = page_sz;
+ ms->len = page_sz;
+ ms->socket_id = 0;
+
+ rte_fbarray_set_used(arr, cur_seg);
+
+ addr = RTE_PTR_ADD(addr, page_sz);
+ }
return 0;
}

/* map all hugepages and sort them */
for (i = 0; i < internal_config.num_hugepage_sizes; i ++){
struct hugepage_info *hpi;
+ uint64_t page_sz, mem_needed;
+ unsigned int n_pages, max_pages;

hpi = &internal_config.hugepage_info[i];
- for (j = 0; j < hpi->num_pages[0]; j++) {
+ page_sz = hpi->hugepage_sz;
+ max_pages = hpi->num_pages[0];
+ mem_needed = RTE_ALIGN_CEIL(internal_config.memory - total_mem,
+ page_sz);
+
+ n_pages = RTE_MIN(mem_needed / page_sz, max_pages);
+
+ for (j = 0; j < n_pages; j++) {
+ struct rte_memseg_list *msl;
+ struct rte_fbarray *arr;
struct rte_memseg *seg;
+ int msl_idx, ms_idx;
rte_iova_t physaddr;
int error;
size_t sysctl_size = sizeof(physaddr);
char physaddr_str[64];

- addr = mmap(NULL, hpi->hugepage_sz, PROT_READ|PROT_WRITE,
- MAP_SHARED, hpi->lock_descriptor,
- j * EAL_PAGE_SIZE);
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS;
+ msl_idx++) {
+ bool empty;
+ msl = &mcfg->memsegs[msl_idx];
+ arr = &msl->memseg_arr;
+
+ if (msl->page_sz != page_sz)
+ continue;
+
+ empty = arr->count == 0;
+
+ /* we need 1, plus hole if not empty */
+ ms_idx = rte_fbarray_find_next_n_free(arr,
+ 0, empty ? 1 : 0);
+
+ /* memseg list is full? */
+ if (ms_idx < 0)
+ continue;
+
+ /* leave some space between memsegs, they are
+ * not IOVA contiguous, so they shouldn't be VA
+ * contiguous either.
+ */
+ if (!empty)
+ ms_idx++;
+
+ break;
+ }
+ if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE),
+ RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE));
+ return -1;
+ }
+ arr = &msl->memseg_arr;
+ seg = rte_fbarray_get(arr, ms_idx);
+
+ addr = RTE_PTR_ADD(msl->base_va,
+ (size_t)msl->page_sz * ms_idx);
+
+ /* address is already mapped in memseg list, so using
+ * MAP_FIXED here is safe.
+ */
+ addr = mmap(addr, page_sz, PROT_READ|PROT_WRITE,
+ MAP_SHARED | MAP_FIXED,
+ hpi->lock_descriptor,
+ j * EAL_PAGE_SIZE);
if (addr == MAP_FAILED) {
RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n",
j, hpi->hugedir);
@@ -88,33 +188,60 @@ rte_eal_hugepage_init(void)
return -1;
}

- seg = &mcfg->memseg[seg_idx++];
seg->addr = addr;
seg->iova = physaddr;
- seg->hugepage_sz = hpi->hugepage_sz;
- seg->len = hpi->hugepage_sz;
+ seg->hugepage_sz = page_sz;
+ seg->len = page_sz;
seg->nchannel = mcfg->nchannel;
seg->nrank = mcfg->nrank;
seg->socket_id = 0;

+ rte_fbarray_set_used(arr, ms_idx);
+
RTE_LOG(INFO, EAL, "Mapped memory segment %u @ %p: physaddr:0x%"
PRIx64", len %zu\n",
- seg_idx, addr, physaddr, hpi->hugepage_sz);
- if (total_mem >= internal_config.memory ||
- seg_idx >= RTE_MAX_MEMSEG)
- break;
+ seg_idx, addr, physaddr, page_sz);
+
+ total_mem += seg->len;
}
+ if (total_mem >= internal_config.memory)
+ break;
+ }
+ if (total_mem < internal_config.memory) {
+ RTE_LOG(ERR, EAL, "Couldn't reserve requested memory, requested: %" PRIu64 "M available: %" PRIu64 "M\n",
+ internal_config.memory >> 20, total_mem >> 20);
+ return -1;
}
return 0;
}

+struct attach_walk_args {
+ int fd_hugepage;
+ int seg_idx;
+};
+static int
+attach_segment(const struct rte_memseg_list *msl __rte_unused,
+ const struct rte_memseg *ms, void *arg)
+{
+ struct attach_walk_args *wa = arg;
+ void *addr;
+
+ addr = mmap(ms->addr, ms->len, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_FIXED, wa->fd_hugepage,
+ wa->seg_idx * EAL_PAGE_SIZE);
+ if (addr == MAP_FAILED || addr != ms->addr)
+ return -1;
+ wa->seg_idx++;
+
+ return 0;
+}
+
int
rte_eal_hugepage_attach(void)
{
const struct hugepage_info *hpi;
int fd_hugepage_info, fd_hugepage = -1;
- unsigned i = 0;
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ unsigned int i;

/* Obtain a file descriptor for hugepage_info */
fd_hugepage_info = open(eal_hugepage_info_path(), O_RDONLY);
@@ -124,41 +251,43 @@ rte_eal_hugepage_attach(void)
}

/* Map the shared hugepage_info into the process address spaces */
- hpi = mmap(NULL, sizeof(struct hugepage_info), PROT_READ, MAP_PRIVATE,
- fd_hugepage_info, 0);
+ hpi = mmap(NULL, sizeof(internal_config.hugepage_info),
+ PROT_READ, MAP_PRIVATE, fd_hugepage_info, 0);
if (hpi == MAP_FAILED) {
RTE_LOG(ERR, EAL, "Could not mmap %s\n", eal_hugepage_info_path());
goto error;
}

- /* Obtain a file descriptor for contiguous memory */
- fd_hugepage = open(hpi->hugedir, O_RDWR);
- if (fd_hugepage < 0) {
- RTE_LOG(ERR, EAL, "Could not open %s\n", hpi->hugedir);
- goto error;
- }
+ for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+ const struct hugepage_info *cur_hpi = &hpi[i];
+ struct attach_walk_args wa;

- /* Map the contiguous memory into each memory segment */
- for (i = 0; i < hpi->num_pages[0]; i++) {
+ memset(&wa, 0, sizeof(wa));

- void *addr;
- struct rte_memseg *seg = &mcfg->memseg[i];
+ /* Obtain a file descriptor for contiguous memory */
+ fd_hugepage = open(cur_hpi->hugedir, O_RDWR);
+ if (fd_hugepage < 0) {
+ RTE_LOG(ERR, EAL, "Could not open %s\n",
+ cur_hpi->hugedir);
+ goto error;
+ }
+ wa.fd_hugepage = fd_hugepage;
+ wa.seg_idx = 0;

- addr = mmap(seg->addr, hpi->hugepage_sz, PROT_READ|PROT_WRITE,
- MAP_SHARED|MAP_FIXED, fd_hugepage,
- i * EAL_PAGE_SIZE);
- if (addr == MAP_FAILED || addr != seg->addr) {
+ /* Map the contiguous memory into each memory segment */
+ if (rte_memseg_walk(attach_segment, &wa) < 0) {
RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n",
- i, hpi->hugedir);
+ wa.seg_idx, cur_hpi->hugedir);
goto error;
}

+ close(fd_hugepage);
+ fd_hugepage = -1;
}

/* hugepage_info is no longer required */
- munmap((void *)(uintptr_t)hpi, sizeof(struct hugepage_info));
+ munmap((void *)(uintptr_t)hpi, sizeof(internal_config.hugepage_info));
close(fd_hugepage_info);
- close(fd_hugepage);
return 0;

error:
diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index fd78d2f..0a6d678 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -13,6 +13,7 @@
#include <sys/mman.h>
#include <sys/queue.h>

+#include <rte_fbarray.h>
#include <rte_memory.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
@@ -30,6 +31,8 @@
* which is a multiple of hugepage size.
*/

+#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"
+
static uint64_t baseaddr_offset;
static uint64_t system_page_sz;

@@ -120,15 +123,393 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
return aligned_addr;
}

-/*
- * Return a pointer to a read-only table of struct rte_physmem_desc
- * elements, containing the layout of all addressable physical
- * memory. The last element of the table contains a NULL address.
- */
-const struct rte_memseg *
-rte_eal_get_physmem_layout(void)
+static uint64_t
+get_mem_amount(uint64_t page_sz, uint64_t max_mem)
+{
+ uint64_t area_sz, max_pages;
+
+ /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */
+ max_pages = RTE_MAX_MEMSEG_PER_LIST;
+ max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem);
+
+ area_sz = RTE_MIN(page_sz * max_pages, max_mem);
+
+ /* make sure the list isn't smaller than the page size */
+ area_sz = RTE_MAX(area_sz, page_sz);
+
+ return RTE_ALIGN(area_sz, page_sz);
+}
+
+static int
+alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
+ uint64_t max_mem, int socket_id, int type_msl_idx)
+{
+ char name[RTE_FBARRAY_NAME_LEN];
+ uint64_t mem_amount;
+ int max_segs;
+
+ mem_amount = get_mem_amount(page_sz, max_mem);
+ max_segs = mem_amount / page_sz;
+
+ snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
+ type_msl_idx);
+ if (rte_fbarray_init(&msl->memseg_arr, name, max_segs,
+ sizeof(struct rte_memseg))) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
+ rte_strerror(rte_errno));
+ return -1;
+ }
+
+ msl->page_sz = page_sz;
+ msl->socket_id = socket_id;
+ msl->base_va = NULL;
+
+ RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n",
+ (size_t)page_sz >> 10, socket_id);
+
+ return 0;
+}
+
+static int
+alloc_va_space(struct rte_memseg_list *msl)
+{
+ uint64_t page_sz;
+ size_t mem_sz;
+ void *addr;
+ int flags = 0;
+
+#ifdef RTE_ARCH_PPC_64
+ flags |= MAP_HUGETLB;
+#endif
+
+ page_sz = msl->page_sz;
+ mem_sz = page_sz * msl->memseg_arr.len;
+
+ addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags);
+ if (addr == NULL) {
+ if (rte_errno == EADDRNOTAVAIL)
+ RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n",
+ (unsigned long long)mem_sz, msl->base_va);
+ else
+ RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
+ return -1;
+ }
+ msl->base_va = addr;
+
+ return 0;
+}
+
+static int __rte_unused
+memseg_primary_init_32(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int active_sockets, hpi_idx, msl_idx = 0;
+ unsigned int socket_id, i;
+ struct rte_memseg_list *msl;
+ uint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem;
+ uint64_t max_mem;
+
+ /* no-huge does not need this at all */
+ if (internal_config.no_hugetlbfs)
+ return 0;
+
+ /* this is a giant hack, but desperate times call for desperate
+ * measures. in legacy 32-bit mode, we cannot preallocate VA space,
+ * because having upwards of 2 gigabytes of VA space already mapped will
+ * interfere with our ability to map and sort hugepages.
+ *
+ * therefore, in legacy 32-bit mode, we will be initializing memseg
+ * lists much later - in eal_memory.c, right after we unmap all the
+ * unneeded pages. this will not affect secondary processes, as those
+ * should be able to mmap the space without (too many) problems.
+ */
+ if (internal_config.legacy_mem)
+ return 0;
+
+ /* 32-bit mode is a very special case. we cannot know in advance where
+ * the user will want to allocate their memory, so we have to do some
+ * heuristics.
+ */
+ active_sockets = 0;
+ total_requested_mem = 0;
+ if (internal_config.force_sockets)
+ for (i = 0; i < rte_socket_count(); i++) {
+ uint64_t mem;
+
+ socket_id = rte_socket_id_by_idx(i);
+ mem = internal_config.socket_mem[socket_id];
+
+ if (mem == 0)
+ continue;
+
+ active_sockets++;
+ total_requested_mem += mem;
+ }
+ else
+ total_requested_mem = internal_config.memory;
+
+ max_mem = (uint64_t) RTE_MAX_MEM_MB_PER_TYPE << 20;
+ if (total_requested_mem > max_mem) {
+ RTE_LOG(ERR, EAL, "Invalid parameters: 32-bit process can at most use %uM of memory\n",
+ (unsigned int)(max_mem >> 20));
+ return -1;
+ }
+ total_extra_mem = max_mem - total_requested_mem;
+ extra_mem_per_socket = active_sockets == 0 ? total_extra_mem :
+ total_extra_mem / active_sockets;
+
+ /* the allocation logic is a little bit convoluted, but here's how it
+ * works, in a nutshell:
+ * - if user hasn't specified on which sockets to allocate memory via
+ * --socket-mem, we allocate all of our memory on master core socket.
+ * - if user has specified sockets to allocate memory on, there may be
+ * some "unused" memory left (e.g. if user has specified --socket-mem
+ * such that not all memory adds up to 2 gigabytes), so add it to all
+ * sockets that are in use equally.
+ *
+ * page sizes are sorted by size in descending order, so we can safely
+ * assume that we dispense with bigger page sizes first.
+ */
+
+ /* create memseg lists */
+ for (i = 0; i < rte_socket_count(); i++) {
+ int hp_sizes = (int) internal_config.num_hugepage_sizes;
+ uint64_t max_socket_mem, cur_socket_mem;
+ unsigned int master_lcore_socket;
+ struct rte_config *cfg = rte_eal_get_configuration();
+ bool skip;
+
+ socket_id = rte_socket_id_by_idx(i);
+
+#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ if (socket_id > 0)
+ break;
+#endif
+
+ /* if we didn't specifically request memory on this socket */
+ skip = active_sockets != 0 &&
+ internal_config.socket_mem[socket_id] == 0;
+ /* ...or if we didn't specifically request memory on *any*
+ * socket, and this is not master lcore
+ */
+ master_lcore_socket = rte_lcore_to_socket_id(cfg->master_lcore);
+ skip |= active_sockets == 0 && socket_id != master_lcore_socket;
+
+ if (skip) {
+ RTE_LOG(DEBUG, EAL, "Will not preallocate memory on socket %u\n",
+ socket_id);
+ continue;
+ }
+
+ /* max amount of memory on this socket */
+ max_socket_mem = (active_sockets != 0 ?
+ internal_config.socket_mem[socket_id] :
+ internal_config.memory) +
+ extra_mem_per_socket;
+ cur_socket_mem = 0;
+
+ for (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) {
+ uint64_t max_pagesz_mem, cur_pagesz_mem = 0;
+ uint64_t hugepage_sz;
+ struct hugepage_info *hpi;
+ int type_msl_idx, max_segs, total_segs = 0;
+
+ hpi = &internal_config.hugepage_info[hpi_idx];
+ hugepage_sz = hpi->hugepage_sz;
+
+ max_segs = RTE_MAX_MEMSEG_PER_TYPE;
+ max_pagesz_mem = max_socket_mem - cur_socket_mem;
+
+ /* make it multiple of page size */
+ max_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem,
+ hugepage_sz);
+
+ RTE_LOG(DEBUG, EAL, "Attempting to preallocate %" PRIu64 "M on socket %i\n",
+ max_pagesz_mem >> 20, socket_id);
+
+ type_msl_idx = 0;
+ while (cur_pagesz_mem < max_pagesz_mem &&
+ total_segs < max_segs) {
+ if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL,
+ "No more space in memseg lists, please increase %s\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+ return -1;
+ }
+
+ msl = &mcfg->memsegs[msl_idx++];
+
+ if (alloc_memseg_list(msl, hugepage_sz,
+ max_pagesz_mem, socket_id,
+ type_msl_idx))
+ return -1;
+
+ total_segs += msl->memseg_arr.len;
+ cur_pagesz_mem = total_segs * hugepage_sz;
+ type_msl_idx++;
+
+ if (alloc_va_space(msl)) {
+ RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
+ return -1;
+ }
+ }
+ cur_socket_mem += cur_pagesz_mem;
+ }
+ }
+
+ return 0;
+}
+
+static int __rte_unused
+memseg_primary_init(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int i, socket_id, hpi_idx, msl_idx = 0;
+ struct rte_memseg_list *msl;
+ uint64_t max_mem, total_mem;
+
+ /* no-huge does not need this at all */
+ if (internal_config.no_hugetlbfs)
+ return 0;
+
+ max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
+ total_mem = 0;
+
+ /* create memseg lists */
+ for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;
+ hpi_idx++) {
+ struct hugepage_info *hpi;
+ uint64_t hugepage_sz;
+
+ hpi = &internal_config.hugepage_info[hpi_idx];
+ hugepage_sz = hpi->hugepage_sz;
+
+ for (i = 0; i < (int) rte_socket_count(); i++) {
+ uint64_t max_type_mem, total_type_mem = 0;
+ int type_msl_idx, max_segs, total_segs = 0;
+
+ socket_id = rte_socket_id_by_idx(i);
+
+#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ if (socket_id > 0)
+ break;
+#endif
+
+ max_type_mem = RTE_MIN(max_mem - total_mem,
+ (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20);
+ max_segs = RTE_MAX_MEMSEG_PER_TYPE;
+
+ type_msl_idx = 0;
+ while (total_type_mem < max_type_mem &&
+ total_segs < max_segs) {
+ uint64_t cur_max_mem;
+ if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL,
+ "No more space in memseg lists, please increase %s\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+ return -1;
+ }
+
+ msl = &mcfg->memsegs[msl_idx++];
+
+ cur_max_mem = max_type_mem - total_type_mem;
+ if (alloc_memseg_list(msl, hugepage_sz,
+ cur_max_mem, socket_id,
+ type_msl_idx))
+ return -1;
+
+ total_segs += msl->memseg_arr.len;
+ total_type_mem = total_segs * hugepage_sz;
+ type_msl_idx++;
+
+ if (alloc_va_space(msl)) {
+ RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
+ return -1;
+ }
+ }
+ total_mem += total_type_mem;
+ }
+ }
+ return 0;
+}
+
+static int
+memseg_secondary_init(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int msl_idx = 0;
+ struct rte_memseg_list *msl;
+
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+
+ msl = &mcfg->memsegs[msl_idx];
+
+ /* skip empty memseg lists */
+ if (msl->memseg_arr.len == 0)
+ continue;
+
+ if (rte_fbarray_attach(&msl->memseg_arr)) {
+ RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n");
+ return -1;
+ }
+
+ /* preallocate VA space */
+ if (alloc_va_space(msl)) {
+ RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n");
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static struct rte_memseg *
+virt2memseg(const void *addr, const struct rte_memseg_list *msl)
{
- return rte_eal_get_configuration()->mem_config->memseg;
+ const struct rte_fbarray *arr;
+ void *start, *end;
+ int ms_idx;
+
+ /* a memseg list was specified, check if it's the right one */
+ start = msl->base_va;
+ end = RTE_PTR_ADD(start, (size_t)msl->page_sz * msl->memseg_arr.len);
+
+ if (addr < start || addr >= end)
+ return NULL;
+
+ /* now, calculate index */
+ arr = &msl->memseg_arr;
+ ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->page_sz;
+ return rte_fbarray_get(arr, ms_idx);
+}
+
+static struct rte_memseg_list *
+virt2memseg_list(const void *addr)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl;
+ int msl_idx;
+
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ void *start, *end;
+ msl = &mcfg->memsegs[msl_idx];
+
+ start = msl->base_va;
+ end = RTE_PTR_ADD(start,
+ (size_t)msl->page_sz * msl->memseg_arr.len);
+ if (addr >= start && addr < end)
+ break;
+ }
+ /* if we didn't find our memseg list */
+ if (msl_idx == RTE_MAX_MEMSEG_LISTS)
+ return NULL;
+ return msl;
+}
+
+__rte_experimental struct rte_memseg_list *
+rte_mem_virt2memseg_list(const void *addr)
+{
+ return virt2memseg_list(addr);
}
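To illustrate how the two lookups above are meant to compose, here is a minimal
sketch (it mirrors the rte_malloc_virt2iova() change later in this patch; the
function name is made up and error handling is kept minimal):

static rte_iova_t
va_to_iova_sketch(const void *addr)
{
	const struct rte_memseg_list *msl;
	const struct rte_memseg *ms;

	/* find the memseg list owning this address, then the page within it */
	msl = rte_mem_virt2memseg_list(addr);
	if (msl == NULL)
		return RTE_BAD_IOVA;
	ms = rte_mem_virt2memseg(addr, msl);
	if (ms == NULL || ms->iova == RTE_BAD_IOVA)
		return RTE_BAD_IOVA;
	/* IOVA of the page, plus offset of addr within the page */
	return ms->iova + RTE_PTR_DIFF(addr, ms->addr);
}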

struct virtiova {
@@ -136,7 +517,8 @@ struct virtiova {
void *virt;
};
static int
-find_virt(const struct rte_memseg *ms, void *arg)
+find_virt(const struct rte_memseg_list *msl __rte_unused,
+ const struct rte_memseg *ms, void *arg)
{
struct virtiova *vi = arg;
if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) {
@@ -161,49 +543,19 @@ rte_mem_iova2virt(rte_iova_t iova)
return vi.virt;
}

-struct virtms {
- const void *virt;
- struct rte_memseg *ms;
-};
-static int
-find_memseg(const struct rte_memseg *ms, void *arg)
-{
- struct virtms *vm = arg;
-
- if (arg >= ms->addr && arg < RTE_PTR_ADD(ms->addr, ms->len)) {
- struct rte_memseg *memseg, *found_ms;
- int idx;
-
- memseg = rte_eal_get_configuration()->mem_config->memseg;
- idx = ms - memseg;
- found_ms = &memseg[idx];
-
- vm->ms = found_ms;
- return 1;
- }
- return 0;
-}
-
__rte_experimental struct rte_memseg *
-rte_mem_virt2memseg(const void *addr)
+rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl)
{
- struct virtms vm;
-
- memset(&vm, 0, sizeof(vm));
-
- vm.virt = addr;
-
- rte_memseg_walk(find_memseg, &vm);
-
- return vm.ms;
+ return virt2memseg(addr, msl != NULL ? msl :
+ rte_mem_virt2memseg_list(addr));
}

static int
-physmem_size(const struct rte_memseg *ms, void *arg)
+physmem_size(const struct rte_memseg_list *msl, void *arg)
{
uint64_t *total_len = arg;

- *total_len += ms->len;
+ *total_len += msl->memseg_arr.count * msl->page_sz;

return 0;
}
@@ -214,32 +566,39 @@ rte_eal_get_physmem_size(void)
{
uint64_t total_len = 0;

- rte_memseg_walk(physmem_size, &total_len);
+ rte_memseg_list_walk(physmem_size, &total_len);

return total_len;
}

static int
-dump_memseg(const struct rte_memseg *ms, void *arg)
+dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+ void *arg)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- int i = ms - mcfg->memseg;
+ int msl_idx, ms_idx;
FILE *f = arg;

- if (i < 0 || i >= RTE_MAX_MEMSEG)
+ msl_idx = msl - mcfg->memsegs;
+ if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
return -1;

- fprintf(f, "Segment %u: IOVA:0x%"PRIx64", len:%zu, "
+ ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
+ if (ms_idx < 0)
+ return -1;
+
+ fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, "
"virt:%p, socket_id:%"PRId32", "
"hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
- "nrank:%"PRIx32"\n", i,
- mcfg->memseg[i].iova,
- mcfg->memseg[i].len,
- mcfg->memseg[i].addr,
- mcfg->memseg[i].socket_id,
- mcfg->memseg[i].hugepage_sz,
- mcfg->memseg[i].nchannel,
- mcfg->memseg[i].nrank);
+ "nrank:%"PRIx32"\n",
+ msl_idx, ms_idx,
+ ms->iova,
+ ms->len,
+ ms->addr,
+ ms->socket_id,
+ ms->hugepage_sz,
+ ms->nchannel,
+ ms->nrank);

return 0;
}
@@ -289,55 +648,89 @@ rte_mem_lock_page(const void *virt)
}

int __rte_experimental
-rte_memseg_walk(rte_memseg_walk_t func, void *arg)
+rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- int i, ret;
+ int i, ms_idx, ret = 0;

- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- const struct rte_memseg *ms = &mcfg->memseg[i];
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ const struct rte_memseg *ms;
+ struct rte_fbarray *arr;

- if (ms->addr == NULL)
+ if (msl->memseg_arr.count == 0)
continue;

- ret = func(ms, arg);
- if (ret < 0)
- return -1;
- if (ret > 0)
- return 1;
+ arr = &msl->memseg_arr;
+
+ ms_idx = rte_fbarray_find_next_used(arr, 0);
+ while (ms_idx >= 0) {
+ int n_segs;
+ size_t len;
+
+ ms = rte_fbarray_get(arr, ms_idx);
+
+ /* find how many more segments there are, starting with
+ * this one.
+ */
+ n_segs = rte_fbarray_find_contig_used(arr, ms_idx);
+ len = n_segs * msl->page_sz;
+
+ ret = func(msl, ms, len, arg);
+ if (ret < 0)
+ return -1;
+ else if (ret > 0)
+ return 1;
+ ms_idx = rte_fbarray_find_next_used(arr,
+ ms_idx + n_segs);
+ }
}
return 0;
}

int __rte_experimental
-rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
+rte_memseg_walk(rte_memseg_walk_t func, void *arg)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- int i, j, ret;
+ int i, ms_idx, ret = 0;

- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- const struct rte_memseg *ms = &mcfg->memseg[i];
- size_t total_len;
- void *end_addr;
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ const struct rte_memseg *ms;
+ struct rte_fbarray *arr;

- if (ms->addr == NULL)
+ if (msl->memseg_arr.count == 0)
continue;

- end_addr = RTE_PTR_ADD(ms->addr, ms->len);
+ arr = &msl->memseg_arr;
+
+ ms_idx = rte_fbarray_find_next_used(arr, 0);
+ while (ms_idx >= 0) {
+ ms = rte_fbarray_get(arr, ms_idx);
+ ret = func(msl, ms, arg);
+ if (ret < 0)
+ return -1;
+ else if (ret > 0)
+ return 1;
+ ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
+ }
+ }
+ return 0;
+}

- /* check how many more segments are contiguous to this one */
- for (j = i + 1; j < RTE_MAX_MEMSEG; j++) {
- const struct rte_memseg *next = &mcfg->memseg[j];
+int __rte_experimental
+rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int i, ret = 0;

- if (next->addr != end_addr)
- break;
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];

- end_addr = RTE_PTR_ADD(next->addr, next->len);
- i++;
- }
- total_len = RTE_PTR_DIFF(end_addr, ms->addr);
+ if (msl->base_va == NULL)
+ continue;

- ret = func(ms, total_len, arg);
+ ret = func(msl, arg);
if (ret < 0)
return -1;
if (ret > 0)
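For completeness, a minimal sketch of a callback written against the new walk
prototype (the extra msl argument is what this patch adds; the per-socket
accounting below is only an illustration):

static int
socket0_mem(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
		void *arg)
{
	uint64_t *total = arg;

	if (msl->socket_id == 0)
		*total += ms->len;
	return 0; /* 0 continues the walk, 1 stops it, -1 reports an error */
}

/* usage: uint64_t total = 0; rte_memseg_walk(socket0_mem, &total); */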
@@ -350,9 +743,25 @@ rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
int
rte_eal_memory_init(void)
{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int retval;
RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n");

- const int retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
+ if (!mcfg)
+ return -1;
+
+ retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
+#ifndef RTE_ARCH_64
+ memseg_primary_init_32() :
+#else
+ memseg_primary_init() :
+#endif
+ memseg_secondary_init();
+
+ if (retval < 0)
+ return -1;
+
+ retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
rte_eal_hugepage_init() :
rte_eal_hugepage_attach();
if (retval < 0)
diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index 88f401f..529b36f 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -234,10 +234,9 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
mz->iova = rte_malloc_virt2iova(mz_addr);
mz->addr = mz_addr;
mz->len = (requested_len == 0 ? elem->size : requested_len);
- mz->hugepage_sz = elem->ms->hugepage_sz;
- mz->socket_id = elem->ms->socket_id;
+ mz->hugepage_sz = elem->msl->page_sz;
+ mz->socket_id = elem->msl->socket_id;
mz->flags = 0;
- mz->memseg_id = elem->ms - rte_eal_get_configuration()->mem_config->memseg;

return mz;
}
@@ -399,20 +398,50 @@ static void
dump_memzone(const struct rte_memzone *mz, void *arg)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl = NULL;
+ void *cur_addr, *mz_end;
+ struct rte_memseg *ms;
+ int mz_idx, ms_idx;
+ size_t page_sz;
FILE *f = arg;
- int mz_idx;

mz_idx = mz - mcfg->memzone;

- fprintf(f, "Zone %u: name:<%s>, IO:0x%"PRIx64", len:0x%zx, virt:%p, "
+ fprintf(f, "Zone %u: name:<%s>, len:0x%zx, virt:%p, "
"socket_id:%"PRId32", flags:%"PRIx32"\n",
mz_idx,
mz->name,
- mz->iova,
mz->len,
mz->addr,
mz->socket_id,
mz->flags);
+
+ /* go through each page occupied by this memzone */
+ msl = rte_mem_virt2memseg_list(mz->addr);
+ if (!msl) {
+ RTE_LOG(DEBUG, EAL, "Skipping bad memzone\n");
+ return;
+ }
+ page_sz = (size_t)mz->hugepage_sz;
+ cur_addr = RTE_PTR_ALIGN_FLOOR(mz->addr, page_sz);
+ mz_end = RTE_PTR_ADD(cur_addr, mz->len);
+
+ fprintf(f, "physical segments used:\n");
+ ms_idx = RTE_PTR_DIFF(mz->addr, msl->base_va) / page_sz;
+ ms = rte_fbarray_get(&msl->memseg_arr, ms_idx);
+
+ do {
+ fprintf(f, " addr: %p iova: 0x%" PRIx64 " "
+ "len: 0x%zx "
+ "pagesz: 0x%zx\n",
+ cur_addr, ms->iova, ms->len, page_sz);
+
+ /* advance VA to next page */
+ cur_addr = RTE_PTR_ADD(cur_addr, page_sz);
+
+ /* memzones occupy contiguous segments */
+ ++ms;
+ } while (cur_addr < mz_end);
}

/* Dump all reserved memory zones on console */
@@ -429,7 +458,6 @@ int
rte_eal_memzone_init(void)
{
struct rte_mem_config *mcfg;
- const struct rte_memseg *memseg;

/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
@@ -438,12 +466,6 @@ rte_eal_memzone_init(void)
if (rte_eal_process_type() == RTE_PROC_SECONDARY)
return 0;

- memseg = rte_eal_get_physmem_layout();
- if (memseg == NULL) {
- RTE_LOG(ERR, EAL, "%s(): Cannot get physical layout\n", __func__);
- return -1;
- }
-
rte_rwlock_write_lock(&mcfg->mlock);

/* delete all zones */
diff --git a/lib/librte_eal/common/eal_hugepages.h b/lib/librte_eal/common/eal_hugepages.h
index 1d519bb..ad1b0b6 100644
--- a/lib/librte_eal/common/eal_hugepages.h
+++ b/lib/librte_eal/common/eal_hugepages.h
@@ -22,7 +22,6 @@ struct hugepage_file {
size_t size; /**< the page size */
int socket_id; /**< NUMA socket ID */
int file_id; /**< the '%d' in HUGEFILE_FMT */
- int memseg_id; /**< the memory segment to which page belongs */
char filepath[MAX_HUGEPAGE_PATH]; /**< path to backing file on filesystem */
};

diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index fda087b..5cf7102 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -23,7 +23,7 @@ struct hugepage_info {
uint64_t hugepage_sz; /**< size of a huge page */
const char *hugedir; /**< dir where hugetlbfs is mounted */
uint32_t num_pages[RTE_MAX_NUMA_NODES];
- /**< number of hugepages of that size on each socket */
+ /**< number of hugepages of that size on each socket */
int lock_descriptor; /**< file descriptor for hugepage dir */
};

diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
index 29fa0b6..b745e18 100644
--- a/lib/librte_eal/common/include/rte_eal_memconfig.h
+++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
@@ -12,12 +12,30 @@
#include <rte_malloc_heap.h>
#include <rte_rwlock.h>
#include <rte_pause.h>
+#include <rte_fbarray.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
+ * memseg list is a special case as we need to store a bunch of other data
+ * together with the array itself.
+ */
+struct rte_memseg_list {
+ RTE_STD_C11
+ union {
+ void *base_va;
+ /**< Base virtual address for this memseg list. */
+ uint64_t addr_64;
+ /**< Makes sure addr is always 64-bits */
+ };
+ int socket_id; /**< Socket ID for all memsegs in this list. */
+ uint64_t page_sz; /**< Page size for all memsegs in this list. */
+ struct rte_fbarray memseg_arr;
+};
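Note: since each memseg list is one contiguous VA reservation, segment i of a
list lives at base_va + i * page_sz, and conversely an address maps back to
index (addr - base_va) / page_sz; that invariant is what the virt2memseg()
helpers earlier in this patch rely on.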
+
+/**
* the structure for the memory configuration for the RTE.
* Used by the rte_config structure. It is separated out, as for multi-process
* support, the memory details should be shared across instances
@@ -43,9 +61,11 @@ struct rte_mem_config {
uint32_t memzone_cnt; /**< Number of allocated memzones */

/* memory segments and zones */
- struct rte_memseg memseg[RTE_MAX_MEMSEG]; /**< Physmem descriptors. */
struct rte_memzone memzone[RTE_MAX_MEMZONE]; /**< Memzone descriptors. */

+ struct rte_memseg_list memsegs[RTE_MAX_MEMSEG_LISTS];
+ /**< list of dynamic arrays holding memsegs */
+
struct rte_tailq_head tailq_head[RTE_MAX_TAILQ]; /**< Tailqs for objects */

/* Heaps of Malloc per socket */
diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index b3d7e61..55383c4 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -23,6 +23,9 @@ extern "C" {
#include <rte_compat.h>
#include <rte_config.h>

+/* forward declaration for pointers */
+struct rte_memseg_list;
+
__extension__
enum rte_page_sizes {
RTE_PGSIZE_4K = 1ULL << 12,
@@ -151,7 +154,18 @@ rte_mem_iova2virt(rte_iova_t iova);
* Memseg pointer on success, or NULL on error.
*/
__rte_experimental struct rte_memseg *
-rte_mem_virt2memseg(const void *virt);
+rte_mem_virt2memseg(const void *virt, const struct rte_memseg_list *msl);
+
+/**
+ * Get memseg list corresponding to virtual memory address.
+ *
+ * @param virt
+ * The virtual address.
+ * @return
+ * Memseg list to which this virtual address belongs to.
+ */
+__rte_experimental struct rte_memseg_list *
+rte_mem_virt2memseg_list(const void *virt);

/**
* Memseg walk function prototype.
@@ -160,7 +174,8 @@ rte_mem_virt2memseg(const void *virt);
* Returning 1 will stop the walk
* Returning -1 will stop the walk and report error
*/
-typedef int (*rte_memseg_walk_t)(const struct rte_memseg *ms, void *arg);
+typedef int (*rte_memseg_walk_t)(const struct rte_memseg_list *msl,
+ const struct rte_memseg *ms, void *arg);

/**
* Memseg contig walk function prototype. This will trigger a callback on every
@@ -171,8 +186,19 @@ typedef int (*rte_memseg_walk_t)(const struct rte_memseg *ms, void *arg);
* Returning 1 will stop the walk
* Returning -1 will stop the walk and report error
*/
-typedef int (*rte_memseg_contig_walk_t)(const struct rte_memseg *ms,
- size_t len, void *arg);
+typedef int (*rte_memseg_contig_walk_t)(const struct rte_memseg_list *msl,
+ const struct rte_memseg *ms, size_t len, void *arg);
+
+/**
+ * Memseg list walk function prototype. This will trigger a callback on every
+ * allocated memseg list.
+ *
+ * Returning 0 will continue walk
+ * Returning 1 will stop the walk
+ * Returning -1 will stop the walk and report error
+ */
+typedef int (*rte_memseg_list_walk_t)(const struct rte_memseg_list *msl,
+ void *arg);

/**
* Walk list of all memsegs.
@@ -205,21 +231,19 @@ int __rte_experimental
rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg);

/**
- * Get the layout of the available physical memory.
- *
- * It can be useful for an application to have the full physical
- * memory layout to decide the size of a memory zone to reserve. This
- * table is stored in rte_config (see rte_eal_get_configuration()).
+ * Walk each allocated memseg list.
*
+ * @param func
+ * Iterator function
+ * @param arg
+ * Argument passed to iterator
* @return
- * - On success, return a pointer to a read-only table of struct
- * rte_physmem_desc elements, containing the layout of all
- * addressable physical memory. The last element of the table
- * contains a NULL address.
- * - On error, return NULL. This should not happen since it is a fatal
- * error that will probably cause the entire system to panic.
+ * 0 if walked over the entire list
+ * 1 if stopped by the user
+ * -1 if user function reported error
*/
-const struct rte_memseg *rte_eal_get_physmem_layout(void);
+int __rte_experimental
+rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg);

/**
* Dump the physical memory layout to a file.
diff --git a/lib/librte_eal/common/include/rte_memzone.h b/lib/librte_eal/common/include/rte_memzone.h
index ef3a4dd..6d4bdf1 100644
--- a/lib/librte_eal/common/include/rte_memzone.h
+++ b/lib/librte_eal/common/include/rte_memzone.h
@@ -67,7 +67,6 @@ struct rte_memzone {
int32_t socket_id; /**< NUMA socket ID. */

uint32_t flags; /**< Characteristics of this memzone. */
- uint32_t memseg_id; /**< Memseg it belongs. */
} __attribute__((__packed__));

/**
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 87695b9..685aac4 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -27,11 +27,11 @@
* Initialize a general malloc_elem header structure
*/
void
-malloc_elem_init(struct malloc_elem *elem,
- struct malloc_heap *heap, const struct rte_memseg *ms, size_t size)
+malloc_elem_init(struct malloc_elem *elem, struct malloc_heap *heap,
+ struct rte_memseg_list *msl, size_t size)
{
elem->heap = heap;
- elem->ms = ms;
+ elem->msl = msl;
elem->prev = NULL;
elem->next = NULL;
memset(&elem->free_list, 0, sizeof(elem->free_list));
@@ -100,7 +100,7 @@ malloc_elem_insert(struct malloc_elem *elem)
* so we just check the page addresses.
*/
static bool
-elem_check_phys_contig(const struct rte_memseg *ms __rte_unused,
+elem_check_phys_contig(const struct rte_memseg_list *msl __rte_unused,
void *start, size_t size)
{
rte_iova_t cur, expected;
@@ -191,7 +191,7 @@ elem_start_pt(struct malloc_elem *elem, size_t size, unsigned align,
* couldn't fit all data into one physically contiguous
* block, try again with lower addresses.
*/
- if (!elem_check_phys_contig(elem->ms,
+ if (!elem_check_phys_contig(elem->msl,
(void *)new_data_start,
new_data_size)) {
elem_size -= align;
@@ -225,7 +225,7 @@ split_elem(struct malloc_elem *elem, struct malloc_elem *split_pt)
const size_t old_elem_size = (uintptr_t)split_pt - (uintptr_t)elem;
const size_t new_elem_size = elem->size - old_elem_size;

- malloc_elem_init(split_pt, elem->heap, elem->ms, new_elem_size);
+ malloc_elem_init(split_pt, elem->heap, elem->msl, new_elem_size);
split_pt->prev = elem;
split_pt->next = next_elem;
if (next_elem)
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 34bd268..620dd44 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -7,7 +7,7 @@

#include <stdbool.h>

-#include <rte_memory.h>
+#include <rte_eal_memconfig.h>

/* dummy definition of struct so we can use pointers to it in malloc_elem struct */
struct malloc_heap;
@@ -26,7 +26,7 @@ struct malloc_elem {
/**< points to next elem in memseg */
LIST_ENTRY(malloc_elem) free_list;
/**< list of free elements in heap */
- const struct rte_memseg *ms;
+ struct rte_memseg_list *msl;
volatile enum elem_state state;
uint32_t pad;
size_t size;
@@ -113,7 +113,7 @@ malloc_elem_from_data(const void *data)
void
malloc_elem_init(struct malloc_elem *elem,
struct malloc_heap *heap,
- const struct rte_memseg *ms,
+ struct rte_memseg_list *msl,
size_t size);

void
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 79914fc..0ef2c45 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -21,6 +21,7 @@
#include <rte_memcpy.h>
#include <rte_atomic.h>

+#include "eal_internal_cfg.h"
#include "malloc_elem.h"
#include "malloc_heap.h"

@@ -62,36 +63,49 @@ check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
}

/*
- * Expand the heap with a memseg.
- * This reserves the zone and sets a dummy malloc_elem header at the end
- * to prevent overflow. The rest of the zone is added to free list as a single
- * large free block
+ * Expand the heap with a memory area.
*/
+static struct malloc_elem *
+malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl,
+ void *start, size_t len)
+{
+ struct malloc_elem *elem = start;
+
+ malloc_elem_init(elem, heap, msl, len);
+
+ malloc_elem_insert(elem);
+
+ elem = malloc_elem_join_adjacent_free(elem);
+
+ malloc_elem_free_list_insert(elem);
+
+ heap->total_size += len;
+
+ return elem;
+}
+
static int
-malloc_heap_add_memseg(const struct rte_memseg *ms, void *arg __rte_unused)
+malloc_add_seg(const struct rte_memseg_list *msl,
+ const struct rte_memseg *ms, size_t len, void *arg __rte_unused)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- struct malloc_elem *start_elem;
- struct rte_memseg *found_ms;
+ struct rte_memseg_list *found_msl;
struct malloc_heap *heap;
- size_t elem_size;
- int ms_idx;
-
- heap = &mcfg->malloc_heaps[ms->socket_id];
+ int msl_idx;

- /* ms is const, so find it */
- ms_idx = ms - mcfg->memseg;
- found_ms = &mcfg->memseg[ms_idx];
+ heap = &mcfg->malloc_heaps[msl->socket_id];

- start_elem = (struct malloc_elem *)found_ms->addr;
- elem_size = ms->len - MALLOC_ELEM_OVERHEAD;
+ /* msl is const, so find it */
+ msl_idx = msl - mcfg->memsegs;
+ found_msl = &mcfg->memsegs[msl_idx];

- malloc_elem_init(start_elem, heap, found_ms, elem_size);
- malloc_elem_insert(start_elem);
- malloc_elem_free_list_insert(start_elem);
+ if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
+ return -1;

- heap->total_size += elem_size;
+ malloc_heap_add_memory(heap, found_msl, ms->addr, len);

+ RTE_LOG(DEBUG, EAL, "Added %zuM to heap on socket %i\n", len >> 20,
+ msl->socket_id);
return 0;
}

@@ -114,7 +128,8 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
!!elem; elem = LIST_NEXT(elem, free_list)) {
if (malloc_elem_can_hold(elem, size, align, bound,
contig)) {
- if (check_hugepage_sz(flags, elem->ms->hugepage_sz))
+ if (check_hugepage_sz(flags,
+ elem->msl->page_sz))
return elem;
if (alt_elem == NULL)
alt_elem = elem;
@@ -263,7 +278,6 @@ rte_eal_malloc_heap_init(void)
if (mcfg == NULL)
return -1;

- rte_memseg_walk(malloc_heap_add_memseg, NULL);
-
- return 0;
+ /* add all IOVA-contiguous areas to the heap */
+ return rte_memseg_contig_walk(malloc_add_seg, NULL);
}
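A minimal sketch of a callback against the new rte_memseg_contig_walk()
prototype that malloc_add_seg() above implements (finding the largest
contiguous chunk is just an illustration):

static int
largest_chunk(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms __rte_unused, size_t len, void *arg)
{
	size_t *biggest = arg;

	if (len > *biggest)
		*biggest = len;
	return 0;
}

/* usage: size_t sz = 0; rte_memseg_contig_walk(largest_chunk, &sz); */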
diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index 436818a..c6d3e57 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -242,17 +242,21 @@ rte_malloc_set_limit(__rte_unused const char *type,
rte_iova_t
rte_malloc_virt2iova(const void *addr)
{
- rte_iova_t iova;
- const struct malloc_elem *elem = malloc_elem_from_data(addr);
+ const struct rte_memseg *ms;
+ struct malloc_elem *elem = malloc_elem_from_data(addr);
+
if (elem == NULL)
return RTE_BAD_IOVA;
- if (elem->ms->iova == RTE_BAD_IOVA)
- return RTE_BAD_IOVA;

if (rte_eal_iova_mode() == RTE_IOVA_VA)
- iova = (uintptr_t)addr;
- else
- iova = elem->ms->iova +
- RTE_PTR_DIFF(addr, elem->ms->addr);
- return iova;
+ return (uintptr_t) addr;
+
+ ms = rte_mem_virt2memseg(addr, elem->msl);
+ if (ms == NULL)
+ return RTE_BAD_IOVA;
+
+ if (ms->iova == RTE_BAD_IOVA)
+ return RTE_BAD_IOVA;
+
+ return ms->iova + RTE_PTR_DIFF(addr, ms->addr);
}
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index b34e57a..ffcbd71 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -74,8 +74,8 @@ static int mem_cfg_fd = -1;
static struct flock wr_lock = {
.l_type = F_WRLCK,
.l_whence = SEEK_SET,
- .l_start = offsetof(struct rte_mem_config, memseg),
- .l_len = sizeof(early_mem_config.memseg),
+ .l_start = offsetof(struct rte_mem_config, memsegs),
+ .l_len = sizeof(early_mem_config.memsegs),
};

/* Address of global and public configuration */
@@ -640,11 +640,14 @@ eal_parse_args(int argc, char **argv)
}

static int
-check_mem(const struct rte_memseg *ms, void *arg)
+check_socket(const struct rte_memseg_list *msl, void *arg)
{
- int *socket = arg;
+ int *socket_id = arg;

- return ms->socket_id == *socket;
+ if (msl->socket_id == *socket_id && msl->memseg_arr.count != 0)
+ return 1;
+
+ return 0;
}

static void
@@ -654,7 +657,7 @@ eal_check_mem_on_local_socket(void)

socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);

- if (rte_memseg_walk(check_mem, &socket_id) == 0)
+ if (rte_memseg_list_walk(check_socket, &socket_id) == 0)
RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n");
}

diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 8bbf771..afebd42 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -15,6 +15,7 @@
#include <unistd.h>
#include <errno.h>
#include <sys/queue.h>
+#include <sys/stat.h>

#include <rte_memory.h>
#include <rte_eal.h>
@@ -160,6 +161,18 @@ get_hugepage_dir(uint64_t hugepage_sz)
}

/*
+ * uses fstat to report the size of a file on disk
+ */
+static off_t
+get_file_size(int fd)
+{
+ struct stat st;
+ if (fstat(fd, &st) < 0)
+ return 0;
+ return st.st_size;
+}
+
+/*
* Clear the hugepage directory of whatever hugepage files
* there are. Checks if the file is locked (i.e.
* if it's in use by another DPDK process).
@@ -189,6 +202,8 @@ clear_hugedir(const char * hugedir)
}

while(dirent != NULL){
+ struct flock lck = {0};
+
/* skip files that don't match the hugepage pattern */
if (fnmatch(filter, dirent->d_name, 0) > 0) {
dirent = readdir(dir);
@@ -205,11 +220,17 @@ clear_hugedir(const char * hugedir)
}

/* non-blocking lock */
- lck_result = flock(fd, LOCK_EX | LOCK_NB);
+ lck.l_type = F_RDLCK;
+ lck.l_whence = SEEK_SET;
+ lck.l_start = 0;
+ lck.l_len = get_file_size(fd);
+
+ lck_result = fcntl(fd, F_SETLK, &lck);

/* if lock succeeds, unlock and remove the file */
if (lck_result != -1) {
- flock(fd, LOCK_UN);
+ lck.l_type = F_UNLCK;
+ fcntl(fd, F_SETLK, &lck);
unlinkat(dir_fd, dirent->d_name, 0);
}
close (fd);
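Note: the switch from flock() to fcntl() record locks above makes it possible
to lock byte ranges (later hunks lock exactly one page of a file). A minimal
sketch of the pattern, assuming an already-open fd:

	struct flock lck = {
		.l_type = F_RDLCK,	/* shared (read) lock */
		.l_whence = SEEK_SET,
		.l_start = 0,
		.l_len = 0,		/* 0 means "to end of file" */
	};

	if (fcntl(fd, F_SETLK, &lck) == -1) {
		/* another process holds a conflicting lock */
	} else {
		lck.l_type = F_UNLCK;
		fcntl(fd, F_SETLK, &lck);	/* release */
	}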
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 1d3defe..d38fb68 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -253,13 +253,12 @@ void numa_error(char *where)
*/
static unsigned
map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
- uint64_t *essential_memory __rte_unused, int orig)
+ uint64_t *essential_memory __rte_unused)
{
int fd;
unsigned i;
void *virtaddr;
- void *vma_addr = NULL;
- size_t vma_len = 0;
+ struct flock lck = {0};
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
int node_id = -1;
int essential_prev = 0;
@@ -274,7 +273,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
have_numa = false;
}

- if (orig && have_numa) {
+ if (have_numa) {
RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
if (get_mempolicy(&oldpolicy, oldmask->maskp,
oldmask->size + 1, 0, 0) < 0) {
@@ -290,6 +289,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
#endif

for (i = 0; i < hpi->num_pages[0]; i++) {
+ struct hugepage_file *hf = &hugepg_tbl[i];
uint64_t hugepage_sz = hpi->hugepage_sz;

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
@@ -324,66 +324,14 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
}
#endif

- if (orig) {
- hugepg_tbl[i].file_id = i;
- hugepg_tbl[i].size = hugepage_sz;
- eal_get_hugefile_path(hugepg_tbl[i].filepath,
- sizeof(hugepg_tbl[i].filepath), hpi->hugedir,
- hugepg_tbl[i].file_id);
- hugepg_tbl[i].filepath[sizeof(hugepg_tbl[i].filepath) - 1] = '\0';
- }
-#ifndef RTE_ARCH_64
- /* for 32-bit systems, don't remap 1G and 16G pages, just reuse
- * original map address as final map address.
- */
- else if ((hugepage_sz == RTE_PGSIZE_1G)
- || (hugepage_sz == RTE_PGSIZE_16G)) {
- hugepg_tbl[i].final_va = hugepg_tbl[i].orig_va;
- hugepg_tbl[i].orig_va = NULL;
- continue;
- }
-#endif
- else if (vma_len == 0) {
- unsigned j, num_pages;
-
- /* reserve a virtual area for next contiguous
- * physical block: count the number of
- * contiguous physical pages. */
- for (j = i+1; j < hpi->num_pages[0] ; j++) {
-#ifdef RTE_ARCH_PPC_64
- /* The physical addresses are sorted in
- * descending order on PPC64 */
- if (hugepg_tbl[j].physaddr !=
- hugepg_tbl[j-1].physaddr - hugepage_sz)
- break;
-#else
- if (hugepg_tbl[j].physaddr !=
- hugepg_tbl[j-1].physaddr + hugepage_sz)
- break;
-#endif
- }
- num_pages = j - i;
- vma_len = num_pages * hugepage_sz;
-
- /* get the biggest virtual memory area up to
- * vma_len. If it fails, vma_addr is NULL, so
- * let the kernel provide the address. */
- vma_addr = eal_get_virtual_area(NULL, &vma_len,
- hpi->hugepage_sz,
- EAL_VIRTUAL_AREA_ALLOW_SHRINK |
- EAL_VIRTUAL_AREA_UNMAP,
-#ifdef RTE_ARCH_PPC_64
- MAP_HUGETLB
-#else
- 0
-#endif
- );
- if (vma_addr == NULL)
- vma_len = hugepage_sz;
- }
+ hf->file_id = i;
+ hf->size = hugepage_sz;
+ eal_get_hugefile_path(hf->filepath, sizeof(hf->filepath),
+ hpi->hugedir, hf->file_id);
+ hf->filepath[sizeof(hf->filepath) - 1] = '\0';

/* try to create hugepage file */
- fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0600);
+ fd = open(hf->filepath, O_CREAT | O_RDWR, 0600);
if (fd < 0) {
RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
strerror(errno));
@@ -391,8 +339,11 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
}

/* map the segment, and populate page tables,
- * the kernel fills this segment with zeros */
- virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
+ * the kernel fills this segment with zeros. we don't care where
+ * this gets mapped - we already have contiguous memory areas
+ * ready for us to map into.
+ */
+ virtaddr = mmap(NULL, hugepage_sz, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE, fd, 0);
if (virtaddr == MAP_FAILED) {
RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
@@ -401,44 +352,38 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
goto out;
}

- if (orig) {
- hugepg_tbl[i].orig_va = virtaddr;
- }
- else {
- /* rewrite physical addresses in IOVA as VA mode */
- if (rte_eal_iova_mode() == RTE_IOVA_VA)
- hugepg_tbl[i].physaddr = (uintptr_t)virtaddr;
- hugepg_tbl[i].final_va = virtaddr;
- }
+ hf->orig_va = virtaddr;

- if (orig) {
- /* In linux, hugetlb limitations, like cgroup, are
- * enforced at fault time instead of mmap(), even
- * with the option of MAP_POPULATE. Kernel will send
- * a SIGBUS signal. To avoid to be killed, save stack
- * environment here, if SIGBUS happens, we can jump
- * back here.
- */
- if (huge_wrap_sigsetjmp()) {
- RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
- "hugepages of size %u MB\n",
- (unsigned)(hugepage_sz / 0x100000));
- munmap(virtaddr, hugepage_sz);
- close(fd);
- unlink(hugepg_tbl[i].filepath);
+		/* In Linux, hugetlb limitations such as cgroups are
+		 * enforced at fault time rather than at mmap(), even
+		 * with MAP_POPULATE. The kernel will send a SIGBUS
+		 * signal. To avoid being killed, save the stack
+		 * environment here; if SIGBUS happens, we can jump
+		 * back here.
+		 */
+ if (huge_wrap_sigsetjmp()) {
+ RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
+ "hugepages of size %u MB\n",
+ (unsigned int)(hugepage_sz / 0x100000));
+ munmap(virtaddr, hugepage_sz);
+ close(fd);
+ unlink(hugepg_tbl[i].filepath);
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
- if (maxnode)
- essential_memory[node_id] =
- essential_prev;
+ if (maxnode)
+ essential_memory[node_id] =
+ essential_prev;
#endif
- goto out;
- }
- *(int *)virtaddr = 0;
+ goto out;
}
+ *(int *)virtaddr = 0;


- /* set shared flock on the file. */
- if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
+ /* set shared lock on the file. */
+ lck.l_type = F_RDLCK;
+ lck.l_whence = SEEK_SET;
+ lck.l_start = 0;
+ lck.l_len = hugepage_sz;
+ if (fcntl(fd, F_SETLK, &lck) == -1) {
RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
__func__, strerror(errno));
close(fd);
@@ -446,9 +391,6 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
}

close(fd);
-
- vma_addr = (char *)vma_addr + hugepage_sz;
- vma_len -= hugepage_sz;
}

out:
@@ -470,20 +412,6 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
return i;
}

-/* Unmap all hugepages from original mapping */
-static int
-unmap_all_hugepages_orig(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
-{
- unsigned i;
- for (i = 0; i < hpi->num_pages[0]; i++) {
- if (hugepg_tbl[i].orig_va) {
- munmap(hugepg_tbl[i].orig_va, hpi->hugepage_sz);
- hugepg_tbl[i].orig_va = NULL;
- }
- }
- return 0;
-}
-
/*
* Parse /proc/self/numa_maps to get the NUMA socket ID for each huge
* page.
@@ -623,7 +551,7 @@ copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size,
int src_pos, dst_pos = 0;

for (src_pos = 0; src_pos < src_size; src_pos++) {
- if (src[src_pos].final_va != NULL) {
+ if (src[src_pos].orig_va != NULL) {
/* error on overflow attempt */
if (dst_pos == dest_size)
return -1;
@@ -694,9 +622,10 @@ unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl,
unmap_len = hp->size;

/* get start addr and len of the remaining segment */
- munmap(hp->final_va, (size_t) unmap_len);
+ munmap(hp->orig_va,
+ (size_t)unmap_len);

- hp->final_va = NULL;
+ hp->orig_va = NULL;
if (unlink(hp->filepath) == -1) {
RTE_LOG(ERR, EAL, "%s(): Removing %s failed: %s\n",
__func__, hp->filepath, strerror(errno));
@@ -715,6 +644,413 @@ unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl,
return 0;
}

+static int
+remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl;
+ struct rte_fbarray *arr;
+ int cur_page, seg_len;
+ unsigned int msl_idx;
+ int ms_idx;
+ uint64_t page_sz;
+ size_t memseg_len;
+ int socket_id;
+
+ page_sz = hugepages[seg_start].size;
+ socket_id = hugepages[seg_start].socket_id;
+ seg_len = seg_end - seg_start;
+
+ RTE_LOG(DEBUG, EAL, "Attempting to map %" PRIu64 "M on socket %i\n",
+ (seg_len * page_sz) >> 20ULL, socket_id);
+
+ /* find free space in memseg lists */
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ bool empty;
+ msl = &mcfg->memsegs[msl_idx];
+ arr = &msl->memseg_arr;
+
+ if (msl->page_sz != page_sz)
+ continue;
+ if (msl->socket_id != socket_id)
+ continue;
+
+ /* leave space for a hole if array is not empty */
+ empty = arr->count == 0;
+ ms_idx = rte_fbarray_find_next_n_free(arr, 0,
+ seg_len + (empty ? 0 : 1));
+
+ /* memseg list is full? */
+ if (ms_idx < 0)
+ continue;
+
+ /* leave some space between memsegs, they are not IOVA
+ * contiguous, so they shouldn't be VA contiguous either.
+ */
+ if (!empty)
+ ms_idx++;
+ break;
+ }
+ if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE),
+ RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE));
+ return -1;
+ }
+
+#ifdef RTE_ARCH_PPC_64
+ /* for PPC64 we go through the list backwards */
+ for (cur_page = seg_end - 1; cur_page >= seg_start;
+ cur_page--, ms_idx++) {
+#else
+ for (cur_page = seg_start; cur_page < seg_end; cur_page++, ms_idx++) {
+#endif
+ struct hugepage_file *hfile = &hugepages[cur_page];
+ struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
+ struct flock lck;
+ void *addr;
+ int fd;
+
+ fd = open(hfile->filepath, O_RDWR);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "Could not open '%s': %s\n",
+ hfile->filepath, strerror(errno));
+ return -1;
+ }
+ /* set shared lock on the file. */
+ lck.l_type = F_RDLCK;
+ lck.l_whence = SEEK_SET;
+ lck.l_start = 0;
+ lck.l_len = page_sz;
+ if (fcntl(fd, F_SETLK, &lck) == -1) {
+ RTE_LOG(DEBUG, EAL, "Could not lock '%s': %s\n",
+ hfile->filepath, strerror(errno));
+ close(fd);
+ return -1;
+ }
+ memseg_len = (size_t)page_sz;
+ addr = RTE_PTR_ADD(msl->base_va, ms_idx * memseg_len);
+
+ /* we know this address is already mmapped by memseg list, so
+ * using MAP_FIXED here is safe
+ */
+ addr = mmap(addr, page_sz, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0);
+ if (addr == MAP_FAILED) {
+ RTE_LOG(ERR, EAL, "Couldn't remap '%s': %s\n",
+ hfile->filepath, strerror(errno));
+ close(fd);
+ return -1;
+ }
+
+ /* we have a new address, so unmap previous one */
+#ifndef RTE_ARCH_64
+ /* in 32-bit legacy mode, we have already unmapped the page */
+ if (!internal_config.legacy_mem)
+ munmap(hfile->orig_va, page_sz);
+#else
+ munmap(hfile->orig_va, page_sz);
+#endif
+
+ hfile->orig_va = NULL;
+ hfile->final_va = addr;
+
+ /* rewrite physical addresses in IOVA as VA mode */
+ if (rte_eal_iova_mode() == RTE_IOVA_VA)
+ hfile->physaddr = (uintptr_t)addr;
+
+ /* set up memseg data */
+ ms->addr = addr;
+ ms->hugepage_sz = page_sz;
+ ms->len = memseg_len;
+ ms->iova = hfile->physaddr;
+ ms->socket_id = hfile->socket_id;
+ ms->nchannel = rte_memory_get_nchannel();
+ ms->nrank = rte_memory_get_nrank();
+
+ rte_fbarray_set_used(arr, ms_idx);
+
+ close(fd);
+ }
+ RTE_LOG(DEBUG, EAL, "Allocated %" PRIu64 "M on socket %i\n",
+ (seg_len * page_sz) >> 20, socket_id);
+ return 0;
+}
+
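remap_segment() above depends on the memseg list VA having been reserved up
front, which is what makes MAP_FIXED safe here. A stripped-down sketch of that
reserve-then-overlay pattern outside of EAL (function names are illustrative,
not part of this patch):

#include <sys/mman.h>

/* reserve a window of virtual address space without backing it */
static void *
reserve_va(size_t len)
{
	return mmap(NULL, len, PROT_NONE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
}

/* later, map a hugepage file over one page-sized slot in that window */
static void *
overlay_page(void *base, size_t offset, size_t page_sz, int fd)
{
	return mmap((char *)base + offset, page_sz,
			PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0);
}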
+#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"
+static int
+alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
+ int n_segs, int socket_id, int type_msl_idx)
+{
+ char name[RTE_FBARRAY_NAME_LEN];
+
+ snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
+ type_msl_idx);
+ if (rte_fbarray_init(&msl->memseg_arr, name, n_segs,
+ sizeof(struct rte_memseg))) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
+ rte_strerror(rte_errno));
+ return -1;
+ }
+
+ msl->page_sz = page_sz;
+ msl->socket_id = socket_id;
+ msl->base_va = NULL;
+
+ RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n",
+ (size_t)page_sz >> 10, socket_id);
+
+ return 0;
+}
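For example, with MEMSEG_LIST_FMT above, 2M pages on socket 0 yield fbarray
names memseg-2048k-0-0, memseg-2048k-0-1, and so on, while 1G pages on socket 1
would yield memseg-1048576k-1-0.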
+
+static int
+alloc_va_space(struct rte_memseg_list *msl)
+{
+ uint64_t page_sz;
+ size_t mem_sz;
+ void *addr;
+ int flags = 0;
+
+#ifdef RTE_ARCH_PPC_64
+ flags |= MAP_HUGETLB;
+#endif
+
+ page_sz = msl->page_sz;
+ mem_sz = page_sz * msl->memseg_arr.len;
+
+ addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags);
+ if (addr == NULL) {
+ if (rte_errno == EADDRNOTAVAIL)
+ RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n",
+ (unsigned long long)mem_sz, msl->base_va);
+ else
+ RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
+ return -1;
+ }
+ msl->base_va = addr;
+
+ return 0;
+}
+
+/*
+ * Our VA space is not preallocated yet, so preallocate it here. We need to know
+ * how many segments there are in order to map all pages into one address space,
+ * and leave appropriate holes between segments so that rte_malloc does not
+ * concatenate them into one big segment.
+ *
+ * We also need to unmap the original pages to free up address space.
+ */
+static int __rte_unused
+prealloc_segments(struct hugepage_file *hugepages, int n_pages)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int cur_page, seg_start_page, end_seg, new_memseg;
+ unsigned int hpi_idx, socket, i;
+ int n_contig_segs, n_segs;
+ int msl_idx;
+
+ /* before we preallocate segments, we need to free up our VA space.
+ * we're not removing files, and we already have information about
+ * PA-contiguousness, so it is safe to unmap everything.
+ */
+ for (cur_page = 0; cur_page < n_pages; cur_page++) {
+ struct hugepage_file *hpi = &hugepages[cur_page];
+ munmap(hpi->orig_va, hpi->size);
+ hpi->orig_va = NULL;
+ }
+
+	/* we do not know up front which page size/socket combinations have
+	 * pages, so loop over all of them
+ */
+ for (hpi_idx = 0; hpi_idx < internal_config.num_hugepage_sizes;
+ hpi_idx++) {
+ uint64_t page_sz =
+ internal_config.hugepage_info[hpi_idx].hugepage_sz;
+
+ for (i = 0; i < rte_socket_count(); i++) {
+ struct rte_memseg_list *msl;
+
+ socket = rte_socket_id_by_idx(i);
+ n_contig_segs = 0;
+ n_segs = 0;
+ seg_start_page = -1;
+
+ for (cur_page = 0; cur_page < n_pages; cur_page++) {
+ struct hugepage_file *prev, *cur;
+ int prev_seg_start_page = -1;
+
+ cur = &hugepages[cur_page];
+ prev = cur_page == 0 ? NULL :
+ &hugepages[cur_page - 1];
+
+ new_memseg = 0;
+ end_seg = 0;
+
+ if (cur->size == 0)
+ end_seg = 1;
+ else if (cur->socket_id != (int) socket)
+ end_seg = 1;
+ else if (cur->size != page_sz)
+ end_seg = 1;
+ else if (cur_page == 0)
+ new_memseg = 1;
+#ifdef RTE_ARCH_PPC_64
+				/* On PPC64 architecture, the mmap always starts
+ * from higher address to lower address. Here,
+ * physical addresses are in descending order.
+ */
+ else if ((prev->physaddr - cur->physaddr) !=
+ cur->size)
+ new_memseg = 1;
+#else
+ else if ((cur->physaddr - prev->physaddr) !=
+ cur->size)
+ new_memseg = 1;
+#endif
+ if (new_memseg) {
+ /* if we're already inside a segment,
+ * new segment means end of current one
+ */
+ if (seg_start_page != -1) {
+ end_seg = 1;
+ prev_seg_start_page =
+ seg_start_page;
+ }
+ seg_start_page = cur_page;
+ }
+
+ if (end_seg) {
+ if (prev_seg_start_page != -1) {
+ /* we've found a new segment */
+ n_contig_segs++;
+ n_segs += cur_page -
+ prev_seg_start_page;
+ } else if (seg_start_page != -1) {
+ /* we didn't find new segment,
+ * but did end current one
+ */
+ n_contig_segs++;
+ n_segs += cur_page -
+ seg_start_page;
+ seg_start_page = -1;
+ continue;
+ } else {
+ /* we're skipping this page */
+ continue;
+ }
+ }
+ /* segment continues */
+ }
+ /* check if we missed last segment */
+ if (seg_start_page != -1) {
+ n_contig_segs++;
+ n_segs += cur_page - seg_start_page;
+ }
+
+ /* if no segments were found, do not preallocate */
+ if (n_segs == 0)
+ continue;
+
+ /* we now have total number of pages that we will
+ * allocate for this segment list. add separator pages
+ * to the total count, and preallocate VA space.
+ */
+ n_segs += n_contig_segs - 1;
+
+ /* now, preallocate VA space for these segments */
+
+ /* first, find suitable memseg list for this */
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS;
+ msl_idx++) {
+ msl = &mcfg->memsegs[msl_idx];
+
+ if (msl->base_va != NULL)
+ continue;
+ break;
+ }
+ if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL, "Not enough space in memseg lists, please increase %s\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+ return -1;
+ }
+
+ /* now, allocate fbarray itself */
+ if (alloc_memseg_list(msl, page_sz, n_segs, socket,
+ msl_idx) < 0)
+ return -1;
+
+ /* finally, allocate VA space */
+ if (alloc_va_space(msl) < 0)
+ return -1;
+ }
+ }
+ return 0;
+}
+
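A worked example of the counting above (addresses made up): if a socket has
four 2M pages forming two physically contiguous runs of two pages each, the
loop ends with n_contig_segs == 2 and n_segs == 4; after
n_segs += n_contig_segs - 1 we reserve VA space for 5 page slots, i.e. the four
pages plus one separator hole between the runs, so that rte_malloc cannot merge
the two runs into one element.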
+/*
+ * We cannot reallocate memseg lists on the fly because PPC64 stores pages
+ * backwards, therefore we have to process the entire memseg first before
+ * remapping it into memseg list VA space.
+ */
+static int
+remap_needed_hugepages(struct hugepage_file *hugepages, int n_pages)
+{
+ int cur_page, seg_start_page, new_memseg, ret;
+
+ seg_start_page = 0;
+ for (cur_page = 0; cur_page < n_pages; cur_page++) {
+ struct hugepage_file *prev, *cur;
+
+ new_memseg = 0;
+
+ cur = &hugepages[cur_page];
+ prev = cur_page == 0 ? NULL : &hugepages[cur_page - 1];
+
+ /* if size is zero, no more pages left */
+ if (cur->size == 0)
+ break;
+
+ if (cur_page == 0)
+ new_memseg = 1;
+ else if (cur->socket_id != prev->socket_id)
+ new_memseg = 1;
+ else if (cur->size != prev->size)
+ new_memseg = 1;
+#ifdef RTE_ARCH_PPC_64
+	/* On PPC64 architecture, the mmap always starts from higher
+ * address to lower address. Here, physical addresses are in
+ * descending order.
+ */
+ else if ((prev->physaddr - cur->physaddr) != cur->size)
+ new_memseg = 1;
+#else
+ else if ((cur->physaddr - prev->physaddr) != cur->size)
+ new_memseg = 1;
+#endif
+
+ if (new_memseg) {
+ /* if this isn't the first time, remap segment */
+ if (cur_page != 0) {
+ ret = remap_segment(hugepages, seg_start_page,
+ cur_page);
+ if (ret != 0)
+ return -1;
+ }
+ /* remember where we started */
+ seg_start_page = cur_page;
+ }
+ /* continuation of previous memseg */
+ }
+ /* we were stopped, but we didn't remap the last segment, do it now */
+ if (cur_page != 0) {
+ ret = remap_segment(hugepages, seg_start_page,
+ cur_page);
+ if (ret != 0)
+ return -1;
+ }
+ return 0;
+}
+
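For illustration: with 2M pages on one socket, physical addresses 0x40000000,
0x40200000, 0x80000000 and 0x80200000 in the sorted table make the check above
start a new memseg at page 0 and again at page 2, so remap_segment() gets
called for pages [0, 2) and then [2, 4), i.e. one call per physically
contiguous run.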
static inline uint64_t
get_socket_mem_size(int socket)
{
@@ -753,8 +1089,10 @@ calc_num_pages_per_socket(uint64_t * memory,

/* if specific memory amounts per socket weren't requested */
if (internal_config.force_sockets == 0) {
+ size_t total_size;
+#ifdef RTE_ARCH_64
int cpu_per_socket[RTE_MAX_NUMA_NODES];
- size_t default_size, total_size;
+ size_t default_size;
unsigned lcore_id;

/* Compute number of cores per socket */
@@ -772,7 +1110,7 @@ calc_num_pages_per_socket(uint64_t * memory,

/* Set memory amount per socket */
default_size = (internal_config.memory * cpu_per_socket[socket])
- / rte_lcore_count();
+ / rte_lcore_count();

/* Limit to maximum available memory on socket */
default_size = RTE_MIN(default_size, get_socket_mem_size(socket));
@@ -789,12 +1127,33 @@ calc_num_pages_per_socket(uint64_t * memory,
for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) {
/* take whatever is available */
default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket],
- total_size);
+ total_size);

/* Update sizes */
memory[socket] += default_size;
total_size -= default_size;
}
+#else
+ /* in 32-bit mode, allocate all of the memory only on master
+ * lcore socket
+ */
+ total_size = internal_config.memory;
+ for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
+ socket++) {
+ struct rte_config *cfg = rte_eal_get_configuration();
+ unsigned int master_lcore_socket;
+
+ master_lcore_socket =
+ rte_lcore_to_socket_id(cfg->master_lcore);
+
+ if (master_lcore_socket != socket)
+ continue;
+
+ /* Update sizes */
+ memory[socket] = total_size;
+ break;
+ }
+#endif
}

for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
@@ -842,7 +1201,8 @@ calc_num_pages_per_socket(uint64_t * memory,
}
}
/* if we didn't satisfy all memory requirements per socket */
- if (memory[socket] > 0) {
+ if (memory[socket] > 0 &&
+ internal_config.socket_mem[socket] != 0) {
/* to prevent icc errors */
requested = (unsigned) (internal_config.socket_mem[socket] /
0x100000);
@@ -928,11 +1288,13 @@ eal_legacy_hugepage_init(void)
struct rte_mem_config *mcfg;
struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
+ struct rte_fbarray *arr;
+ struct rte_memseg *ms;

uint64_t memory[RTE_MAX_NUMA_NODES];

unsigned hp_offset;
- int i, j, new_memseg;
+ int i, j;
int nr_hugefiles, nr_hugepages = 0;
void *addr;

@@ -945,6 +1307,25 @@ eal_legacy_hugepage_init(void)

/* hugetlbfs can be disabled */
if (internal_config.no_hugetlbfs) {
+ struct rte_memseg_list *msl;
+ uint64_t page_sz;
+ int n_segs, cur_seg;
+
+ /* nohuge mode is legacy mode */
+ internal_config.legacy_mem = 1;
+
+ /* create a memseg list */
+ msl = &mcfg->memsegs[0];
+
+ page_sz = RTE_PGSIZE_4K;
+ n_segs = internal_config.memory / page_sz;
+
+ if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs,
+ sizeof(struct rte_memseg))) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n");
+ return -1;
+ }
+
addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
if (addr == MAP_FAILED) {
@@ -952,14 +1333,27 @@ eal_legacy_hugepage_init(void)
strerror(errno));
return -1;
}
- if (rte_eal_iova_mode() == RTE_IOVA_VA)
- mcfg->memseg[0].iova = (uintptr_t)addr;
- else
- mcfg->memseg[0].iova = RTE_BAD_IOVA;
- mcfg->memseg[0].addr = addr;
- mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
- mcfg->memseg[0].len = internal_config.memory;
- mcfg->memseg[0].socket_id = 0;
+ msl->base_va = addr;
+ msl->page_sz = page_sz;
+ msl->socket_id = 0;
+
+ /* populate memsegs. each memseg is one page long */
+ for (cur_seg = 0; cur_seg < n_segs; cur_seg++) {
+			arr = &msl->memseg_arr;
+
+ ms = rte_fbarray_get(arr, cur_seg);
+ if (rte_eal_iova_mode() == RTE_IOVA_VA)
+ ms->iova = (uintptr_t)addr;
+ else
+ ms->iova = RTE_BAD_IOVA;
+ ms->addr = addr;
+ ms->hugepage_sz = page_sz;
+			ms->socket_id = 0;
+			ms->len = (size_t)page_sz;
+
+ rte_fbarray_set_used(arr, cur_seg);
+
+ addr = RTE_PTR_ADD(addr, (size_t)page_sz);
+ }
return 0;
}

@@ -992,7 +1386,6 @@ eal_legacy_hugepage_init(void)
for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
memory[i] = internal_config.socket_mem[i];

-
/* map all hugepages and sort them */
for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
unsigned pages_old, pages_new;
@@ -1010,8 +1403,7 @@ eal_legacy_hugepage_init(void)

/* map all hugepages available */
pages_old = hpi->num_pages[0];
- pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi,
- memory, 1);
+ pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory);
if (pages_new < pages_old) {
RTE_LOG(DEBUG, EAL,
"%d not %d hugepages of size %u MB allocated\n",
@@ -1054,18 +1446,6 @@ eal_legacy_hugepage_init(void)
qsort(&tmp_hp[hp_offset], hpi->num_pages[0],
sizeof(struct hugepage_file), cmp_physaddr);

- /* remap all hugepages */
- if (map_all_hugepages(&tmp_hp[hp_offset], hpi, NULL, 0) !=
- hpi->num_pages[0]) {
- RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n",
- (unsigned)(hpi->hugepage_sz / 0x100000));
- goto fail;
- }
-
- /* unmap original mappings */
- if (unmap_all_hugepages_orig(&tmp_hp[hp_offset], hpi) < 0)
- goto fail;
-
/* we have processed a num of hugepages of this size, so inc offset */
hp_offset += hpi->num_pages[0];
}
@@ -1148,7 +1528,7 @@ eal_legacy_hugepage_init(void)

/*
* copy stuff from malloc'd hugepage* to the actual shared memory.
- * this procedure only copies those hugepages that have final_va
+ * this procedure only copies those hugepages that have orig_va
* not NULL. has overflow protection.
*/
if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles,
@@ -1157,6 +1537,23 @@ eal_legacy_hugepage_init(void)
goto fail;
}

+#ifndef RTE_ARCH_64
+ /* for legacy 32-bit mode, we did not preallocate VA space, so do it */
+ if (internal_config.legacy_mem &&
+ prealloc_segments(hugepage, nr_hugefiles)) {
+ RTE_LOG(ERR, EAL, "Could not preallocate VA space for hugepages\n");
+ goto fail;
+ }
+#endif
+
+ /* remap all pages we do need into memseg list VA space, so that those
+ * pages become first-class citizens in DPDK memory subsystem
+ */
+ if (remap_needed_hugepages(hugepage, nr_hugefiles)) {
+ RTE_LOG(ERR, EAL, "Couldn't remap hugepage files into memseg lists\n");
+ goto fail;
+ }
+
/* free the hugepage backing files */
if (internal_config.hugepage_unlink &&
unlink_hugepage_files(tmp_hp, internal_config.num_hugepage_sizes) < 0) {
@@ -1168,75 +1565,30 @@ eal_legacy_hugepage_init(void)
free(tmp_hp);
tmp_hp = NULL;

- /* first memseg index shall be 0 after incrementing it below */
- j = -1;
- for (i = 0; i < nr_hugefiles; i++) {
- new_memseg = 0;
-
- /* if this is a new section, create a new memseg */
- if (i == 0)
- new_memseg = 1;
- else if (hugepage[i].socket_id != hugepage[i-1].socket_id)
- new_memseg = 1;
- else if (hugepage[i].size != hugepage[i-1].size)
- new_memseg = 1;
-
-#ifdef RTE_ARCH_PPC_64
- /* On PPC64 architecture, the mmap always start from higher
- * virtual address to lower address. Here, both the physical
- * address and virtual address are in descending order */
- else if ((hugepage[i-1].physaddr - hugepage[i].physaddr) !=
- hugepage[i].size)
- new_memseg = 1;
- else if (((unsigned long)hugepage[i-1].final_va -
- (unsigned long)hugepage[i].final_va) != hugepage[i].size)
- new_memseg = 1;
-#else
- else if ((hugepage[i].physaddr - hugepage[i-1].physaddr) !=
- hugepage[i].size)
- new_memseg = 1;
- else if (((unsigned long)hugepage[i].final_va -
- (unsigned long)hugepage[i-1].final_va) != hugepage[i].size)
- new_memseg = 1;
-#endif
+ munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));

- if (new_memseg) {
- j += 1;
- if (j == RTE_MAX_MEMSEG)
- break;
+ /* we're not going to allocate more pages, so release VA space for
+ * unused memseg lists
+ */
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ size_t mem_sz;

- mcfg->memseg[j].iova = hugepage[i].physaddr;
- mcfg->memseg[j].addr = hugepage[i].final_va;
- mcfg->memseg[j].len = hugepage[i].size;
- mcfg->memseg[j].socket_id = hugepage[i].socket_id;
- mcfg->memseg[j].hugepage_sz = hugepage[i].size;
- }
- /* continuation of previous memseg */
- else {
-#ifdef RTE_ARCH_PPC_64
- /* Use the phy and virt address of the last page as segment
- * address for IBM Power architecture */
- mcfg->memseg[j].iova = hugepage[i].physaddr;
- mcfg->memseg[j].addr = hugepage[i].final_va;
-#endif
- mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz;
- }
- hugepage[i].memseg_id = j;
- }
+ /* skip inactive lists */
+ if (msl->base_va == NULL)
+ continue;
+ /* skip lists where there is at least one page allocated */
+ if (msl->memseg_arr.count > 0)
+ continue;
+ /* this is an unused list, deallocate it */
+ mem_sz = (size_t)msl->page_sz * msl->memseg_arr.len;
+ munmap(msl->base_va, mem_sz);
+ msl->base_va = NULL;

- if (i < nr_hugefiles) {
- RTE_LOG(ERR, EAL, "Can only reserve %d pages "
- "from %d requested\n"
- "Current %s=%d is not enough\n"
- "Please either increase it or request less amount "
- "of memory.\n",
- i, nr_hugefiles, RTE_STR(CONFIG_RTE_MAX_MEMSEG),
- RTE_MAX_MEMSEG);
- goto fail;
+ /* destroy backing fbarray */
+ rte_fbarray_destroy(&msl->memseg_arr);
}

- munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
-
return 0;

fail:
@@ -1269,11 +1621,10 @@ getFileSize(int fd)
static int
eal_legacy_hugepage_attach(void)
{
- const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct hugepage_file *hp = NULL;
- unsigned num_hp = 0;
- unsigned i, s = 0; /* s used to track the segment number */
- unsigned max_seg = RTE_MAX_MEMSEG;
+ unsigned int num_hp = 0;
+ unsigned int i = 0;
+ unsigned int cur_seg;
off_t size = 0;
int fd, fd_hugepage = -1;

@@ -1292,50 +1643,6 @@ eal_legacy_hugepage_attach(void)
goto error;
}

- /* map all segments into memory to make sure we get the addrs */
- for (s = 0; s < RTE_MAX_MEMSEG; ++s) {
- void *base_addr;
- size_t mmap_sz;
- int mmap_flags = 0;
-
- /*
- * the first memory segment with len==0 is the one that
- * follows the last valid segment.
- */
- if (mcfg->memseg[s].len == 0)
- break;
-
- /* get identical addresses as the primary process.
- */
-#ifdef RTE_ARCH_PPC_64
- mmap_flags |= MAP_HUGETLB;
-#endif
- mmap_sz = mcfg->memseg[s].len;
- base_addr = eal_get_virtual_area(mcfg->memseg[s].addr,
- &mmap_sz, mcfg->memseg[s].hugepage_sz, 0,
- mmap_flags);
- if (base_addr == NULL) {
- max_seg = s;
- if (rte_errno == EADDRNOTAVAIL) {
- RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n",
- (unsigned long long)mcfg->memseg[s].len,
- mcfg->memseg[s].addr);
- } else {
- RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p]: '%s'\n",
- (unsigned long long)mcfg->memseg[s].len,
- mcfg->memseg[s].addr,
- rte_strerror(rte_errno));
- }
- if (aslr_enabled() > 0) {
- RTE_LOG(ERR, EAL, "It is recommended to "
- "disable ASLR in the kernel "
- "and retry running both primary "
- "and secondary processes\n");
- }
- goto error;
- }
- }
-
size = getFileSize(fd_hugepage);
hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0);
if (hp == MAP_FAILED) {
@@ -1346,46 +1653,49 @@ eal_legacy_hugepage_attach(void)
num_hp = size / sizeof(struct hugepage_file);
RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp);

- s = 0;
- while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0){
- void *addr, *base_addr;
- uintptr_t offset = 0;
- size_t mapping_size;
- /*
- * free previously mapped memory so we can map the
- * hugepages into the space
- */
- base_addr = mcfg->memseg[s].addr;
- munmap(base_addr, mcfg->memseg[s].len);
-
- /* find the hugepages for this segment and map them
- * we don't need to worry about order, as the server sorted the
- * entries before it did the second mmap of them */
- for (i = 0; i < num_hp && offset < mcfg->memseg[s].len; i++){
- if (hp[i].memseg_id == (int)s){
- fd = open(hp[i].filepath, O_RDWR);
- if (fd < 0) {
- RTE_LOG(ERR, EAL, "Could not open %s\n",
- hp[i].filepath);
- goto error;
- }
- mapping_size = hp[i].size;
- addr = mmap(RTE_PTR_ADD(base_addr, offset),
- mapping_size, PROT_READ | PROT_WRITE,
- MAP_SHARED, fd, 0);
- close(fd); /* close file both on success and on failure */
- if (addr == MAP_FAILED ||
- addr != RTE_PTR_ADD(base_addr, offset)) {
- RTE_LOG(ERR, EAL, "Could not mmap %s\n",
- hp[i].filepath);
- goto error;
- }
- offset+=mapping_size;
- }
+ /* map all segments into memory to make sure we get the addrs. the
+ * segments themselves are already in memseg list (which is shared and
+ * has its VA space already preallocated), so we just need to map
+ * everything into correct addresses.
+ */
+ for (i = 0; i < num_hp; i++) {
+ struct hugepage_file *hf = &hp[i];
+ size_t map_sz = hf->size;
+ void *map_addr = hf->final_va;
+ struct flock lck;
+
+ /* if size is zero, no more pages left */
+ if (map_sz == 0)
+ break;
+
+ fd = open(hf->filepath, O_RDWR);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "Could not open %s: %s\n",
+ hf->filepath, strerror(errno));
+ goto error;
}
- RTE_LOG(DEBUG, EAL, "Mapped segment %u of size 0x%llx\n", s,
- (unsigned long long)mcfg->memseg[s].len);
- s++;
+
+ map_addr = mmap(map_addr, map_sz, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_FIXED, fd, 0);
+ if (map_addr == MAP_FAILED) {
+ RTE_LOG(ERR, EAL, "Could not map %s: %s\n",
+ hf->filepath, strerror(errno));
+ goto error;
+ }
+
+ /* set shared lock on the file. */
+ lck.l_type = F_RDLCK;
+ lck.l_whence = SEEK_SET;
+ lck.l_start = 0;
+ lck.l_len = map_sz;
+ if (fcntl(fd, F_SETLK, &lck) == -1) {
+ RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n",
+ __func__, strerror(errno));
+ close(fd);
+ goto error;
+ }
+
+ close(fd);
}
/* unmap the hugepage config file, since we are done using it */
munmap(hp, size);
@@ -1393,8 +1703,15 @@ eal_legacy_hugepage_attach(void)
return 0;

error:
- for (i = 0; i < max_seg && mcfg->memseg[i].len > 0; i++)
- munmap(mcfg->memseg[i].addr, mcfg->memseg[i].len);
+ /* unmap any segments we mapped before the failure */
+ cur_seg = 0;
+ for (cur_seg = 0; cur_seg < i; cur_seg++) {
+ struct hugepage_file *hf = &hp[cur_seg];
+ size_t map_sz = hf->size;
+ void *map_addr = hf->final_va;
+
+ munmap(map_addr, map_sz);
+ }
if (hp != NULL && hp != MAP_FAILED)
munmap(hp, size);
if (fd_hugepage >= 0)
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index f6fe93e..2c27063 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -686,7 +686,8 @@ vfio_get_group_no(const char *sysfs_base,
}

static int
-type1_map(const struct rte_memseg *ms, void *arg)
+type1_map(const struct rte_memseg_list *msl __rte_unused,
+ const struct rte_memseg *ms, void *arg)
{
int *vfio_container_fd = arg;

@@ -799,7 +800,8 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
}

static int
-vfio_spapr_map_walk(const struct rte_memseg *ms, void *arg)
+vfio_spapr_map_walk(const struct rte_memseg_list *msl __rte_unused,
+ const struct rte_memseg *ms, void *arg)
{
int *vfio_container_fd = arg;

@@ -812,7 +814,8 @@ struct spapr_walk_param {
uint64_t hugepage_sz;
};
static int
-vfio_spapr_window_size_walk(const struct rte_memseg *ms, void *arg)
+vfio_spapr_window_size_walk(const struct rte_memseg_list *msl __rte_unused,
+ const struct rte_memseg *ms, void *arg)
{
struct spapr_walk_param *param = arg;
uint64_t max = ms->iova + ms->len;
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 3a12112..df5802d 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -25,7 +25,6 @@ DPDK_2.0 {
rte_eal_devargs_type_count;
rte_eal_get_configuration;
rte_eal_get_lcore_state;
- rte_eal_get_physmem_layout;
rte_eal_get_physmem_size;
rte_eal_has_hugepages;
rte_eal_hpet_init;
@@ -241,7 +240,9 @@ EXPERIMENTAL {
rte_malloc_dump_heaps;
rte_mem_iova2virt;
rte_mem_virt2memseg;
+ rte_mem_virt2memseg_list;
rte_memseg_contig_walk;
+ rte_memseg_list_walk;
rte_memseg_walk;
rte_memzone_reserve_contig;
rte_memzone_reserve_aligned_contig;
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index bb33c3a..38fb1ba 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -100,12 +100,12 @@ static unsigned optimize_object_size(unsigned obj_size)
}

static int
-find_min_pagesz(const struct rte_memseg *ms, void *arg)
+find_min_pagesz(const struct rte_memseg_list *msl, void *arg)
{
size_t *min = arg;

- if (ms->hugepage_sz < *min)
- *min = ms->hugepage_sz;
+ if (msl->page_sz < *min)
+ *min = msl->page_sz;

return 0;
}
@@ -115,11 +115,12 @@ get_min_page_size(void)
{
size_t min_pagesz = SIZE_MAX;

- rte_memseg_walk(find_min_pagesz, &min_pagesz);
+ rte_memseg_list_walk(find_min_pagesz, &min_pagesz);

return min_pagesz == SIZE_MAX ? (size_t) getpagesize() : min_pagesz;
}

+
static void
mempool_add_elem(struct rte_mempool *mp, void *obj, rte_iova_t iova)
{
diff --git a/test/test/test_malloc.c b/test/test/test_malloc.c
index 578ad04..805bf04 100644
--- a/test/test/test_malloc.c
+++ b/test/test/test_malloc.c
@@ -12,6 +12,7 @@

#include <rte_common.h>
#include <rte_memory.h>
+#include <rte_eal_memconfig.h>
#include <rte_per_lcore.h>
#include <rte_launch.h>
#include <rte_eal.h>
@@ -706,36 +707,20 @@ test_malloc_bad_params(void)
}

static int
-check_socket_mem(const struct rte_memseg *ms, void *arg)
+check_socket_mem(const struct rte_memseg_list *msl, void *arg)
{
int32_t *socket = arg;

- return *socket == ms->socket_id;
+ return *socket == msl->socket_id;
}

/* Check if memory is available on a specific socket */
static int
is_mem_on_socket(int32_t socket)
{
- return rte_memseg_walk(check_socket_mem, &socket);
+ return rte_memseg_list_walk(check_socket_mem, &socket);
}

-struct walk_param {
- void *addr;
- int32_t socket;
-};
-static int
-find_socket(const struct rte_memseg *ms, void *arg)
-{
- struct walk_param *param = arg;
-
- if (param->addr >= ms->addr &&
- param->addr < RTE_PTR_ADD(ms->addr, ms->len)) {
- param->socket = ms->socket_id;
- return 1;
- }
- return 0;
-}

/*
* Find what socket a memory address is on. Only works for addresses within
@@ -744,10 +729,9 @@ find_socket(const struct rte_memseg *ms, void *arg)
static int32_t
addr_to_socket(void * addr)
{
- struct walk_param param = {.addr = addr, .socket = 0};
- if (rte_memseg_walk(find_socket, &param) > 0)
- return param.socket;
- return -1;
+ const struct rte_memseg *ms = rte_mem_virt2memseg(addr, NULL);
+ return ms == NULL ? -1 : ms->socket_id;
+
}

/* Test using rte_[c|m|zm]alloc_socket() on a specific socket */
diff --git a/test/test/test_memory.c b/test/test/test_memory.c
index c9b287c..b96bca7 100644
--- a/test/test/test_memory.c
+++ b/test/test/test_memory.c
@@ -5,8 +5,11 @@
#include <stdio.h>
#include <stdint.h>

+#include <rte_eal.h>
+#include <rte_eal_memconfig.h>
#include <rte_memory.h>
#include <rte_common.h>
+#include <rte_memzone.h>

#include "test.h"

@@ -23,12 +26,13 @@
*/

static int
-check_mem(const struct rte_memseg *ms, void *arg __rte_unused)
+check_mem(const struct rte_memseg_list *msl __rte_unused,
+ const struct rte_memseg *ms, void *arg __rte_unused)
{
volatile uint8_t *mem = (volatile uint8_t *) ms->addr;
- size_t i;
+ size_t i, max = ms->len;

- for (i = 0; i < ms->len; i++, mem++)
+ for (i = 0; i < max; i++, mem++)
*mem;
return 0;
}
diff --git a/test/test/test_memzone.c b/test/test/test_memzone.c
index cbf0cfa..0046f04 100644
--- a/test/test/test_memzone.c
+++ b/test/test/test_memzone.c
@@ -111,17 +111,17 @@ struct walk_arg {
int hugepage_16GB_avail;
};
static int
-find_available_pagesz(const struct rte_memseg *ms, void *arg)
+find_available_pagesz(const struct rte_memseg_list *msl, void *arg)
{
struct walk_arg *wa = arg;

- if (ms->hugepage_sz == RTE_PGSIZE_2M)
+ if (msl->page_sz == RTE_PGSIZE_2M)
wa->hugepage_2MB_avail = 1;
- if (ms->hugepage_sz == RTE_PGSIZE_1G)
+ if (msl->page_sz == RTE_PGSIZE_1G)
wa->hugepage_1GB_avail = 1;
- if (ms->hugepage_sz == RTE_PGSIZE_16M)
+ if (msl->page_sz == RTE_PGSIZE_16M)
wa->hugepage_16MB_avail = 1;
- if (ms->hugepage_sz == RTE_PGSIZE_16G)
+ if (msl->page_sz == RTE_PGSIZE_16G)
wa->hugepage_16GB_avail = 1;

return 0;
@@ -138,7 +138,7 @@ test_memzone_reserve_flags(void)

memset(&wa, 0, sizeof(wa));

- rte_memseg_walk(find_available_pagesz, &wa);
+ rte_memseg_list_walk(find_available_pagesz, &wa);

hugepage_2MB_avail = wa.hugepage_2MB_avail;
hugepage_1GB_avail = wa.hugepage_1GB_avail;
--
2.7.4
Anatoly Burakov
2018-04-03 23:22:17 UTC
Permalink
Now that every other piece of the puzzle is in place, enable non-legacy
init mode.

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/linuxapp/eal/eal.c | 2 --
1 file changed, 2 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index e7c6dcf..99c2242 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -772,8 +772,6 @@ rte_eal_init(int argc, char **argv)
rte_atomic32_clear(&run_once);
return -1;
}
- /* for now, always set legacy mem */
- internal_config.legacy_mem = 1;

if (eal_plugins_init() < 0) {
rte_eal_init_alert("Cannot init plugins\n");
--
2.7.4
Anatoly Burakov
2018-04-03 23:22:20 UTC
Permalink
It is reasonable to expect that a DPDK process will not deallocate
any pages that were preallocated via the "-m" or "--socket-mem"
flags - yet, currently, the DPDK memory subsystem will do exactly
that once it finds that the pages are unused.

Fix this by marking such pages as unfreeable, and preventing malloc
from ever trying to free them.
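
For illustration, here is a minimal sketch (not part of this patch) that
counts pages carrying the new RTE_MEMSEG_FLAG_DO_NOT_FREE flag, using the
rte_memseg_walk() callback signature introduced earlier in this series.
The function names are made up for the example.

#include <rte_common.h>
#include <rte_memory.h>

/* illustrative walk callback: count memsegs marked as unfreeable */
static int
count_unfreeable(const struct rte_memseg_list *msl __rte_unused,
                const struct rte_memseg *ms, void *arg)
{
        unsigned int *cnt = arg;

        if (ms->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE)
                (*cnt)++;
        return 0; /* keep walking */
}

static unsigned int
n_unfreeable_segs(void)
{
        unsigned int cnt = 0;

        rte_memseg_walk(count_unfreeable, &cnt);
        return cnt;
}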

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/include/rte_memory.h | 3 +++
lib/librte_eal/common/malloc_heap.c | 23 +++++++++++++++++++++++
lib/librte_eal/linuxapp/eal/eal_memalloc.c | 7 +++++++
lib/librte_eal/linuxapp/eal/eal_memory.c | 18 +++++++++++++++---
4 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index 0318bfe..58b607b 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -83,6 +83,8 @@ typedef uint64_t rte_iova_t;
/**
* Physical memory segment descriptor.
*/
+#define RTE_MEMSEG_FLAG_DO_NOT_FREE (1 << 0)
+/**< Prevent this segment from being freed back to the OS. */
struct rte_memseg {
RTE_STD_C11
union {
@@ -99,6 +101,7 @@ struct rte_memseg {
int32_t socket_id; /**< NUMA socket ID. */
uint32_t nchannel; /**< Number of channels. */
uint32_t nrank; /**< Number of ranks. */
+ uint32_t flags; /**< Memseg-specific flags */
} __rte_packed;

/**
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index f8daf84..41c14a8 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -606,6 +606,7 @@ malloc_heap_free(struct malloc_elem *elem)
void *start, *aligned_start, *end, *aligned_end;
size_t len, aligned_len, page_sz;
struct rte_memseg_list *msl;
+ unsigned int i, n_segs;
int ret;

if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
@@ -647,6 +648,28 @@ malloc_heap_free(struct malloc_elem *elem)
if (aligned_len < page_sz)
goto free_unlock;

+ /* we can free something. however, some of these pages may be marked as
+ * unfreeable, so also check that as well
+ */
+ n_segs = aligned_len / page_sz;
+ for (i = 0; i < n_segs; i++) {
+ const struct rte_memseg *tmp =
+ rte_mem_virt2memseg(aligned_start, msl);
+
+ if (tmp->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
+ /* this is an unfreeable segment, so move start */
+ aligned_start = RTE_PTR_ADD(tmp->addr, tmp->len);
+ }
+ }
+
+ /* recalculate length and number of segments */
+ aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start);
+ n_segs = aligned_len / page_sz;
+
+ /* check if we can still free some pages */
+ if (n_segs == 0)
+ goto free_unlock;
+
rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);

/*
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index 87c1bdb..0da2e3c 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -806,6 +806,13 @@ eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
struct free_walk_param wa;
int i, walk_res;

+ /* if this page is marked as unfreeable, fail */
+ if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
+ RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n");
+ ret = -1;
+ continue;
+ }
+
memset(&wa, 0, sizeof(wa));

for (i = 0; i < (int)RTE_DIM(internal_config.hugepage_info);
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 7ec7129..2bd9c30 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -1637,21 +1637,33 @@ eal_hugepage_init(void)
hp_sz_idx++) {
for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES;
socket_id++) {
+ struct rte_memseg **pages;
struct hugepage_info *hpi = &used_hp[hp_sz_idx];
unsigned int num_pages = hpi->num_pages[socket_id];
- int num_pages_alloc;
+ int num_pages_alloc, i;

if (num_pages == 0)
continue;

+ pages = malloc(sizeof(*pages) * num_pages);
+
RTE_LOG(DEBUG, EAL, "Allocating %u pages of size %" PRIu64 "M on socket %i\n",
num_pages, hpi->hugepage_sz >> 20, socket_id);

- num_pages_alloc = eal_memalloc_alloc_seg_bulk(NULL,
+ num_pages_alloc = eal_memalloc_alloc_seg_bulk(pages,
num_pages, hpi->hugepage_sz,
socket_id, true);
- if (num_pages_alloc < 0)
+ if (num_pages_alloc < 0) {
+ free(pages);
return -1;
+ }
+
+ /* mark preallocated pages as unfreeable */
+ for (i = 0; i < num_pages_alloc; i++) {
+ struct rte_memseg *ms = pages[i];
+ ms->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
+ }
+ free(pages);
}
}
return 0;
--
2.7.4
Anatoly Burakov
2018-04-03 23:22:05 UTC
Permalink
Currently, DPDK stores all pages as separate files in hugetlbfs.
This option will allow storing all pages in one file (one file
per memseg list).

We do this by using fallocate() calls on hugetlbfs; however, this is
only supported on fairly recent (4.3+) kernels, so an ftruncate()
fallback is provided to grow (but not shrink) hugepage files.
The naming scheme is deterministic, so both primary and secondary
processes will be able to easily map the needed files and offsets.

For multi-file segments, we can close fd's right away. For
single-file segments, we can reuse the same fd and reduce the
amount of fd's needed to map/use hugepages. However, we need to
store the fd's somewhere, so we add a tailq.
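
To illustrate the grow/shrink logic described above, here is a simplified
standalone sketch - not the patch's actual resize_hugefile() code, and the
helper name is hypothetical - that grows a file region with fallocate()
and falls back to ftruncate() when fallocate() is unsupported:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>              /* fallocate() */
#include <linux/falloc.h>       /* FALLOC_FL_* */
#include <stdbool.h>
#include <stdint.h>
#include <sys/stat.h>
#include <unistd.h>             /* ftruncate() */

/* hypothetical helper: grow a file region, or punch a hole in it */
static int
resize_file(int fd, uint64_t offset, uint64_t len, bool grow)
{
        struct stat st;
        int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;

        if (fallocate(fd, flags, offset, len) == 0)
                return 0;
        if (errno != ENOTSUP)
                return -1; /* genuine failure */
        if (!grow)
                return -1; /* cannot shrink without fallocate() support */

        /* fall back to ftruncate(): only ever extend the file */
        if (fstat(fd, &st) < 0)
                return -1;
        if ((off_t)(offset + len) > st.st_size &&
                        ftruncate(fd, offset + len) < 0)
                return -1;
        return 0;
}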

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v3:
- Split this change into a separate patch
- Provide more explanation as to how it works

lib/librte_eal/common/eal_common_options.c | 4 +
lib/librte_eal/common/eal_internal_cfg.h | 4 +
lib/librte_eal/common/eal_options.h | 2 +
lib/librte_eal/linuxapp/eal/eal.c | 1 +
lib/librte_eal/linuxapp/eal/eal_memalloc.c | 337 ++++++++++++++++++++++++-----
5 files changed, 297 insertions(+), 51 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index fb5ea03..5b5da5f 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -74,6 +74,7 @@ eal_long_options[] = {
{OPT_VFIO_INTR, 1, NULL, OPT_VFIO_INTR_NUM },
{OPT_VMWARE_TSC_MAP, 0, NULL, OPT_VMWARE_TSC_MAP_NUM },
{OPT_LEGACY_MEM, 0, NULL, OPT_LEGACY_MEM_NUM },
+ {OPT_SINGLE_FILE_SEGMENTS, 0, NULL, OPT_SINGLE_FILE_SEGMENTS_NUM},
{0, 0, NULL, 0 }
};

@@ -1188,6 +1189,9 @@ eal_parse_common_option(int opt, const char *optarg,
case OPT_LEGACY_MEM_NUM:
conf->legacy_mem = 1;
break;
+ case OPT_SINGLE_FILE_SEGMENTS_NUM:
+ conf->single_file_segments = 1;
+ break;

/* don't know what to do, leave this to caller */
default:
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 5cf7102..9d33cf4 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -51,6 +51,10 @@ struct internal_config {
/**< true to enable legacy memory behavior (no dynamic allocation,
* IOVA-contiguous segments).
*/
+ volatile unsigned single_file_segments;
+ /**< true if storing all pages within single files (per-page-size,
+ * per-node); non-legacy mode only.
+ */
volatile int syslog_facility; /**< facility passed to openlog() */
/** default interrupt mode for VFIO */
volatile enum rte_intr_mode vfio_intr_mode;
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index d301d0b..211ae06 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -57,6 +57,8 @@ enum {
OPT_VMWARE_TSC_MAP_NUM,
#define OPT_LEGACY_MEM "legacy-mem"
OPT_LEGACY_MEM_NUM,
+#define OPT_SINGLE_FILE_SEGMENTS "single-file-segments"
+ OPT_SINGLE_FILE_SEGMENTS_NUM,
OPT_LONG_MAX_NUM
};

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 9832551..2c12811 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -349,6 +349,7 @@ eal_usage(const char *prgname)
" --"OPT_CREATE_UIO_DEV" Create /dev/uioX (usually done by hotplug)\n"
" --"OPT_VFIO_INTR" Interrupt mode for VFIO (legacy|msi|msix)\n"
" --"OPT_LEGACY_MEM" Legacy memory mode (no dynamic allocation, contiguous segments)\n"
+ " --"OPT_SINGLE_FILE_SEGMENTS" Put all hugepage memory in single files\n"
"\n");
/* Allow the application to print its usage message too if hook is set */
if ( rte_application_usage_hook ) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index 118b12d..545ac49 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -39,6 +39,31 @@
#include "eal_internal_cfg.h"
#include "eal_memalloc.h"

+/*
+ * not all kernel versions support fallocate on hugetlbfs, so fall back to
+ * ftruncate and disallow deallocation if fallocate is not supported.
+ */
+static int fallocate_supported = -1; /* unknown */
+
+/*
+ * If each page is in a separate file, we can close fd's since we need each fd
+ * only once. However, in single file segments mode, we can get away with using
+ * a single fd for entire segments, but we need to store them somewhere. Each
+ * fd is different within each process, so we'll store them in a local tailq.
+ */
+struct msl_entry {
+ TAILQ_ENTRY(msl_entry) next;
+ unsigned int msl_idx;
+ int fd;
+};
+
+/** Double linked list of memseg list fd's. */
+TAILQ_HEAD(msl_entry_list, msl_entry);
+
+static struct msl_entry_list msl_entry_list =
+ TAILQ_HEAD_INITIALIZER(msl_entry_list);
+static rte_spinlock_t tailq_lock = RTE_SPINLOCK_INITIALIZER;
+
static sigjmp_buf huge_jmpenv;

static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
@@ -129,18 +154,100 @@ resotre_numa(int *oldpolicy, struct bitmask *oldmask)
}
#endif

+static struct msl_entry *
+get_msl_entry_by_idx(unsigned int list_idx)
+{
+ struct msl_entry *te;
+
+ rte_spinlock_lock(&tailq_lock);
+
+ TAILQ_FOREACH(te, &msl_entry_list, next) {
+ if (te->msl_idx == list_idx)
+ break;
+ }
+ if (te == NULL) {
+ /* doesn't exist, so create it and set fd to -1 */
+
+ te = malloc(sizeof(*te));
+ if (te == NULL) {
+ RTE_LOG(ERR, EAL, "%s(): cannot allocate tailq entry for memseg list\n",
+ __func__);
+ goto unlock;
+ }
+ te->msl_idx = list_idx;
+ te->fd = -1;
+ TAILQ_INSERT_TAIL(&msl_entry_list, te, next);
+ }
+unlock:
+ rte_spinlock_unlock(&tailq_lock);
+ return te;
+}
+
+/*
+ * uses fstat to report the size of a file on disk
+ */
+static off_t
+get_file_size(int fd)
+{
+ struct stat st;
+ if (fstat(fd, &st) < 0)
+ return 0;
+ return st.st_size;
+}
+
+/*
+ * uses fstat to check if file size on disk is zero (regular fstat won't show
+ * true file size due to how fallocate works)
+ */
+static bool
+is_zero_length(int fd)
+{
+ struct stat st;
+ if (fstat(fd, &st) < 0)
+ return false;
+ return st.st_blocks == 0;
+}
+
static int
get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
unsigned int list_idx, unsigned int seg_idx)
{
int fd;
- eal_get_hugefile_path(path, buflen, hi->hugedir,
- list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
- fd = open(path, O_CREAT | O_RDWR, 0600);
- if (fd < 0) {
- RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
- strerror(errno));
- return -1;
+
+ if (internal_config.single_file_segments) {
+ /*
+ * try to find a tailq entry, for this memseg list, or create
+ * one if it doesn't exist.
+ */
+ struct msl_entry *te = get_msl_entry_by_idx(list_idx);
+ if (te == NULL) {
+ RTE_LOG(ERR, EAL, "%s(): cannot allocate tailq entry for memseg list\n",
+ __func__);
+ return -1;
+ } else if (te->fd < 0) {
+ /* create a hugepage file */
+ eal_get_hugefile_path(path, buflen, hi->hugedir,
+ list_idx);
+ fd = open(path, O_CREAT | O_RDWR, 0600);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ te->fd = fd;
+ } else {
+ fd = te->fd;
+ }
+ } else {
+ /* one file per page, just create it */
+ eal_get_hugefile_path(path, buflen, hi->hugedir,
+ list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
+ fd = open(path, O_CREAT | O_RDWR, 0600);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
+ strerror(errno));
+ return -1;
+ }
}
return fd;
}
@@ -173,6 +280,94 @@ static int lock(int fd, uint64_t offset, uint64_t len, int type)
}

static int
+resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz,
+ bool grow)
+{
+ bool again = false;
+ do {
+ if (fallocate_supported == 0) {
+ /* we cannot deallocate memory if fallocate() is not
+ * supported, but locks are still needed to prevent
+ * primary process' initialization from clearing out
+ * huge pages used by this process.
+ */
+
+ if (!grow) {
+ RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n",
+ __func__);
+ return -1;
+ }
+ uint64_t new_size = fa_offset + page_sz;
+ uint64_t cur_size = get_file_size(fd);
+
+ /* fallocate isn't supported, fall back to ftruncate */
+ if (new_size > cur_size &&
+ ftruncate(fd, new_size) < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ /* not being able to take out a read lock is an error */
+ if (lock(fd, fa_offset, page_sz, F_RDLCK) != 1)
+ return -1;
+ } else {
+ int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_KEEP_SIZE;
+ int ret;
+
+ /* if fallocate() is supported, we need to take out a
+ * read lock on allocate (to prevent other processes
+ * from deallocating this page), and take out a write
+ * lock on deallocate (to ensure nobody else is using
+ * this page).
+ *
+ * we can't use flock() for this, as we actually need to
+ * lock part of the file, not the entire file.
+ */
+
+ if (!grow) {
+ ret = lock(fd, fa_offset, page_sz, F_WRLCK);
+
+ if (ret < 0)
+ return -1;
+ else if (ret == 0)
+ /* failed to lock, not an error */
+ return 0;
+ }
+ if (fallocate(fd, flags, fa_offset, page_sz) < 0) {
+ if (fallocate_supported == -1 &&
+ errno == ENOTSUP) {
+ RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n",
+ __func__);
+ again = true;
+ fallocate_supported = 0;
+ } else {
+ RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
+ __func__,
+ strerror(errno));
+ return -1;
+ }
+ } else {
+ fallocate_supported = 1;
+
+ if (grow) {
+ /* if can't read lock, it's an error */
+ if (lock(fd, fa_offset, page_sz,
+ F_RDLCK) != 1)
+ return -1;
+ } else {
+ /* if can't unlock, it's an error */
+ if (lock(fd, fa_offset, page_sz,
+ F_UNLCK) != 1)
+ return -1;
+ }
+ }
+ }
+ } while (again);
+ return 0;
+}
+
+static int
alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
struct hugepage_info *hi, unsigned int list_idx,
unsigned int seg_idx)
@@ -191,34 +386,40 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
return -1;

alloc_sz = hi->hugepage_sz;
-
- map_offset = 0;
- if (ftruncate(fd, alloc_sz) < 0) {
- RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
- __func__, strerror(errno));
- goto resized;
- }
- /* we've allocated a page - take out a read lock. we're using fcntl()
- * locks rather than flock() here because doing that gives us one huge
- * advantage - fcntl() locks are per-process, not per-file descriptor,
- * which means that we don't have to keep the original fd's around to
- * keep a lock on the file.
- *
- * this is useful, because when it comes to unmapping pages, we will
- * have to take out a write lock (to figure out if another process still
- * has this page mapped), and to do itwith flock() we'll have to use
- * original fd, as lock is associated with that particular fd. with
- * fcntl(), this is not necessary - we can open a new fd and use fcntl()
- * on that.
- */
- ret = lock(fd, map_offset, alloc_sz, F_RDLCK);
-
- /* this should not fail */
- if (ret != 1) {
- RTE_LOG(ERR, EAL, "%s(): error locking file: %s\n",
- __func__,
- strerror(errno));
- goto resized;
+ if (internal_config.single_file_segments) {
+ map_offset = seg_idx * alloc_sz;
+ ret = resize_hugefile(fd, map_offset, alloc_sz, true);
+ if (ret < 1)
+ goto resized;
+ } else {
+ map_offset = 0;
+ if (ftruncate(fd, alloc_sz) < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+ __func__, strerror(errno));
+ goto resized;
+ }
+ /* we've allocated a page - take out a read lock. we're using
+ * fcntl() locks rather than flock() here because doing that
+ * gives us one huge advantage - fcntl() locks are per-process,
+ * not per-file descriptor, which means that we don't have to
+ * keep the original fd's around to keep a lock on the file.
+ *
+ * this is useful, because when it comes to unmapping pages, we
+ * will have to take out a write lock (to figure out if another
+ * process still has this page mapped), and to do it with flock()
+ * we'll have to use original fd, as lock is associated with
+ * that particular fd. with fcntl(), this is not necessary - we
+ * can open a new fd and use fcntl() on that.
+ */
+ ret = lock(fd, map_offset, alloc_sz, F_RDLCK);
+
+ /* this should not fail */
+ if (ret != 1) {
+ RTE_LOG(ERR, EAL, "%s(): error locking file: %s\n",
+ __func__,
+ strerror(errno));
+ goto resized;
+ }
}

/*
@@ -227,7 +428,9 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
*/
void *va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, map_offset);
- close(fd);
+ /* for non-single file segments, we can close fd here */
+ if (!internal_config.single_file_segments)
+ close(fd);

if (va == MAP_FAILED) {
RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
@@ -284,8 +487,21 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
mapped:
munmap(addr, alloc_sz);
resized:
- close(fd);
- unlink(path);
+ if (internal_config.single_file_segments) {
+ resize_hugefile(fd, map_offset, alloc_sz, false);
+ if (is_zero_length(fd)) {
+ struct msl_entry *te = get_msl_entry_by_idx(list_idx);
+ if (te != NULL && te->fd >= 0) {
+ close(te->fd);
+ te->fd = -1;
+ }
+ /* ignore errors, can't make it any worse */
+ unlink(path);
+ }
+ } else {
+ close(fd);
+ unlink(path);
+ }
return -1;
}

@@ -293,6 +509,7 @@ static int
free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
unsigned int list_idx, unsigned int seg_idx)
{
+ uint64_t map_offset;
char path[PATH_MAX];
int fd, ret;

@@ -310,21 +527,39 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
if (fd < 0)
return -1;

- /* if we're able to take out a write lock, we're the last one
- * holding onto this page.
- */
-
- ret = lock(fd, 0, ms->len, F_WRLCK);
- if (ret >= 0) {
- /* no one else is using this page */
- if (ret == 1)
+ if (internal_config.single_file_segments) {
+ map_offset = seg_idx * ms->len;
+ if (resize_hugefile(fd, map_offset, ms->len, false))
+ return -1;
+ /* if file is zero-length, we've already shrunk it, so it's
+ * safe to remove.
+ */
+ if (is_zero_length(fd)) {
+ struct msl_entry *te = get_msl_entry_by_idx(list_idx);
+ if (te != NULL && te->fd >= 0) {
+ close(te->fd);
+ te->fd = -1;
+ }
unlink(path);
- ret = lock(fd, 0, ms->len, F_UNLCK);
- if (ret != 1)
- RTE_LOG(ERR, EAL, "%s(): unable to unlock file %s\n",
- __func__, path);
+ }
+ ret = 0;
+ } else {
+ /* if we're able to take out a write lock, we're the last one
+ * holding onto this page.
+ */
+
+ ret = lock(fd, 0, ms->len, F_WRLCK);
+ if (ret >= 0) {
+ /* no one else is using this page */
+ if (ret == 1)
+ unlink(path);
+ ret = lock(fd, 0, ms->len, F_UNLCK);
+ if (ret != 1)
+ RTE_LOG(ERR, EAL, "%s(): unable to unlock file %s\n",
+ __func__, path);
+ }
+ close(fd);
}
- close(fd);

memset(ms, 0, sizeof(*ms));
--
2.7.4
Anatoly Burakov
2018-04-03 23:22:18 UTC
Permalink
This API will enable an application to register for notifications
about page allocations that are about to happen, giving the
application a chance to allow or deny the allocation when the
resulting total memory utilization would be above a specified limit
on a specified socket.
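
As a usage example, an application could cap allocations on socket 0 as
shown below. The callback name and the 256MB limit are made up for the
example, but the signatures match the rte_mem_alloc_validator_register()
API added by this patch (note the API is experimental):

#include <rte_log.h>
#include <rte_memory.h>

/* deny any allocation that would push socket 0 above the limit */
static int
socket0_mem_limit(int socket_id, size_t cur_limit, size_t new_len)
{
        if (new_len > cur_limit) {
                RTE_LOG(WARNING, USER1,
                        "denying allocation: socket %d would use %zu bytes (limit %zu)\n",
                        socket_id, new_len, cur_limit);
                return -1; /* deny */
        }
        return 0; /* allow */
}

/* call once after rte_eal_init() */
static void
register_mem_limit(void)
{
        if (rte_mem_alloc_validator_register("socket0-limit",
                        socket0_mem_limit, 0, (size_t)256 << 20) < 0)
                RTE_LOG(ERR, USER1, "cannot register memory validator\n");
}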

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/eal_common_memalloc.c | 138 +++++++++++++++++++++++++++-
lib/librte_eal/common/eal_common_memory.c | 26 ++++++
lib/librte_eal/common/eal_memalloc.h | 10 ++
lib/librte_eal/common/include/rte_memory.h | 59 ++++++++++++
lib/librte_eal/rte_eal_version.map | 2 +
5 files changed, 234 insertions(+), 1 deletion(-)

diff --git a/lib/librte_eal/common/eal_common_memalloc.c b/lib/librte_eal/common/eal_common_memalloc.c
index 2d2d46f..49fd53c 100644
--- a/lib/librte_eal/common/eal_common_memalloc.c
+++ b/lib/librte_eal/common/eal_common_memalloc.c
@@ -22,14 +22,26 @@ struct mem_event_callback_entry {
rte_mem_event_callback_t clb;
};

+struct mem_alloc_validator_entry {
+ TAILQ_ENTRY(mem_alloc_validator_entry) next;
+ char name[RTE_MEM_ALLOC_VALIDATOR_NAME_LEN];
+ rte_mem_alloc_validator_t clb;
+ int socket_id;
+ size_t limit;
+};
+
/** Double linked list of actions. */
TAILQ_HEAD(mem_event_callback_entry_list, mem_event_callback_entry);
+TAILQ_HEAD(mem_alloc_validator_entry_list, mem_alloc_validator_entry);

static struct mem_event_callback_entry_list mem_event_callback_list =
TAILQ_HEAD_INITIALIZER(mem_event_callback_list);
-
static rte_rwlock_t mem_event_rwlock = RTE_RWLOCK_INITIALIZER;

+static struct mem_alloc_validator_entry_list mem_alloc_validator_list =
+ TAILQ_HEAD_INITIALIZER(mem_alloc_validator_list);
+static rte_rwlock_t mem_alloc_validator_rwlock = RTE_RWLOCK_INITIALIZER;
+
static struct mem_event_callback_entry *
find_mem_event_callback(const char *name)
{
@@ -42,6 +54,18 @@ find_mem_event_callback(const char *name)
return r;
}

+static struct mem_alloc_validator_entry *
+find_mem_alloc_validator(const char *name, int socket_id)
+{
+ struct mem_alloc_validator_entry *r;
+
+ TAILQ_FOREACH(r, &mem_alloc_validator_list, next) {
+ if (!strcmp(r->name, name) && r->socket_id == socket_id)
+ break;
+ }
+ return r;
+}
+
bool
eal_memalloc_is_contig(const struct rte_memseg_list *msl, void *start,
size_t len)
@@ -221,3 +245,115 @@ eal_memalloc_mem_event_notify(enum rte_mem_event event, const void *start,

rte_rwlock_read_unlock(&mem_event_rwlock);
}
+
+int
+eal_memalloc_mem_alloc_validator_register(const char *name,
+ rte_mem_alloc_validator_t clb, int socket_id, size_t limit)
+{
+ struct mem_alloc_validator_entry *entry;
+ int ret, len;
+ if (name == NULL || clb == NULL || socket_id < 0) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ len = strnlen(name, RTE_MEM_ALLOC_VALIDATOR_NAME_LEN);
+ if (len == 0) {
+ rte_errno = EINVAL;
+ return -1;
+ } else if (len == RTE_MEM_ALLOC_VALIDATOR_NAME_LEN) {
+ rte_errno = ENAMETOOLONG;
+ return -1;
+ }
+ rte_rwlock_write_lock(&mem_alloc_validator_rwlock);
+
+ entry = find_mem_alloc_validator(name, socket_id);
+ if (entry != NULL) {
+ rte_errno = EEXIST;
+ ret = -1;
+ goto unlock;
+ }
+
+ entry = malloc(sizeof(*entry));
+ if (entry == NULL) {
+ rte_errno = ENOMEM;
+ ret = -1;
+ goto unlock;
+ }
+
+ /* callback successfully created and is valid, add it to the list */
+ entry->clb = clb;
+ entry->socket_id = socket_id;
+ entry->limit = limit;
+ snprintf(entry->name, RTE_MEM_ALLOC_VALIDATOR_NAME_LEN, "%s", name);
+ TAILQ_INSERT_TAIL(&mem_alloc_validator_list, entry, next);
+
+ ret = 0;
+
+ RTE_LOG(DEBUG, EAL, "Mem alloc validator '%s' on socket %i with limit %zu registered\n",
+ name, socket_id, limit);
+
+unlock:
+ rte_rwlock_write_unlock(&mem_alloc_validator_rwlock);
+ return ret;
+}
+
+int
+eal_memalloc_mem_alloc_validator_unregister(const char *name, int socket_id)
+{
+ struct mem_alloc_validator_entry *entry;
+ int ret, len;
+
+ if (name == NULL || socket_id < 0) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ len = strnlen(name, RTE_MEM_ALLOC_VALIDATOR_NAME_LEN);
+ if (len == 0) {
+ rte_errno = EINVAL;
+ return -1;
+ } else if (len == RTE_MEM_ALLOC_VALIDATOR_NAME_LEN) {
+ rte_errno = ENAMETOOLONG;
+ return -1;
+ }
+ rte_rwlock_write_lock(&mem_alloc_validator_rwlock);
+
+ entry = find_mem_alloc_validator(name, socket_id);
+ if (entry == NULL) {
+ rte_errno = ENOENT;
+ ret = -1;
+ goto unlock;
+ }
+ TAILQ_REMOVE(&mem_alloc_validator_list, entry, next);
+ free(entry);
+
+ ret = 0;
+
+ RTE_LOG(DEBUG, EAL, "Mem alloc validator '%s' on socket %i unregistered\n",
+ name, socket_id);
+
+unlock:
+ rte_rwlock_write_unlock(&mem_alloc_validator_rwlock);
+ return ret;
+}
+
+int
+eal_memalloc_mem_alloc_validate(int socket_id, size_t new_len)
+{
+ struct mem_alloc_validator_entry *entry;
+ int ret = 0;
+
+ rte_rwlock_read_lock(&mem_alloc_validator_rwlock);
+
+ TAILQ_FOREACH(entry, &mem_alloc_validator_list, next) {
+ if (entry->socket_id != socket_id || entry->limit > new_len)
+ continue;
+ RTE_LOG(DEBUG, EAL, "Calling mem alloc validator '%s' on socket %i\n",
+ entry->name, entry->socket_id);
+ if (entry->clb(socket_id, entry->limit, new_len) < 0)
+ ret = -1;
+ }
+
+ rte_rwlock_read_unlock(&mem_alloc_validator_rwlock);
+
+ return ret;
+}
diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index e3ce69c..d221240 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -632,6 +632,32 @@ rte_mem_event_callback_unregister(const char *name)
return eal_memalloc_mem_event_callback_unregister(name);
}

+int __rte_experimental
+rte_mem_alloc_validator_register(const char *name,
+ rte_mem_alloc_validator_t clb, int socket_id, size_t limit)
+{
+ /* FreeBSD boots with legacy mem enabled by default */
+ if (internal_config.legacy_mem) {
+ RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
+ return eal_memalloc_mem_alloc_validator_register(name, clb, socket_id,
+ limit);
+}
+
+int __rte_experimental
+rte_mem_alloc_validator_unregister(const char *name, int socket_id)
+{
+ /* FreeBSD boots with legacy mem enabled by default */
+ if (internal_config.legacy_mem) {
+ RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
+ return eal_memalloc_mem_alloc_validator_unregister(name, socket_id);
+}
+
/* Dump the physical memory layout on console */
void
rte_dump_physmem_layout(FILE *f)
diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
index 4d27403..6bec52c 100644
--- a/lib/librte_eal/common/eal_memalloc.h
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -67,4 +67,14 @@ void
eal_memalloc_mem_event_notify(enum rte_mem_event event, const void *start,
size_t len);

+int
+eal_memalloc_mem_alloc_validator_register(const char *name,
+ rte_mem_alloc_validator_t clb, int socket_id, size_t limit);
+
+int
+eal_memalloc_mem_alloc_validator_unregister(const char *name, int socket_id);
+
+int
+eal_memalloc_mem_alloc_validate(int socket_id, size_t new_len);
+
#endif // EAL_MEMALLOC_H
diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index 0de1198..0318bfe 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -339,6 +339,65 @@ rte_mem_event_callback_register(const char *name, rte_mem_event_callback_t clb);
int __rte_experimental
rte_mem_event_callback_unregister(const char *name);

+
+#define RTE_MEM_ALLOC_VALIDATOR_NAME_LEN 64
+/**< maximum length of alloc validator name */
+/**
+ * Function typedef used to register memory allocation validation callbacks.
+ *
+ * Returning 0 will allow allocation attempt to continue. Returning -1 will
+ * prevent allocation from succeeding.
+ */
+typedef int (*rte_mem_alloc_validator_t)(int socket_id,
+ size_t cur_limit, size_t new_len);
+
+/**
+ * @brief Register validator callback for memory allocations.
+ *
+ * Callbacks registered by this function will be called right before memory
+ * allocator is about to trigger allocation of more pages from the system if
+ * said allocation will bring total memory usage above specified limit on
+ * specified socket. User will be able to cancel pending allocation if callback
+ * returns -1.
+ *
+ * @param name
+ * Name associated with specified callback to be added to the list.
+ *
+ * @param clb
+ * Callback function pointer.
+ *
+ * @param socket_id
+ * Socket ID on which to watch for allocations.
+ *
+ * @param limit
+ * Limit above which to trigger callbacks.
+ *
+ * @return
+ * 0 on successful callback register
+ * -1 on unsuccessful callback register, with rte_errno value indicating
+ * reason for failure.
+ */
+int __rte_experimental
+rte_mem_alloc_validator_register(const char *name,
+ rte_mem_alloc_validator_t clb, int socket_id, size_t limit);
+
+/**
+ * @brief Unregister validator callback for memory allocations.
+ *
+ * @param name
+ * Name associated with specified callback to be removed from the list.
+ *
+ * @param socket_id
+ * Socket ID on which to watch for allocations.
+ *
+ * @return
+ * 0 on successful callback unregister
+ * -1 on unsuccessful callback unregister, with rte_errno value indicating
+ * reason for failure.
+ */
+int __rte_experimental
+rte_mem_alloc_validator_unregister(const char *name, int socket_id);
+
#ifdef __cplusplus
}
#endif
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 0460edb..b94c48c 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -238,6 +238,8 @@ EXPERIMENTAL {
rte_fbarray_set_used;
rte_log_register_type_and_pick_level;
rte_malloc_dump_heaps;
+ rte_mem_alloc_validator_register;
+ rte_mem_alloc_validator_unregister;
rte_mem_event_callback_register;
rte_mem_event_callback_unregister;
rte_mem_iova2virt;
--
2.7.4
Anatoly Burakov
2018-04-03 23:22:13 UTC
Permalink
This enables multiprocess synchronization for memory hotplug
requests at runtime (as opposed to initialization).

Basic workflow is the following. Primary process always does initial
mapping and unmapping, and secondary processes always follow primary
page map. Only one allocation request can be active at any one time.

When the primary allocates memory, it ensures that all other
processes have allocated the same set of hugepages successfully;
otherwise any allocations made are rolled back, and the affected
heap space is freed back.
Heap is locked throughout the process, and there is also a global
memory hotplug lock, so no race conditions can happen.

When the primary frees memory, it releases the space from the heap,
deallocates the affected pages, and notifies other processes of the
deallocations. Since the heap no longer covers that memory chunk,
the area basically becomes invisible to other processes even if they
happen to fail to unmap that specific set of pages, so it is
completely safe to ignore the results of sync requests.

When a secondary allocates memory, it does not do so by itself.
Instead, it sends a request to the primary process to try to
allocate pages of the specified size and on the specified socket,
such that a specified heap allocation request could complete. The
primary process then sends all secondaries (including the requestor)
a separate notification of the allocated pages, and expects all
secondary processes to report success before considering the pages
as "allocated".

Only after the primary process ensures that all memory has been
successfully allocated in all secondary processes will it respond
positively to the initial request and let the secondary proceed
with the allocation. Since the heap now has memory that can satisfy
the allocation request, and it was locked all this time (so no other
allocations could take place), the secondary process will be able to
allocate memory from the heap.

When a secondary frees memory, it hides the pages to be deallocated
from the heap. Then, it sends a deallocation request to the primary
process, which deallocates the pages itself and then sends a
separate sync request to all other processes (including the
requestor) to unmap the same pages. This way, even if the secondary
fails to notify other processes of this deallocation, that memory
will become invisible to other processes, and will not be allocated
from again.

So, to summarize: address space will only become part of the heap
if primary process can ensure that all other processes have
allocated this memory successfully. If anything goes wrong, the
worst thing that could happen is that a page will "leak" and will
not be available to either DPDK or the system, as some process
will still hold onto it. It's not an actual leak, as we can account
for the page - it's just that none of the processes will be able
to use this page for anything useful, until it gets allocated from
by the primary.

Because the underlying DPDK IPC implementation is single-threaded,
some asynchronous handling had to be done, as we need to complete
several requests before we can definitively allow the secondary
process to use allocated memory (namely, it has to be present in
all other secondary processes before it can be used). Additionally,
only one allocation request may be submitted at a time.

Memory allocation requests are only allowed when there are no
secondary processes currently initializing. To enforce that,
a shared rwlock is used: it is taken as a read lock on init (so
that several secondaries can initialize concurrently), and as a
write lock when making allocation requests (so that either secondary
init has to wait, or the allocation request has to wait until all
processes have initialized).

Any other function that wishes to iterate over memory or prevent
allocations should be using memory hotplug lock.

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/bsdapp/eal/Makefile | 1 +
lib/librte_eal/common/eal_common_memory.c | 67 +-
lib/librte_eal/common/include/rte_eal_memconfig.h | 3 +
lib/librte_eal/common/malloc_elem.c | 124 ++--
lib/librte_eal/common/malloc_heap.c | 255 ++++++--
lib/librte_eal/common/malloc_mp.c | 744 ++++++++++++++++++++++
lib/librte_eal/common/malloc_mp.h | 86 +++
lib/librte_eal/common/meson.build | 1 +
lib/librte_eal/linuxapp/eal/Makefile | 1 +
lib/librte_eal/linuxapp/eal/eal_memalloc.c | 32 +-
lib/librte_eal/linuxapp/eal/eal_vfio.c | 40 +-
11 files changed, 1229 insertions(+), 125 deletions(-)
create mode 100644 lib/librte_eal/common/malloc_mp.c
create mode 100644 lib/librte_eal/common/malloc_mp.h

diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile
index 907e30d..250d5c1 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -59,6 +59,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_fbarray.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_malloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_elem.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_heap.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_mp.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_keepalive.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_service.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_reciprocal.c
diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index 8db085e..0c4c1f5 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -654,6 +654,9 @@ rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
int i, ms_idx, ret = 0;

+ /* do not allow allocations/frees/init while we iterate */
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+
for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
struct rte_memseg_list *msl = &mcfg->memsegs[i];
const struct rte_memseg *ms;
@@ -678,15 +681,20 @@ rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
len = n_segs * msl->page_sz;

ret = func(msl, ms, len, arg);
- if (ret < 0)
- return -1;
- else if (ret > 0)
- return 1;
+ if (ret < 0) {
+ ret = -1;
+ goto out;
+ } else if (ret > 0) {
+ ret = 1;
+ goto out;
+ }
ms_idx = rte_fbarray_find_next_used(arr,
ms_idx + n_segs);
}
}
- return 0;
+out:
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+ return ret;
}

int __rte_experimental
@@ -695,6 +703,9 @@ rte_memseg_walk(rte_memseg_walk_t func, void *arg)
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
int i, ms_idx, ret = 0;

+ /* do not allow allocations/frees/init while we iterate */
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+
for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
struct rte_memseg_list *msl = &mcfg->memsegs[i];
const struct rte_memseg *ms;
@@ -709,14 +720,19 @@ rte_memseg_walk(rte_memseg_walk_t func, void *arg)
while (ms_idx >= 0) {
ms = rte_fbarray_get(arr, ms_idx);
ret = func(msl, ms, arg);
- if (ret < 0)
- return -1;
- else if (ret > 0)
- return 1;
+ if (ret < 0) {
+ ret = -1;
+ goto out;
+ } else if (ret > 0) {
+ ret = 1;
+ goto out;
+ }
ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
}
}
- return 0;
+out:
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+ return ret;
}

int __rte_experimental
@@ -725,6 +741,9 @@ rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg)
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
int i, ret = 0;

+ /* do not allow allocations/frees/init while we iterate */
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+
for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
struct rte_memseg_list *msl = &mcfg->memsegs[i];

@@ -732,12 +751,18 @@ rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg)
continue;

ret = func(msl, arg);
- if (ret < 0)
- return -1;
- if (ret > 0)
- return 1;
+ if (ret < 0) {
+ ret = -1;
+ goto out;
+ }
+ if (ret > 0) {
+ ret = 1;
+ goto out;
+ }
}
- return 0;
+out:
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+ return ret;
}

/* init memory subsystem */
@@ -751,6 +776,9 @@ rte_eal_memory_init(void)
if (!mcfg)
return -1;

+ /* lock mem hotplug here, to prevent races while we init */
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+
retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
#ifndef RTE_ARCH_64
memseg_primary_init_32() :
@@ -760,16 +788,19 @@ rte_eal_memory_init(void)
memseg_secondary_init();

if (retval < 0)
- return -1;
+ goto fail;

retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
rte_eal_hugepage_init() :
rte_eal_hugepage_attach();
if (retval < 0)
- return -1;
+ goto fail;

if (internal_config.no_shconf == 0 && rte_eal_memdevice_init() < 0)
- return -1;
+ goto fail;

return 0;
+fail:
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+ return -1;
}
diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
index a781793..aff0688 100644
--- a/lib/librte_eal/common/include/rte_eal_memconfig.h
+++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
@@ -59,6 +59,9 @@ struct rte_mem_config {
rte_rwlock_t qlock; /**< used for tailq operation for thread safe. */
rte_rwlock_t mplock; /**< only used by mempool LIB for thread-safe. */

+ rte_rwlock_t memory_hotplug_lock;
+ /**< indicates whether memory hotplug request is in progress. */
+
/* memory segments and zones */
struct rte_fbarray memzones; /**< Memzone descriptors. */

diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 4346532..ee79dcd 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -460,74 +460,80 @@ malloc_elem_hide_region(struct malloc_elem *elem, void *start, size_t len)
prev = elem->prev;
next = elem->next;

- len_before = RTE_PTR_DIFF(hide_start, elem);
- len_after = RTE_PTR_DIFF(next, hide_end);
-
- if (len_after >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
- /* split after */
- split_elem(elem, hide_end);
-
- malloc_elem_free_list_insert(hide_end);
- } else if (len_after >= MALLOC_ELEM_HEADER_LEN) {
- /* shrink current element */
- elem->size -= len_after;
- memset(hide_end, 0, sizeof(*hide_end));
-
- /* copy next element's data to our pad */
- memcpy(hide_end, next, sizeof(*hide_end));
-
- /* pad next element */
- next->state = ELEM_PAD;
- next->pad = len_after;
-
- /* next element is busy, would've been merged otherwise */
- hide_end->pad = len_after;
- hide_end->size += len_after;
-
- /* adjust pointers to point to our new pad */
- if (next->next)
- next->next->prev = hide_end;
- elem->next = hide_end;
- } else if (len_after > 0) {
- RTE_LOG(ERR, EAL, "Unaligned element, heap is probably corrupt\n");
- rte_panic("blow up\n");
- return;
+ /* we cannot do anything with non-adjacent elements */
+ if (next && next_elem_is_adjacent(elem)) {
+ len_after = RTE_PTR_DIFF(next, hide_end);
+ if (len_after >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
+ /* split after */
+ split_elem(elem, hide_end);
+
+ malloc_elem_free_list_insert(hide_end);
+ } else if (len_after >= MALLOC_ELEM_HEADER_LEN) {
+ /* shrink current element */
+ elem->size -= len_after;
+ memset(hide_end, 0, sizeof(*hide_end));
+
+ /* copy next element's data to our pad */
+ memcpy(hide_end, next, sizeof(*hide_end));
+
+ /* pad next element */
+ next->state = ELEM_PAD;
+ next->pad = len_after;
+ next->size -= len_after;
+
+ /* next element busy, would've been merged otherwise */
+ hide_end->pad = len_after;
+ hide_end->size += len_after;
+
+ /* adjust pointers to point to our new pad */
+ if (next->next)
+ next->next->prev = hide_end;
+ elem->next = hide_end;
+ } else if (len_after > 0) {
+ RTE_LOG(ERR, EAL, "Unaligned element, heap is probably corrupt\n");
+ return;
+ }
}

- if (len_before >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
- /* split before */
- split_elem(elem, hide_start);
+ /* we cannot do anything with non-adjacent elements */
+ if (prev && prev_elem_is_adjacent(elem)) {
+ len_before = RTE_PTR_DIFF(hide_start, elem);
+ if (len_before >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
+ /* split before */
+ split_elem(elem, hide_start);

- prev = elem;
- elem = hide_start;
+ prev = elem;
+ elem = hide_start;

- malloc_elem_free_list_insert(prev);
- } else if (len_before > 0) {
- /*
- * unlike with elements after current, here we don't need to
- * pad elements, but rather just increase the size of previous
- * element, copy the old header and and set up trailer.
- */
- void *trailer = RTE_PTR_ADD(prev,
- prev->size - MALLOC_ELEM_TRAILER_LEN);
+ malloc_elem_free_list_insert(prev);
+ } else if (len_before > 0) {
+ /*
+ * unlike with elements after current, here we don't
+ * need to pad elements, but rather just increase the
+ * size of previous element, copy the old header and set
+ * up trailer.
+ */
+ void *trailer = RTE_PTR_ADD(prev,
+ prev->size - MALLOC_ELEM_TRAILER_LEN);

- memcpy(hide_start, elem, sizeof(*elem));
- hide_start->size = len;
+ memcpy(hide_start, elem, sizeof(*elem));
+ hide_start->size = len;

- prev->size += len_before;
- set_trailer(prev);
+ prev->size += len_before;
+ set_trailer(prev);

- /* update pointers */
- prev->next = hide_start;
- if (next)
- next->prev = hide_start;
+ /* update pointers */
+ prev->next = hide_start;
+ if (next)
+ next->prev = hide_start;

- elem = hide_start;
+ /* erase old trailer */
+ memset(trailer, 0, MALLOC_ELEM_TRAILER_LEN);
+ /* erase old header */
+ memset(elem, 0, sizeof(*elem));

- /* erase old trailer */
- memset(trailer, 0, MALLOC_ELEM_TRAILER_LEN);
- /* erase old header */
- memset(elem, 0, sizeof(*elem));
+ elem = hide_start;
+ }
}

remove_elem(elem);
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 5f8c643..be39250 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -10,6 +10,7 @@
#include <sys/queue.h>

#include <rte_memory.h>
+#include <rte_errno.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_launch.h>
@@ -26,6 +27,7 @@
#include "eal_memalloc.h"
#include "malloc_elem.h"
#include "malloc_heap.h"
+#include "malloc_mp.h"

static unsigned
check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
@@ -81,8 +83,6 @@ malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl,

malloc_elem_free_list_insert(elem);

- heap->total_size += len;
-
return elem;
}

@@ -171,68 +171,118 @@ heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,
return elem == NULL ? NULL : (void *)(&elem[1]);
}

-static int
-try_expand_heap(struct malloc_heap *heap, size_t pg_sz, size_t elt_size,
+/* this function is exposed in malloc_mp.h */
+void
+rollback_expand_heap(struct rte_memseg **ms, int n_segs,
+ struct malloc_elem *elem, void *map_addr, size_t map_len)
+{
+ if (elem != NULL) {
+ malloc_elem_free_list_remove(elem);
+ malloc_elem_hide_region(elem, map_addr, map_len);
+ }
+
+ eal_memalloc_free_seg_bulk(ms, n_segs);
+}
+
+/* this function is exposed in malloc_mp.h */
+struct malloc_elem *
+alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
int socket, unsigned int flags, size_t align, size_t bound,
- bool contig)
+ bool contig, struct rte_memseg **ms, int n_segs)
{
- size_t map_len;
struct rte_memseg_list *msl;
- struct rte_memseg **ms;
- struct malloc_elem *elem;
- int n_segs, allocd_pages;
+ struct malloc_elem *elem = NULL;
+ size_t alloc_sz;
+ int allocd_pages;
void *ret, *map_addr;

- align = RTE_MAX(align, MALLOC_ELEM_HEADER_LEN);
- map_len = RTE_ALIGN_CEIL(align + elt_size + MALLOC_ELEM_TRAILER_LEN,
- pg_sz);
-
- n_segs = map_len / pg_sz;
-
- /* we can't know in advance how many pages we'll need, so malloc */
- ms = malloc(sizeof(*ms) * n_segs);
-
allocd_pages = eal_memalloc_alloc_seg_bulk(ms, n_segs, pg_sz,
socket, true);

/* make sure we've allocated our pages... */
if (allocd_pages < 0)
- goto free_ms;
+ return NULL;

map_addr = ms[0]->addr;
msl = rte_mem_virt2memseg_list(map_addr);
+ alloc_sz = (size_t)msl->page_sz * allocd_pages;

/* check if we wanted contiguous memory but didn't get it */
- if (contig && !eal_memalloc_is_contig(msl, map_addr, map_len)) {
+ if (contig && !eal_memalloc_is_contig(msl, map_addr, alloc_sz)) {
RTE_LOG(DEBUG, EAL, "%s(): couldn't allocate physically contiguous space\n",
__func__);
- goto free_pages;
+ goto fail;
}

/* add newly minted memsegs to malloc heap */
- elem = malloc_heap_add_memory(heap, msl, map_addr, map_len);
+ elem = malloc_heap_add_memory(heap, msl, map_addr, alloc_sz);

/* try once more, as now we have allocated new memory */
ret = find_suitable_element(heap, elt_size, flags, align, bound,
contig);

if (ret == NULL)
+ goto fail;
+
+ return elem;
+
+fail:
+ rollback_expand_heap(ms, n_segs, elem, map_addr, alloc_sz);
+ return NULL;
+}
+
+static int
+try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz,
+ size_t elt_size, int socket, unsigned int flags, size_t align,
+ size_t bound, bool contig)
+{
+ struct malloc_elem *elem;
+ struct rte_memseg **ms;
+ void *map_addr;
+ size_t alloc_sz;
+ int n_segs;
+
+ alloc_sz = RTE_ALIGN_CEIL(align + elt_size +
+ MALLOC_ELEM_TRAILER_LEN, pg_sz);
+ n_segs = alloc_sz / pg_sz;
+
+ /* we can't know in advance how many pages we'll need, so we malloc */
+ ms = malloc(sizeof(*ms) * n_segs);
+
+ if (ms == NULL)
+ return -1;
+
+ memset(ms, 0, sizeof(*ms) * n_segs);
+
+ elem = alloc_pages_on_heap(heap, pg_sz, elt_size, socket, flags, align,
+ bound, contig, ms, n_segs);
+
+ if (elem == NULL)
+ goto free_ms;
+
+ map_addr = ms[0]->addr;
+
+ /* notify other processes that this has happened */
+ if (request_sync()) {
+ /* we couldn't ensure all processes have mapped memory,
+ * so free it back and notify everyone that it's been
+ * freed back.
+ */
goto free_elem;
+ }
+ heap->total_size += alloc_sz;

RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
- socket, map_len >> 20ULL);
+ socket, alloc_sz >> 20ULL);

free(ms);

return 0;

free_elem:
- malloc_elem_free_list_remove(elem);
- malloc_elem_hide_region(elem, map_addr, map_len);
- heap->total_size -= map_len;
+ rollback_expand_heap(ms, n_segs, elem, map_addr, alloc_sz);

-free_pages:
- eal_memalloc_free_seg_bulk(ms, n_segs);
+ request_sync();
free_ms:
free(ms);

@@ -240,6 +290,59 @@ try_expand_heap(struct malloc_heap *heap, size_t pg_sz, size_t elt_size,
}

static int
+try_expand_heap_secondary(struct malloc_heap *heap, uint64_t pg_sz,
+ size_t elt_size, int socket, unsigned int flags, size_t align,
+ size_t bound, bool contig)
+{
+ struct malloc_mp_req req;
+ int req_result;
+
+ memset(&req, 0, sizeof(req));
+
+ req.t = REQ_TYPE_ALLOC;
+ req.alloc_req.align = align;
+ req.alloc_req.bound = bound;
+ req.alloc_req.contig = contig;
+ req.alloc_req.flags = flags;
+ req.alloc_req.elt_size = elt_size;
+ req.alloc_req.page_sz = pg_sz;
+ req.alloc_req.socket = socket;
+ req.alloc_req.heap = heap; /* it's in shared memory */
+
+ req_result = request_to_primary(&req);
+
+ if (req_result != 0)
+ return -1;
+
+ if (req.result != REQ_RESULT_SUCCESS)
+ return -1;
+
+ return 0;
+}
+
+static int
+try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
+ int socket, unsigned int flags, size_t align, size_t bound,
+ bool contig)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int ret;
+
+ rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ ret = try_expand_heap_primary(heap, pg_sz, elt_size, socket,
+ flags, align, bound, contig);
+ } else {
+ ret = try_expand_heap_secondary(heap, pg_sz, elt_size, socket,
+ flags, align, bound, contig);
+ }
+
+ rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
+ return ret;
+}
+
+static int
compare_pagesz(const void *a, const void *b)
{
const struct rte_memseg_list * const*mpa = a;
@@ -257,11 +360,10 @@ compare_pagesz(const void *a, const void *b)
}

static int
-alloc_mem_on_socket(size_t size, int socket, unsigned int flags, size_t align,
- size_t bound, bool contig)
+alloc_more_mem_on_socket(struct malloc_heap *heap, size_t size, int socket,
+ unsigned int flags, size_t align, size_t bound, bool contig)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
struct rte_memseg_list *requested_msls[RTE_MAX_MEMSEG_LISTS];
struct rte_memseg_list *other_msls[RTE_MAX_MEMSEG_LISTS];
uint64_t requested_pg_sz[RTE_MAX_MEMSEG_LISTS];
@@ -393,7 +495,8 @@ heap_alloc_on_socket(const char *type, size_t size, int socket,
if (ret != NULL)
goto alloc_unlock;

- if (!alloc_mem_on_socket(size, socket, flags, align, bound, contig)) {
+ if (!alloc_more_mem_on_socket(heap, size, socket, flags, align, bound,
+ contig)) {
ret = heap_alloc(heap, type, size, flags, align, bound, contig);

/* this should have succeeded */
@@ -446,14 +549,41 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg,
return NULL;
}

+/* this function is exposed in malloc_mp.h */
+int
+malloc_heap_free_pages(void *aligned_start, size_t aligned_len)
+{
+ int n_segs, seg_idx, max_seg_idx;
+ struct rte_memseg_list *msl;
+ size_t page_sz;
+
+ msl = rte_mem_virt2memseg_list(aligned_start);
+ if (msl == NULL)
+ return -1;
+
+ page_sz = (size_t)msl->page_sz;
+ n_segs = aligned_len / page_sz;
+ seg_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) / page_sz;
+ max_seg_idx = seg_idx + n_segs;
+
+ for (; seg_idx < max_seg_idx; seg_idx++) {
+ struct rte_memseg *ms;
+
+ ms = rte_fbarray_get(&msl->memseg_arr, seg_idx);
+ eal_memalloc_free_seg(ms);
+ }
+ return 0;
+}
+
int
malloc_heap_free(struct malloc_elem *elem)
{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct malloc_heap *heap;
void *start, *aligned_start, *end, *aligned_end;
size_t len, aligned_len, page_sz;
struct rte_memseg_list *msl;
- int n_segs, seg_idx, max_seg_idx, ret;
+ int ret;

if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
return -1;
@@ -494,25 +624,56 @@ malloc_heap_free(struct malloc_elem *elem)
if (aligned_len < page_sz)
goto free_unlock;

+ rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);
+
+ /*
+ * we allow secondary processes to clear the heap of this allocated
+ * memory because it is safe to do so: even if notifications about
+ * unmapped pages don't make it to other processes, the heap is shared
+ * across all processes, so this memory will disappear from it anyway,
+ * and nothing can allocate it back unless the primary process manages
+ * to deliver an allocation message to every single running process.
+ */
+
malloc_elem_free_list_remove(elem);

malloc_elem_hide_region(elem, (void *) aligned_start, aligned_len);

- /* we don't really care if we fail to deallocate memory */
- n_segs = aligned_len / page_sz;
- seg_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) / page_sz;
- max_seg_idx = seg_idx + n_segs;
+ heap->total_size -= aligned_len;

- for (; seg_idx < max_seg_idx; seg_idx++) {
- struct rte_memseg *ms;
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ /* don't care if any of this fails */
+ malloc_heap_free_pages(aligned_start, aligned_len);

- ms = rte_fbarray_get(&msl->memseg_arr, seg_idx);
- eal_memalloc_free_seg(ms);
+ request_sync();
+ } else {
+ struct malloc_mp_req req;
+
+ memset(&req, 0, sizeof(req));
+
+ req.t = REQ_TYPE_FREE;
+ req.free_req.addr = aligned_start;
+ req.free_req.len = aligned_len;
+
+ /*
+ * we request primary to deallocate pages, but we don't do it
+ * in this thread. instead, we notify primary that we would like
+ * to deallocate pages, and this process will receive another
+ * request (in parallel) that will do it for us on another
+ * thread.
+ *
+ * we also don't really care if this succeeds - the data is
+ * already removed from the heap, so it is, for all intents and
+ * purposes, hidden from the rest of DPDK even if some other
+ * process (including this one) may have these pages mapped.
+ */
+ request_to_primary(&req);
}
- heap->total_size -= aligned_len;

RTE_LOG(DEBUG, EAL, "Heap on socket %d was shrunk by %zdMB\n",
msl->socket_id, aligned_len >> 20ULL);
+
+ rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
free_unlock:
rte_spinlock_unlock(&(heap->lock));
return ret;
@@ -600,8 +761,16 @@ rte_eal_malloc_heap_init(void)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;

- if (mcfg == NULL)
+ if (register_mp_requests()) {
+ RTE_LOG(ERR, EAL, "Couldn't register malloc multiprocess actions\n");
return -1;
+ }
+
+ /* unlock mem hotplug here. it's safe for primary as no requests can
+ * even come before primary itself is fully initialized, and secondaries
+ * do not need to initialize the heap.
+ */
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);

/* secondary process does not need to initialize anything */
if (rte_eal_process_type() != RTE_PROC_PRIMARY)
diff --git a/lib/librte_eal/common/malloc_mp.c b/lib/librte_eal/common/malloc_mp.c
new file mode 100644
index 0000000..72b1f4c
--- /dev/null
+++ b/lib/librte_eal/common/malloc_mp.c
@@ -0,0 +1,744 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <string.h>
+#include <sys/time.h>
+
+#include <rte_alarm.h>
+#include <rte_errno.h>
+
+#include "eal_memalloc.h"
+
+#include "malloc_elem.h"
+#include "malloc_mp.h"
+
+#define MP_ACTION_SYNC "mp_malloc_sync"
+/**< request sent by primary process to notify of changes in memory map */
+#define MP_ACTION_ROLLBACK "mp_malloc_rollback"
+/**< request sent by primary process to notify of changes in memory map. this is
+ * essentially a regular sync request, but we cannot send sync requests while
+ * another one is in progress, and we might have to - therefore, we do this as
+ * a separate callback.
+ */
+#define MP_ACTION_REQUEST "mp_malloc_request"
+/**< request sent by secondary process to ask for allocation/deallocation */
+#define MP_ACTION_RESPONSE "mp_malloc_response"
+/**< response sent to secondary process to indicate result of request */
+
+/* forward declarations */
+static int
+handle_sync_response(const struct rte_mp_msg *request,
+ const struct rte_mp_reply *reply);
+static int
+handle_rollback_response(const struct rte_mp_msg *request,
+ const struct rte_mp_reply *reply);
+
+#define MP_TIMEOUT_S 5 /**< 5 second timeout */
+
+/* when we're allocating, we need to store some state to ensure that we can
+ * roll back later
+ */
+struct primary_alloc_req_state {
+ struct malloc_heap *heap;
+ struct rte_memseg **ms;
+ int ms_len;
+ struct malloc_elem *elem;
+ void *map_addr;
+ size_t map_len;
+};
+
+enum req_state {
+ REQ_STATE_INACTIVE = 0,
+ REQ_STATE_ACTIVE,
+ REQ_STATE_COMPLETE
+};
+
+struct mp_request {
+ TAILQ_ENTRY(mp_request) next;
+ struct malloc_mp_req user_req; /**< contents of request */
+ pthread_cond_t cond; /**< variable we use to time out on this request */
+ enum req_state state; /**< indicate status of this request */
+ struct primary_alloc_req_state alloc_state;
+};
+
+/*
+ * We could've used just a single request, but it may be possible for
+ * secondaries to time out earlier than the primary, and send a new request
+ * while the primary is still expecting replies to the old one. Therefore, each new
+ * request will get assigned a new ID, which is how we will distinguish between
+ * expected and unexpected messages.
+ */
+TAILQ_HEAD(mp_request_list, mp_request);
+static struct {
+ struct mp_request_list list;
+ pthread_mutex_t lock;
+} mp_request_list = {
+ .list = TAILQ_HEAD_INITIALIZER(mp_request_list.list),
+ .lock = PTHREAD_MUTEX_INITIALIZER
+};
+
+/**
+ * General workflow is the following:
+ *
+ * Allocation:
+ * S: send request to primary
+ * P: attempt to allocate memory
+ * if failed, sendmsg failure
+ * if success, send sync request
+ * S: if received msg of failure, quit
+ * if received sync request, synchronize memory map and reply with result
+ * P: if received sync request result
+ * if success, sendmsg success
+ * if failure, roll back allocation and send a rollback request
+ * S: if received msg of success, quit
+ * if received rollback request, synchronize memory map and reply with result
+ * P: if received sync request result
+ * sendmsg sync request result
+ * S: if received msg, quit
+ *
+ * Aside from timeouts, there are three points where we can quit:
+ * - if allocation failed straight away
+ * - if allocation and sync request succeeded
+ * - if allocation succeeded, sync request failed, allocation rolled back and
+ * rollback request received (irrespective of whether it succeeded or failed)
+ *
+ * Deallocation:
+ * S: send request to primary
+ * P: attempt to deallocate memory
+ * if failed, sendmsg failure
+ * if success, send sync request
+ * S: if received msg of failure, quit
+ * if received sync request, synchronize memory map and reply with result
+ * P: if received sync request result
+ * sendmsg sync request result
+ * S: if received msg, quit
+ *
+ * There is no "rollback" from deallocation, as it's safe to have some memory
+ * mapped in some processes - it's absent from the heap, so it won't get used.
+ */
+
+static struct mp_request *
+find_request_by_id(uint64_t id)
+{
+ struct mp_request *req;
+ TAILQ_FOREACH(req, &mp_request_list.list, next) {
+ if (req->user_req.id == id)
+ break;
+ }
+ return req;
+}
+
+/* this ID is, like, totally guaranteed to be absolutely unique. pinky swear. */
+static uint64_t
+get_unique_id(void)
+{
+ uint64_t id;
+ do {
+ id = rte_rand();
+ } while (find_request_by_id(id) != NULL);
+ return id;
+}
+
+/* secondary will respond to sync requests thusly */
+static int
+handle_sync(const struct rte_mp_msg *msg, const void *peer)
+{
+ struct rte_mp_msg reply;
+ const struct malloc_mp_req *req =
+ (const struct malloc_mp_req *)msg->param;
+ struct malloc_mp_req *resp =
+ (struct malloc_mp_req *)reply.param;
+ int ret;
+
+ if (req->t != REQ_TYPE_SYNC) {
+ RTE_LOG(ERR, EAL, "Unexpected request from primary\n");
+ return -1;
+ }
+
+ memset(&reply, 0, sizeof(reply));
+
+ reply.num_fds = 0;
+ snprintf(reply.name, sizeof(reply.name), "%s", msg->name);
+ reply.len_param = sizeof(*resp);
+
+ ret = eal_memalloc_sync_with_primary();
+
+ resp->t = REQ_TYPE_SYNC;
+ resp->id = req->id;
+ resp->result = ret == 0 ? REQ_RESULT_SUCCESS : REQ_RESULT_FAIL;
+
+ rte_mp_reply(&reply, peer);
+
+ return 0;
+}
+
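+/* primary attempts to fulfil an allocation request from a secondary. on
+ * success, allocation state is stored in the request so it can be rolled back
+ * later if synchronization fails.
+ */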
+static int
+handle_alloc_request(const struct malloc_mp_req *m,
+ struct mp_request *req)
+{
+ const struct malloc_req_alloc *ar = &m->alloc_req;
+ struct malloc_heap *heap;
+ struct malloc_elem *elem;
+ struct rte_memseg **ms;
+ size_t alloc_sz;
+ int n_segs;
+ void *map_addr;
+
+ alloc_sz = RTE_ALIGN_CEIL(ar->align + ar->elt_size +
+ MALLOC_ELEM_TRAILER_LEN, ar->page_sz);
+ n_segs = alloc_sz / ar->page_sz;
+
+ heap = ar->heap;
+
+ /* we can't know in advance how many pages we'll need, so we malloc */
+ ms = malloc(sizeof(*ms) * n_segs);
+
+ if (ms == NULL) {
+ RTE_LOG(ERR, EAL, "Couldn't allocate memory for request state\n");
+ goto fail;
+ }
+
+ memset(ms, 0, sizeof(*ms) * n_segs);
+
+ elem = alloc_pages_on_heap(heap, ar->page_sz, ar->elt_size, ar->socket,
+ ar->flags, ar->align, ar->bound, ar->contig, ms,
+ n_segs);
+
+ if (elem == NULL)
+ goto fail;
+
+ map_addr = ms[0]->addr;
+
+ /* we have succeeded in allocating memory, but we still need to sync
+ * with other processes. however, since DPDK IPC is single-threaded, we
+ * send an asynchronous request and exit this callback.
+ */
+
+ req->alloc_state.ms = ms;
+ req->alloc_state.ms_len = n_segs;
+ req->alloc_state.map_addr = map_addr;
+ req->alloc_state.map_len = alloc_sz;
+ req->alloc_state.elem = elem;
+ req->alloc_state.heap = heap;
+
+ return 0;
+fail:
+ free(ms);
+ return -1;
+}
+
+/* first stage of primary handling requests from secondary */
+static int
+handle_request(const struct rte_mp_msg *msg, const void *peer __rte_unused)
+{
+ const struct malloc_mp_req *m =
+ (const struct malloc_mp_req *)msg->param;
+ struct mp_request *entry;
+ int ret;
+
+ /* lock access to request */
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ /* make sure it's not a dupe */
+ entry = find_request_by_id(m->id);
+ if (entry != NULL) {
+ RTE_LOG(ERR, EAL, "Duplicate request id\n");
+ goto fail;
+ }
+
+ entry = malloc(sizeof(*entry));
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "Unable to allocate memory for request\n");
+ goto fail;
+ }
+
+ /* erase all data */
+ memset(entry, 0, sizeof(*entry));
+
+ if (m->t == REQ_TYPE_ALLOC) {
+ ret = handle_alloc_request(m, entry);
+ } else if (m->t == REQ_TYPE_FREE) {
+ ret = malloc_heap_free_pages(m->free_req.addr,
+ m->free_req.len);
+ } else {
+ RTE_LOG(ERR, EAL, "Unexpected request from secondary\n");
+ goto fail;
+ }
+
+ if (ret != 0) {
+ struct rte_mp_msg resp_msg;
+ struct malloc_mp_req *resp =
+ (struct malloc_mp_req *)resp_msg.param;
+
+ /* send failure message straight away */
+ resp_msg.num_fds = 0;
+ resp_msg.len_param = sizeof(*resp);
+ snprintf(resp_msg.name, sizeof(resp_msg.name), "%s",
+ MP_ACTION_RESPONSE);
+
+ resp->t = m->t;
+ resp->result = REQ_RESULT_FAIL;
+ resp->id = m->id;
+
+ if (rte_mp_sendmsg(&resp_msg)) {
+ RTE_LOG(ERR, EAL, "Couldn't send response\n");
+ goto fail;
+ }
+ /* we did not modify the request */
+ free(entry);
+ } else {
+ struct rte_mp_msg sr_msg;
+ struct malloc_mp_req *sr =
+ (struct malloc_mp_req *)sr_msg.param;
+ struct timespec ts;
+
+ memset(&sr_msg, 0, sizeof(sr_msg));
+
+ /* we can do something, so send sync request asynchronously */
+ sr_msg.num_fds = 0;
+ sr_msg.len_param = sizeof(*sr);
+ snprintf(sr_msg.name, sizeof(sr_msg.name), "%s",
+ MP_ACTION_SYNC);
+
+ ts.tv_nsec = 0;
+ ts.tv_sec = MP_TIMEOUT_S;
+
+ /* sync requests carry no data */
+ sr->t = REQ_TYPE_SYNC;
+ sr->id = m->id;
+
+ /* there may be stray timeout still waiting */
+ do {
+ ret = rte_mp_request_async(&sr_msg, &ts,
+ handle_sync_response);
+ } while (ret != 0 && rte_errno == EEXIST);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Couldn't send sync request\n");
+ if (m->t == REQ_TYPE_ALLOC)
+ free(entry->alloc_state.ms);
+ goto fail;
+ }
+
+ /* mark request as in progress */
+ memcpy(&entry->user_req, m, sizeof(*m));
+ entry->state = REQ_STATE_ACTIVE;
+
+ TAILQ_INSERT_TAIL(&mp_request_list.list, entry, next);
+ }
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return 0;
+fail:
+ pthread_mutex_unlock(&mp_request_list.lock);
+ free(entry);
+ return -1;
+}
+
+/* callback for asynchronous sync requests for primary. this will either do a
+ * sendmsg with results, or trigger rollback request.
+ */
+static int
+handle_sync_response(const struct rte_mp_msg *request,
+ const struct rte_mp_reply *reply)
+{
+ enum malloc_req_result result;
+ struct mp_request *entry;
+ const struct malloc_mp_req *mpreq =
+ (const struct malloc_mp_req *)request->param;
+ int i;
+
+ /* lock the request */
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ entry = find_request_by_id(mpreq->id);
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "Wrong request ID\n");
+ goto fail;
+ }
+
+ result = REQ_RESULT_SUCCESS;
+
+ if (reply->nb_received != reply->nb_sent)
+ result = REQ_RESULT_FAIL;
+
+ for (i = 0; i < reply->nb_received; i++) {
+ struct malloc_mp_req *resp =
+ (struct malloc_mp_req *)reply->msgs[i].param;
+
+ if (resp->t != REQ_TYPE_SYNC) {
+ RTE_LOG(ERR, EAL, "Unexpected response to sync request\n");
+ result = REQ_RESULT_FAIL;
+ break;
+ }
+ if (resp->id != entry->user_req.id) {
+ RTE_LOG(ERR, EAL, "Response to wrong sync request\n");
+ result = REQ_RESULT_FAIL;
+ break;
+ }
+ if (resp->result == REQ_RESULT_FAIL) {
+ result = REQ_RESULT_FAIL;
+ break;
+ }
+ }
+
+ if (entry->user_req.t == REQ_TYPE_FREE) {
+ struct rte_mp_msg msg;
+ struct malloc_mp_req *resp = (struct malloc_mp_req *)msg.param;
+
+ memset(&msg, 0, sizeof(msg));
+
+ /* this is a free request, just sendmsg result */
+ resp->t = REQ_TYPE_FREE;
+ resp->result = result;
+ resp->id = entry->user_req.id;
+ msg.num_fds = 0;
+ msg.len_param = sizeof(*resp);
+ snprintf(msg.name, sizeof(msg.name), "%s", MP_ACTION_RESPONSE);
+
+ if (rte_mp_sendmsg(&msg))
+ RTE_LOG(ERR, EAL, "Could not send message to secondary process\n");
+
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(entry);
+ } else if (entry->user_req.t == REQ_TYPE_ALLOC &&
+ result == REQ_RESULT_SUCCESS) {
+ struct malloc_heap *heap = entry->alloc_state.heap;
+ struct rte_mp_msg msg;
+ struct malloc_mp_req *resp =
+ (struct malloc_mp_req *)msg.param;
+
+ memset(&msg, 0, sizeof(msg));
+
+ heap->total_size += entry->alloc_state.map_len;
+
+ /* result is success, so just notify secondary about this */
+ resp->t = REQ_TYPE_ALLOC;
+ resp->result = result;
+ resp->id = entry->user_req.id;
+ msg.num_fds = 0;
+ msg.len_param = sizeof(*resp);
+ snprintf(msg.name, sizeof(msg.name), "%s", MP_ACTION_RESPONSE);
+
+ if (rte_mp_sendmsg(&msg))
+ RTE_LOG(ERR, EAL, "Could not send message to secondary process\n");
+
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(entry->alloc_state.ms);
+ free(entry);
+ } else if (entry->user_req.t == REQ_TYPE_ALLOC &&
+ result == REQ_RESULT_FAIL) {
+ struct rte_mp_msg rb_msg;
+ struct malloc_mp_req *rb =
+ (struct malloc_mp_req *)rb_msg.param;
+ struct timespec ts;
+ struct primary_alloc_req_state *state =
+ &entry->alloc_state;
+ int ret;
+
+ memset(&rb_msg, 0, sizeof(rb_msg));
+
+ /* we've failed to sync, so do a rollback */
+ rollback_expand_heap(state->ms, state->ms_len, state->elem,
+ state->map_addr, state->map_len);
+
+ /* send rollback request */
+ rb_msg.num_fds = 0;
+ rb_msg.len_param = sizeof(*rb);
+ snprintf(rb_msg.name, sizeof(rb_msg.name), "%s",
+ MP_ACTION_ROLLBACK);
+
+ ts.tv_nsec = 0;
+ ts.tv_sec = MP_TIMEOUT_S;
+
+ /* sync requests carry no data */
+ rb->t = REQ_TYPE_SYNC;
+ rb->id = entry->user_req.id;
+
+ /* there may be stray timeout still waiting */
+ do {
+ ret = rte_mp_request_async(&rb_msg, &ts,
+ handle_rollback_response);
+ } while (ret != 0 && rte_errno == EEXIST);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Could not send rollback request to secondary process\n");
+
+ /* we couldn't send rollback request, but that's OK -
+ * secondary will time out, and memory has been removed
+ * from heap anyway.
+ */
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(state->ms);
+ free(entry);
+ goto fail;
+ }
+ } else {
+ RTE_LOG(ERR, EAL, " to sync request of unknown type\n");
+ goto fail;
+ }
+
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return 0;
+fail:
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return -1;
+}
+
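+/* callback for asynchronous rollback requests in the primary. the allocation
+ * has already been rolled back, so just report failure to the secondary.
+ */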
+static int
+handle_rollback_response(const struct rte_mp_msg *request,
+ const struct rte_mp_reply *reply __rte_unused)
+{
+ struct rte_mp_msg msg;
+ struct malloc_mp_req *resp = (struct malloc_mp_req *)msg.param;
+ const struct malloc_mp_req *mpreq =
+ (const struct malloc_mp_req *)request->param;
+ struct mp_request *entry;
+
+ /* lock the request */
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ memset(&msg, 0, sizeof(msg));
+
+ entry = find_request_by_id(mpreq->id);
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "Wrong request ID\n");
+ goto fail;
+ }
+
+ if (entry->user_req.t != REQ_TYPE_ALLOC) {
+ RTE_LOG(ERR, EAL, "Unexpected active request\n");
+ goto fail;
+ }
+
+ /* we don't care if rollback succeeded, request still failed */
+ resp->t = REQ_TYPE_ALLOC;
+ resp->result = REQ_RESULT_FAIL;
+ resp->id = mpreq->id;
+ msg.num_fds = 0;
+ msg.len_param = sizeof(*resp);
+ snprintf(msg.name, sizeof(msg.name), "%s", MP_ACTION_RESPONSE);
+
+ if (rte_mp_sendmsg(&msg))
+ RTE_LOG(ERR, EAL, "Could not send message to secondary process\n");
+
+ /* clean up */
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(entry->alloc_state.ms);
+ free(entry);
+
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return 0;
+fail:
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return -1;
+}
+
+/* final stage of the request from secondary */
+static int
+handle_response(const struct rte_mp_msg *msg, const void *peer __rte_unused)
+{
+ const struct malloc_mp_req *m =
+ (const struct malloc_mp_req *)msg->param;
+ struct mp_request *entry;
+
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ entry = find_request_by_id(m->id);
+ if (entry != NULL) {
+ /* update request status */
+ entry->user_req.result = m->result;
+
+ entry->state = REQ_STATE_COMPLETE;
+
+ /* trigger thread wakeup */
+ pthread_cond_signal(&entry->cond);
+ }
+
+ pthread_mutex_unlock(&mp_request_list.lock);
+
+ return 0;
+}
+
+/* synchronously request memory map sync, this is only called whenever primary
+ * process initiates the allocation.
+ */
+int
+request_sync(void)
+{
+ struct rte_mp_msg msg;
+ struct rte_mp_reply reply;
+ struct malloc_mp_req *req = (struct malloc_mp_req *)msg.param;
+ struct timespec ts;
+ int i, ret;
+
+ memset(&msg, 0, sizeof(msg));
+ memset(&reply, 0, sizeof(reply));
+
+ /* no need to create tailq entries as this is entirely synchronous */
+
+ msg.num_fds = 0;
+ msg.len_param = sizeof(*req);
+ snprintf(msg.name, sizeof(msg.name), "%s", MP_ACTION_SYNC);
+
+ /* sync request carries no data */
+ req->t = REQ_TYPE_SYNC;
+ req->id = get_unique_id();
+
+ ts.tv_nsec = 0;
+ ts.tv_sec = MP_TIMEOUT_S;
+
+ /* there may be stray timeout still waiting */
+ do {
+ ret = rte_mp_request_sync(&msg, &reply, &ts);
+ } while (ret != 0 && rte_errno == EEXIST);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Could not send sync request to secondary process\n");
+ ret = -1;
+ goto out;
+ }
+
+ if (reply.nb_received != reply.nb_sent) {
+ RTE_LOG(ERR, EAL, "Not all secondaries have responded\n");
+ ret = -1;
+ goto out;
+ }
+
+ for (i = 0; i < reply.nb_received; i++) {
+ struct malloc_mp_req *resp =
+ (struct malloc_mp_req *)reply.msgs[i].param;
+ if (resp->t != REQ_TYPE_SYNC) {
+ RTE_LOG(ERR, EAL, "Unexpected response from secondary\n");
+ ret = -1;
+ goto out;
+ }
+ if (resp->id != req->id) {
+ RTE_LOG(ERR, EAL, "Wrong request ID\n");
+ ret = -1;
+ goto out;
+ }
+ if (resp->result != REQ_RESULT_SUCCESS) {
+ RTE_LOG(ERR, EAL, "Secondary process failed to synchronize\n");
+ ret = -1;
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ free(reply.msgs);
+ return ret;
+}
+
+/* this is a synchronous wrapper around a bunch of asynchronous requests to
+ * primary process. this will initiate a request and wait until responses come.
+ */
+int
+request_to_primary(struct malloc_mp_req *user_req)
+{
+ struct rte_mp_msg msg;
+ struct malloc_mp_req *msg_req = (struct malloc_mp_req *)msg.param;
+ struct mp_request *entry;
+ struct timespec ts;
+ struct timeval now;
+ int ret;
+
+ memset(&msg, 0, sizeof(msg));
+ memset(&ts, 0, sizeof(ts));
+
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ entry = malloc(sizeof(*entry));
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memory for request\n");
+ goto fail;
+ }
+
+ memset(entry, 0, sizeof(*entry));
+
+ if (gettimeofday(&now, NULL) < 0) {
+ RTE_LOG(ERR, EAL, "Cannot get current time\n");
+ goto fail;
+ }
+
+ ts.tv_nsec = (now.tv_usec * 1000) % 1000000000;
+ ts.tv_sec = now.tv_sec + MP_TIMEOUT_S +
+ (now.tv_usec * 1000) / 1000000000;
+
+ /* initialize the request */
+ pthread_cond_init(&entry->cond, NULL);
+
+ msg.num_fds = 0;
+ msg.len_param = sizeof(*msg_req);
+ snprintf(msg.name, sizeof(msg.name), "%s", MP_ACTION_REQUEST);
+
+ /* (attempt to) get a unique id */
+ user_req->id = get_unique_id();
+
+ /* copy contents of user request into the message */
+ memcpy(msg_req, user_req, sizeof(*msg_req));
+
+ if (rte_mp_sendmsg(&msg)) {
+ RTE_LOG(ERR, EAL, "Cannot send message to primary\n");
+ goto fail;
+ }
+
+ /* copy contents of user request into active request */
+ memcpy(&entry->user_req, user_req, sizeof(*user_req));
+
+ /* mark request as in progress */
+ entry->state = REQ_STATE_ACTIVE;
+
+ TAILQ_INSERT_TAIL(&mp_request_list.list, entry, next);
+
+ /* finally, wait on timeout */
+ do {
+ ret = pthread_cond_timedwait(&entry->cond,
+ &mp_request_list.lock, &ts);
+ } while (ret != 0 && ret != ETIMEDOUT);
+
+ if (entry->state != REQ_STATE_COMPLETE) {
+ RTE_LOG(ERR, EAL, "Request timed out\n");
+ ret = -1;
+ } else {
+ ret = 0;
+ user_req->result = entry->user_req.result;
+ }
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(entry);
+
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return ret;
+fail:
+ pthread_mutex_unlock(&mp_request_list.lock);
+ free(entry);
+ return -1;
+}
+
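+/* register IPC action callbacks appropriate for this process type */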
+int
+register_mp_requests(void)
+{
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ if (rte_mp_action_register(MP_ACTION_REQUEST, handle_request)) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ MP_ACTION_REQUEST);
+ return -1;
+ }
+ } else {
+ if (rte_mp_action_register(MP_ACTION_SYNC, handle_sync)) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ MP_ACTION_SYNC);
+ return -1;
+ }
+ if (rte_mp_action_register(MP_ACTION_ROLLBACK, handle_sync)) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ MP_ACTION_ROLLBACK);
+ return -1;
+ }
+ if (rte_mp_action_register(MP_ACTION_RESPONSE,
+ handle_response)) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ MP_ACTION_RESPONSE);
+ return -1;
+ }
+ }
+ return 0;
+}
diff --git a/lib/librte_eal/common/malloc_mp.h b/lib/librte_eal/common/malloc_mp.h
new file mode 100644
index 0000000..6810b4f
--- /dev/null
+++ b/lib/librte_eal/common/malloc_mp.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef MALLOC_MP_H
+#define MALLOC_MP_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <rte_common.h>
+#include <rte_random.h>
+#include <rte_spinlock.h>
+#include <rte_tailq.h>
+
+/* forward declarations */
+struct malloc_heap;
+struct rte_memseg;
+
+/* multiprocess synchronization structures for malloc */
+enum malloc_req_type {
+ REQ_TYPE_ALLOC, /**< ask primary to allocate */
+ REQ_TYPE_FREE, /**< ask primary to free */
+ REQ_TYPE_SYNC /**< ask secondary to synchronize its memory map */
+};
+
+enum malloc_req_result {
+ REQ_RESULT_SUCCESS,
+ REQ_RESULT_FAIL
+};
+
+struct malloc_req_alloc {
+ struct malloc_heap *heap;
+ uint64_t page_sz;
+ size_t elt_size;
+ int socket;
+ unsigned int flags;
+ size_t align;
+ size_t bound;
+ bool contig;
+};
+
+struct malloc_req_free {
+ RTE_STD_C11
+ union {
+ void *addr;
+ uint64_t addr_64;
+ };
+ uint64_t len;
+};
+
+struct malloc_mp_req {
+ enum malloc_req_type t;
+ RTE_STD_C11
+ union {
+ struct malloc_req_alloc alloc_req;
+ struct malloc_req_free free_req;
+ };
+ uint64_t id; /**< not to be populated by caller */
+ enum malloc_req_result result;
+};
+
+int
+register_mp_requests(void);
+
+int
+request_to_primary(struct malloc_mp_req *req);
+
+/* synchronous memory map sync request */
+int
+request_sync(void);
+
+/* functions from malloc_heap exposed here */
+int
+malloc_heap_free_pages(void *aligned_start, size_t aligned_len);
+
+struct malloc_elem *
+alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
+ int socket, unsigned int flags, size_t align, size_t bound,
+ bool contig, struct rte_memseg **ms, int n_segs);
+
+void
+rollback_expand_heap(struct rte_memseg **ms, int n_segs,
+ struct malloc_elem *elem, void *map_addr, size_t map_len);
+
+#endif /* MALLOC_MP_H */
diff --git a/lib/librte_eal/common/meson.build b/lib/librte_eal/common/meson.build
index a1ada24..8a3dcfe 100644
--- a/lib/librte_eal/common/meson.build
+++ b/lib/librte_eal/common/meson.build
@@ -27,6 +27,7 @@ common_sources = files(
'eal_common_timer.c',
'malloc_elem.c',
'malloc_heap.c',
+ 'malloc_mp.c',
'rte_keepalive.c',
'rte_malloc.c',
'rte_reciprocal.c',
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index 5380ba8..542bf7e 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -67,6 +67,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_fbarray.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_mp.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_keepalive.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_service.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_reciprocal.c
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index ce242b1..8202a1b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -211,6 +211,32 @@ is_zero_length(int fd)
return st.st_blocks == 0;
}

+/* we cannot use rte_memseg_list_walk() here because we will be holding a
+ * write lock whenever we enter any function in this file; however, copying
+ * the same iteration code everywhere is not ideal either, so use a lockless
+ * copy of the memseg list walk here.
+ */
+static int
+memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int i, ret = 0;
+
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+
+ if (msl->base_va == NULL)
+ continue;
+
+ ret = func(msl, arg);
+ if (ret < 0)
+ return -1;
+ if (ret > 0)
+ return 1;
+ }
+ return 0;
+}
+
static int
get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
unsigned int list_idx, unsigned int seg_idx)
@@ -739,7 +765,7 @@ eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
wa.socket = socket;
wa.segs_allocated = 0;

- ret = rte_memseg_list_walk(alloc_seg_walk, &wa);
+ ret = memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
if (ret == 0) {
RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
__func__);
@@ -798,7 +824,7 @@ eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
wa.ms = cur;
wa.hi = hi;

- walk_res = rte_memseg_list_walk(free_seg_walk, &wa);
+ walk_res = memseg_list_walk_thread_unsafe(free_seg_walk, &wa);
if (walk_res == 1)
continue;
if (walk_res == 0)
@@ -1055,7 +1081,7 @@ eal_memalloc_sync_with_primary(void)
if (rte_eal_process_type() == RTE_PROC_PRIMARY)
return 0;

- if (rte_memseg_list_walk(sync_walk, NULL))
+ if (memseg_list_walk_thread_unsafe(sync_walk, NULL))
return -1;
return 0;
}
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 2c27063..cbfe3c9 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -53,6 +53,42 @@ static const struct vfio_iommu_type iommu_types[] = {
},
};

+/* for sPAPR IOMMU, we will need to walk the memseg list, but we cannot use
+ * rte_memseg_walk() because by the time we enter the callback we will be
+ * holding a write lock, so a regular rte_memseg_walk() would deadlock. copying
+ * the same iteration code everywhere is not ideal either, so use a lockless
+ * copy of the memseg walk here.
+ */
+static int
+memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int i, ms_idx, ret = 0;
+
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ const struct rte_memseg *ms;
+ struct rte_fbarray *arr;
+
+ if (msl->memseg_arr.count == 0)
+ continue;
+
+ arr = &msl->memseg_arr;
+
+ ms_idx = rte_fbarray_find_next_used(arr, 0);
+ while (ms_idx >= 0) {
+ ms = rte_fbarray_get(arr, ms_idx);
+ ret = func(msl, ms, arg);
+ if (ret < 0)
+ return -1;
+ if (ret > 0)
+ return 1;
+ ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
+ }
+ }
+ return 0;
+}
+
int
vfio_get_group_fd(int iommu_group_no)
{
@@ -884,7 +920,7 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
/* check if window size needs to be adjusted */
memset(&param, 0, sizeof(param));

- if (rte_memseg_walk(vfio_spapr_window_size_walk, &param) < 0) {
+ if (memseg_walk_thread_unsafe(vfio_spapr_window_size_walk, &param) < 0) {
RTE_LOG(ERR, EAL, "Could not get window size\n");
return -1;
}
@@ -902,7 +938,7 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
return -1;
}
- if (rte_memseg_walk(vfio_spapr_map_walk,
+ if (memseg_walk_thread_unsafe(vfio_spapr_map_walk,
&vfio_container_fd) < 0) {
RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n");
return -1;
--
2.7.4