Discussion:
[dpdk-dev] [PATCH 05/41] test: add command to dump malloc heap contents
Anatoly Burakov
2018-03-03 13:45:53 UTC
Signed-off-by: Anatoly Burakov <***@intel.com>
---
test/test/commands.c | 3 +++
1 file changed, 3 insertions(+)

diff --git a/test/test/commands.c b/test/test/commands.c
index cf0b726..6bfdc02 100644
--- a/test/test/commands.c
+++ b/test/test/commands.c
@@ -137,6 +137,8 @@ static void cmd_dump_parsed(void *parsed_result,
rte_log_dump(stdout);
else if (!strcmp(res->dump, "dump_malloc_stats"))
rte_malloc_dump_stats(stdout, NULL);
+ else if (!strcmp(res->dump, "dump_malloc_heaps"))
+ rte_malloc_dump_heaps(stdout);
}

cmdline_parse_token_string_t cmd_dump_dump =
@@ -147,6 +149,7 @@ cmdline_parse_token_string_t cmd_dump_dump =
"dump_ring#"
"dump_mempool#"
"dump_malloc_stats#"
+ "dump_malloc_heaps#"
"dump_devargs#"
"dump_log_types");
--
2.7.4
Anatoly Burakov
2018-03-03 13:45:54 UTC
We need this function to join newly allocated segments with the heap.
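
To illustrate the intended use, a minimal hypothetical sketch (not code from this patch, and with heap locking omitted) that strings together helpers introduced elsewhere in this series:

/* hypothetical sketch: add a newly allocated memseg to a heap and
 * merge it with any adjacent free space.
 */
static void
heap_add_new_memseg(struct malloc_heap *heap, struct rte_memseg *ms)
{
        struct malloc_elem *elem = ms->addr;

        /* set up a header covering the whole segment */
        malloc_elem_init(elem, heap, ms, ms->len - MALLOC_ELEM_OVERHEAD);
        /* hook it into the heap's element list */
        malloc_elem_insert(elem);
        /* merge with neighbouring free elements, if any are adjacent */
        elem = malloc_elem_join_adjacent_free(elem);
        /* make the (possibly merged) element available for allocation */
        malloc_elem_free_list_insert(elem);
}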

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/malloc_elem.c | 6 +++---
lib/librte_eal/common/malloc_elem.h | 3 +++
2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index e02ed88..2291ee1 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -325,8 +325,8 @@ join_elem(struct malloc_elem *elem1, struct malloc_elem *elem2)
elem1->next = next;
}

-static struct malloc_elem *
-elem_join_adjacent_free(struct malloc_elem *elem)
+struct malloc_elem *
+malloc_elem_join_adjacent_free(struct malloc_elem *elem)
{
/*
* check if next element exists, is adjacent and is free, if so join
@@ -388,7 +388,7 @@ malloc_elem_free(struct malloc_elem *elem)
ptr = RTE_PTR_ADD(elem, sizeof(*elem));
data_len = elem->size - MALLOC_ELEM_OVERHEAD;

- elem = elem_join_adjacent_free(elem);
+ elem = malloc_elem_join_adjacent_free(elem);

malloc_elem_free_list_insert(elem);

diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 40e8eb5..99921d2 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -141,6 +141,9 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size,
int
malloc_elem_free(struct malloc_elem *elem);

+struct malloc_elem *
+malloc_elem_join_adjacent_free(struct malloc_elem *elem);
+
/*
* attempt to resize a malloc_elem by expanding into any free space
* immediately after it in memory.
--
2.7.4
Anatoly Burakov
2018-03-03 13:45:49 UTC
Move get_virtual_area out of linuxapp EAL memory and make it
common to the EAL, so that other code can reserve virtual areas
as well.
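
For illustration, a minimal sketch of calling the new common function (hypothetical usage based only on the prototype and flags added below, not code from the patch):

static int
reserve_va_space(void)
{
        uint64_t size = 1ULL << 30; /* ask for 1 GiB of address space */
        void *va;

        /* reserve a 2 MB-aligned region at any address, allow it to
         * shrink if the full size cannot be mapped, and unmap it right
         * away so that real pages can later be mapped into the range
         * (this is how map_all_hugepages() uses it below).
         */
        va = eal_get_virtual_area(NULL, &size, RTE_PGSIZE_2M,
                        EAL_VIRTUAL_AREA_ALLOW_SHRINK |
                        EAL_VIRTUAL_AREA_UNMAP, 0);
        if (va == NULL)
                return -1; /* rte_errno holds the failure reason */
        return 0;
}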

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/eal_common_memory.c | 101 ++++++++++++++++++++++
lib/librte_eal/common/eal_private.h | 33 +++++++
lib/librte_eal/linuxapp/eal/eal_memory.c | 137 ++++++------------------------
3 files changed, 161 insertions(+), 110 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index 852f3bb..042881b 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -2,10 +2,12 @@
* Copyright(c) 2010-2014 Intel Corporation
*/

+#include <errno.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdarg.h>
+#include <string.h>
#include <unistd.h>
#include <inttypes.h>
#include <sys/mman.h>
@@ -14,12 +16,111 @@
#include <rte_memory.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
+#include <rte_errno.h>
#include <rte_log.h>

#include "eal_private.h"
#include "eal_internal_cfg.h"

/*
+ * Try to mmap *size bytes in /dev/zero. If it is successful, return the
+ * pointer to the mmap'd area and keep *size unmodified. Else, retry
+ * with a smaller zone: decrease *size by hugepage_sz until it reaches
+ * 0. In this case, return NULL. Note: this function returns an address
+ * which is a multiple of hugepage size.
+ */
+
+static uint64_t baseaddr_offset;
+static uint64_t system_page_sz;
+
+void *
+eal_get_virtual_area(void *requested_addr, uint64_t *size,
+ uint64_t page_sz, int flags, int mmap_flags)
+{
+ bool addr_is_hint, allow_shrink, unmap, no_align;
+ uint64_t map_sz;
+ void *mapped_addr, *aligned_addr;
+
+ if (system_page_sz == 0)
+ system_page_sz = sysconf(_SC_PAGESIZE);
+
+ mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS;
+
+ RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);
+
+ addr_is_hint = (flags & EAL_VIRTUAL_AREA_ADDR_IS_HINT) > 0;
+ allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0;
+ unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0;
+
+ if (requested_addr == NULL && internal_config.base_virtaddr != 0) {
+ requested_addr = (void *) (internal_config.base_virtaddr +
+ baseaddr_offset);
+ requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz);
+ addr_is_hint = true;
+ }
+
+ /* if requested address is not aligned by page size, or if requested
+ * address is NULL, add page size to requested length as we may get an
+ * address that's aligned by system page size, which can be smaller than
+ * our requested page size. additionally, we shouldn't try to align if
+ * system page size is the same as requested page size.
+ */
+ no_align = (requested_addr != NULL &&
+ ((uintptr_t)requested_addr & (page_sz - 1)) == 0) ||
+ page_sz == system_page_sz;
+
+ do {
+ map_sz = no_align ? *size : *size + page_sz;
+
+ mapped_addr = mmap(requested_addr, map_sz, PROT_READ,
+ mmap_flags, -1, 0);
+ if (mapped_addr == MAP_FAILED && allow_shrink)
+ *size -= page_sz;
+ } while (allow_shrink && mapped_addr == MAP_FAILED && *size > 0);
+
+ /* align resulting address - if map failed, we will ignore the value
+ * anyway, so no need to add additional checks.
+ */
+ aligned_addr = no_align ? mapped_addr :
+ RTE_PTR_ALIGN(mapped_addr, page_sz);
+
+ if (*size == 0) {
+ RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n",
+ strerror(errno));
+ rte_errno = errno;
+ return NULL;
+ } else if (mapped_addr == MAP_FAILED) {
+ RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
+ strerror(errno));
+ /* pass errno up the call chain */
+ rte_errno = errno;
+ return NULL;
+ } else if (requested_addr != NULL && !addr_is_hint &&
+ aligned_addr != requested_addr) {
+ RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n",
+ requested_addr, aligned_addr);
+ munmap(mapped_addr, map_sz);
+ rte_errno = EADDRNOTAVAIL;
+ return NULL;
+ } else if (requested_addr != NULL && addr_is_hint &&
+ aligned_addr != requested_addr) {
+ RTE_LOG(WARNING, EAL, "WARNING! Base virtual address hint (%p != %p) not respected!\n",
+ requested_addr, aligned_addr);
+ RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory into secondary processes\n");
+ }
+
+ if (unmap)
+ munmap(mapped_addr, map_sz);
+
+ RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n",
+ aligned_addr, *size);
+
+ baseaddr_offset += *size;
+
+ return aligned_addr;
+}
+
+/*
* Return a pointer to a read-only table of struct rte_physmem_desc
* elements, containing the layout of all addressable physical
* memory. The last element of the table contains a NULL address.
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index 0b28770..96cebb7 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -127,6 +127,39 @@ int rte_eal_alarm_init(void);
int rte_eal_check_module(const char *module_name);

/**
+ * Get virtual area of specified size from the OS.
+ *
+ * This function is private to the EAL.
+ *
+ * @param requested_addr
+ * Address where to request address space.
+ * @param size
+ * Size of requested area.
+ * @param page_sz
+ * Page size on which to align requested virtual area.
+ * @param flags
+ * EAL_VIRTUAL_AREA_* flags.
+ * @param mmap_flags
+ * Extra flags passed directly to mmap().
+ *
+ * @return
+ * Virtual area address if successful.
+ * NULL if unsuccessful.
+ */
+
+#define EAL_VIRTUAL_AREA_ADDR_IS_HINT (1 << 0)
+/**< don't fail if cannot get exact requested address. */
+#define EAL_VIRTUAL_AREA_ALLOW_SHRINK (1 << 1)
+/**< try getting smaller sized (decrement by page size) virtual areas if cannot
+ * get area of requested size.
+ */
+#define EAL_VIRTUAL_AREA_UNMAP (1 << 2)
+/**< immediately unmap reserved virtual area. */
+void *
+eal_get_virtual_area(void *requested_addr, uint64_t *size,
+ uint64_t page_sz, int flags, int mmap_flags);
+
+/**
* Get cpu core_id.
*
* This function is private to the EAL.
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 38853b7..5c11d77 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -28,6 +28,7 @@
#include <numaif.h>
#endif

+#include <rte_errno.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_launch.h>
@@ -57,8 +58,6 @@
* zone as well as a physical contiguous zone.
*/

-static uint64_t baseaddr_offset;
-
static bool phys_addrs_available = true;

#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
@@ -221,82 +220,6 @@ aslr_enabled(void)
}
}

-/*
- * Try to mmap *size bytes in /dev/zero. If it is successful, return the
- * pointer to the mmap'd area and keep *size unmodified. Else, retry
- * with a smaller zone: decrease *size by hugepage_sz until it reaches
- * 0. In this case, return NULL. Note: this function returns an address
- * which is a multiple of hugepage size.
- */
-static void *
-get_virtual_area(size_t *size, size_t hugepage_sz)
-{
- void *addr;
- void *addr_hint;
- int fd;
- long aligned_addr;
-
- if (internal_config.base_virtaddr != 0) {
- int page_size = sysconf(_SC_PAGE_SIZE);
- addr_hint = (void *) (uintptr_t)
- (internal_config.base_virtaddr + baseaddr_offset);
- addr_hint = RTE_PTR_ALIGN_FLOOR(addr_hint, page_size);
- } else {
- addr_hint = NULL;
- }
-
- RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);
-
-
- fd = open("/dev/zero", O_RDONLY);
- if (fd < 0){
- RTE_LOG(ERR, EAL, "Cannot open /dev/zero\n");
- return NULL;
- }
- do {
- addr = mmap(addr_hint, (*size) + hugepage_sz, PROT_READ,
-#ifdef RTE_ARCH_PPC_64
- MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-#else
- MAP_PRIVATE,
-#endif
- fd, 0);
- if (addr == MAP_FAILED) {
- *size -= hugepage_sz;
- } else if (addr_hint != NULL && addr != addr_hint) {
- RTE_LOG(WARNING, EAL, "WARNING! Base virtual address "
- "hint (%p != %p) not respected!\n",
- addr_hint, addr);
- RTE_LOG(WARNING, EAL, " This may cause issues with "
- "mapping memory into secondary processes\n");
- }
- } while (addr == MAP_FAILED && *size > 0);
-
- if (addr == MAP_FAILED) {
- close(fd);
- RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
- strerror(errno));
- return NULL;
- }
-
- munmap(addr, (*size) + hugepage_sz);
- close(fd);
-
- /* align addr to a huge page size boundary */
- aligned_addr = (long)addr;
- aligned_addr += (hugepage_sz - 1);
- aligned_addr &= (~(hugepage_sz - 1));
- addr = (void *)(aligned_addr);
-
- RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n",
- addr, *size);
-
- /* increment offset */
- baseaddr_offset += *size;
-
- return addr;
-}
-
static sigjmp_buf huge_jmpenv;

static void huge_sigbus_handler(int signo __rte_unused)
@@ -445,7 +368,16 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
/* get the biggest virtual memory area up to
* vma_len. If it fails, vma_addr is NULL, so
* let the kernel provide the address. */
- vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz);
+ vma_addr = eal_get_virtual_area(NULL, &vma_len,
+ hpi->hugepage_sz,
+ EAL_VIRTUAL_AREA_ALLOW_SHRINK |
+ EAL_VIRTUAL_AREA_UNMAP,
+#ifdef RTE_ARCH_PPC_64
+ MAP_HUGETLB
+#else
+ 0
+#endif
+ );
if (vma_addr == NULL)
vma_len = hugepage_sz;
}
@@ -1339,7 +1271,7 @@ rte_eal_hugepage_attach(void)
unsigned i, s = 0; /* s used to track the segment number */
unsigned max_seg = RTE_MAX_MEMSEG;
off_t size = 0;
- int fd, fd_zero = -1, fd_hugepage = -1;
+ int fd, fd_hugepage = -1;

if (aslr_enabled() > 0) {
RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization "
@@ -1350,11 +1282,6 @@ rte_eal_hugepage_attach(void)

test_phys_addrs_available();

- fd_zero = open("/dev/zero", O_RDONLY);
- if (fd_zero < 0) {
- RTE_LOG(ERR, EAL, "Could not open /dev/zero\n");
- goto error;
- }
fd_hugepage = open(eal_hugepage_info_path(), O_RDONLY);
if (fd_hugepage < 0) {
RTE_LOG(ERR, EAL, "Could not open %s\n", eal_hugepage_info_path());
@@ -1364,6 +1291,8 @@ rte_eal_hugepage_attach(void)
/* map all segments into memory to make sure we get the addrs */
for (s = 0; s < RTE_MAX_MEMSEG; ++s) {
void *base_addr;
+ uint64_t mmap_sz;
+ int mmap_flags = 0;

/*
* the first memory segment with len==0 is the one that
@@ -1372,35 +1301,26 @@ rte_eal_hugepage_attach(void)
if (mcfg->memseg[s].len == 0)
break;

- /*
- * fdzero is mmapped to get a contiguous block of virtual
- * addresses of the appropriate memseg size.
- * use mmap to get identical addresses as the primary process.
+ /* get identical addresses as the primary process.
*/
- base_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len,
- PROT_READ,
#ifdef RTE_ARCH_PPC_64
- MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-#else
- MAP_PRIVATE,
+ mmap_flags |= MAP_HUGETLB;
#endif
- fd_zero, 0);
- if (base_addr == MAP_FAILED ||
- base_addr != mcfg->memseg[s].addr) {
+ mmap_sz = mcfg->memseg[s].len;
+ base_addr = eal_get_virtual_area(mcfg->memseg[s].addr,
+ &mmap_sz, mcfg->memseg[s].hugepage_sz, 0,
+ mmap_flags);
+ if (base_addr == NULL) {
max_seg = s;
- if (base_addr != MAP_FAILED) {
- /* errno is stale, don't use */
- RTE_LOG(ERR, EAL, "Could not mmap %llu bytes "
- "in /dev/zero at [%p], got [%p] - "
- "please use '--base-virtaddr' option\n",
+ if (rte_errno == EADDRNOTAVAIL) {
+ RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n",
(unsigned long long)mcfg->memseg[s].len,
- mcfg->memseg[s].addr, base_addr);
- munmap(base_addr, mcfg->memseg[s].len);
+ mcfg->memseg[s].addr);
} else {
- RTE_LOG(ERR, EAL, "Could not mmap %llu bytes "
- "in /dev/zero at [%p]: '%s'\n",
+ RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p]: '%s'\n",
(unsigned long long)mcfg->memseg[s].len,
- mcfg->memseg[s].addr, strerror(errno));
+ mcfg->memseg[s].addr,
+ rte_strerror(rte_errno));
}
if (aslr_enabled() > 0) {
RTE_LOG(ERR, EAL, "It is recommended to "
@@ -1465,7 +1385,6 @@ rte_eal_hugepage_attach(void)
}
/* unmap the hugepage config file, since we are done using it */
munmap(hp, size);
- close(fd_zero);
close(fd_hugepage);
return 0;

@@ -1474,8 +1393,6 @@ rte_eal_hugepage_attach(void)
munmap(mcfg->memseg[i].addr, mcfg->memseg[i].len);
if (hp != NULL && hp != MAP_FAILED)
munmap(hp, size);
- if (fd_zero >= 0)
- close(fd_zero);
if (fd_hugepage >= 0)
close(fd_hugepage);
return -1;
--
2.7.4
Anatoly Burakov
2018-03-03 13:45:51 UTC
As we are preparing for dynamic memory allocation, we need to be
able to handle holes in our malloc heap, so we are switching to a
doubly linked list and preparing the infrastructure to support it.

Since our heap is now aware of where its first and last elements are,
there is no longer any need for a dummy element at the end of each
heap, so get rid of that as well. Instead, let insert/remove/
join/split operations handle end-of-list conditions automatically.
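
As a small illustration of what the first/last bookkeeping enables (a hypothetical sketch, not part of the patch; it assumes malloc_heap.h/malloc_elem.h and <stdio.h>), the whole heap, holes included, can now be walked element by element:

/* hypothetical sketch: walk every element in a heap via the new
 * doubly linked list, instead of relying on the old end-of-memseg
 * dummy element.
 */
static void
heap_walk(const struct malloc_heap *heap)
{
        const struct malloc_elem *elem;

        for (elem = heap->first; elem != NULL; elem = elem->next)
                printf("elem %p: size=%zu state=%d\n",
                        (const void *)elem, elem->size,
                        (int)elem->state);
}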

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/include/rte_malloc_heap.h | 6 +
lib/librte_eal/common/malloc_elem.c | 200 +++++++++++++++++++-----
lib/librte_eal/common/malloc_elem.h | 14 +-
lib/librte_eal/common/malloc_heap.c | 8 +-
4 files changed, 179 insertions(+), 49 deletions(-)

diff --git a/lib/librte_eal/common/include/rte_malloc_heap.h b/lib/librte_eal/common/include/rte_malloc_heap.h
index ba99ed9..9ec4b62 100644
--- a/lib/librte_eal/common/include/rte_malloc_heap.h
+++ b/lib/librte_eal/common/include/rte_malloc_heap.h
@@ -13,12 +13,18 @@
/* Number of free lists per heap, grouped by size. */
#define RTE_HEAP_NUM_FREELISTS 13

+/* dummy definition, for pointers */
+struct malloc_elem;
+
/**
* Structure to hold malloc heap
*/
struct malloc_heap {
rte_spinlock_t lock;
LIST_HEAD(, malloc_elem) free_head[RTE_HEAP_NUM_FREELISTS];
+ struct malloc_elem *first;
+ struct malloc_elem *last;
+
unsigned alloc_count;
size_t total_size;
} __rte_cache_aligned;
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index ea041e2..eb41200 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -31,6 +31,7 @@ malloc_elem_init(struct malloc_elem *elem,
elem->heap = heap;
elem->ms = ms;
elem->prev = NULL;
+ elem->next = NULL;
memset(&elem->free_list, 0, sizeof(elem->free_list));
elem->state = ELEM_FREE;
elem->size = size;
@@ -39,15 +40,56 @@ malloc_elem_init(struct malloc_elem *elem,
set_trailer(elem);
}

-/*
- * Initialize a dummy malloc_elem header for the end-of-memseg marker
- */
void
-malloc_elem_mkend(struct malloc_elem *elem, struct malloc_elem *prev)
+malloc_elem_insert(struct malloc_elem *elem)
{
- malloc_elem_init(elem, prev->heap, prev->ms, 0);
- elem->prev = prev;
- elem->state = ELEM_BUSY; /* mark busy so its never merged */
+ struct malloc_elem *prev_elem, *next_elem;
+ struct malloc_heap *heap = elem->heap;
+
+ if (heap->first == NULL && heap->last == NULL) {
+ /* if empty heap */
+ heap->first = elem;
+ heap->last = elem;
+ prev_elem = NULL;
+ next_elem = NULL;
+ } else if (elem < heap->first) {
+ /* if lower than start */
+ prev_elem = NULL;
+ next_elem = heap->first;
+ heap->first = elem;
+ } else if (elem > heap->last) {
+ /* if higher than end */
+ prev_elem = heap->last;
+ next_elem = NULL;
+ heap->last = elem;
+ } else {
+ /* the new memory is somewhere inbetween start and end */
+ uint64_t dist_from_start, dist_from_end;
+
+ dist_from_end = RTE_PTR_DIFF(heap->last, elem);
+ dist_from_start = RTE_PTR_DIFF(elem, heap->first);
+
+ /* check which is closer, and find closest list entries */
+ if (dist_from_start < dist_from_end) {
+ prev_elem = heap->first;
+ while (prev_elem->next < elem)
+ prev_elem = prev_elem->next;
+ next_elem = prev_elem->next;
+ } else {
+ next_elem = heap->last;
+ while (next_elem->prev > elem)
+ next_elem = next_elem->prev;
+ prev_elem = next_elem->prev;
+ }
+ }
+
+ /* insert new element */
+ elem->prev = prev_elem;
+ elem->next = next_elem;
+ if (prev_elem)
+ prev_elem->next = elem;
+ if (next_elem)
+ next_elem->prev = elem;
}

/*
@@ -98,18 +140,58 @@ malloc_elem_can_hold(struct malloc_elem *elem, size_t size, unsigned align,
static void
split_elem(struct malloc_elem *elem, struct malloc_elem *split_pt)
{
- struct malloc_elem *next_elem = RTE_PTR_ADD(elem, elem->size);
+ struct malloc_elem *next_elem = elem->next;
const size_t old_elem_size = (uintptr_t)split_pt - (uintptr_t)elem;
const size_t new_elem_size = elem->size - old_elem_size;

malloc_elem_init(split_pt, elem->heap, elem->ms, new_elem_size);
split_pt->prev = elem;
- next_elem->prev = split_pt;
+ split_pt->next = next_elem;
+ if (next_elem)
+ next_elem->prev = split_pt;
+ else
+ elem->heap->last = split_pt;
+ elem->next = split_pt;
elem->size = old_elem_size;
set_trailer(elem);
}

/*
+ * our malloc heap is a doubly linked list, so doubly remove our element.
+ */
+static void __rte_unused
+remove_elem(struct malloc_elem *elem)
+{
+ struct malloc_elem *next, *prev;
+ next = elem->next;
+ prev = elem->prev;
+
+ if (next)
+ next->prev = prev;
+ else
+ elem->heap->last = prev;
+ if (prev)
+ prev->next = next;
+ else
+ elem->heap->first = next;
+
+ elem->prev = NULL;
+ elem->next = NULL;
+}
+
+static int
+next_elem_is_adjacent(struct malloc_elem *elem)
+{
+ return elem->next == RTE_PTR_ADD(elem, elem->size);
+}
+
+static int
+prev_elem_is_adjacent(struct malloc_elem *elem)
+{
+ return elem == RTE_PTR_ADD(elem->prev, elem->prev->size);
+}
+
+/*
* Given an element size, compute its freelist index.
* We free an element into the freelist containing similarly-sized elements.
* We try to allocate elements starting with the freelist containing
@@ -192,6 +274,9 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align,

split_elem(elem, new_free_elem);
malloc_elem_free_list_insert(new_free_elem);
+
+ if (elem == elem->heap->last)
+ elem->heap->last = new_free_elem;
}

if (old_elem_size < MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
@@ -230,9 +315,62 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align,
static inline void
join_elem(struct malloc_elem *elem1, struct malloc_elem *elem2)
{
- struct malloc_elem *next = RTE_PTR_ADD(elem2, elem2->size);
+ struct malloc_elem *next = elem2->next;
elem1->size += elem2->size;
- next->prev = elem1;
+ if (next)
+ next->prev = elem1;
+ else
+ elem1->heap->last = elem1;
+ elem1->next = next;
+}
+
+static struct malloc_elem *
+elem_join_adjacent_free(struct malloc_elem *elem)
+{
+ /*
+ * check if next element exists, is adjacent and is free, if so join
+ * with it, need to remove from free list.
+ */
+ if (elem->next != NULL && elem->next->state == ELEM_FREE &&
+ next_elem_is_adjacent(elem)) {
+ void *erase;
+
+ /* we will want to erase the trailer and header */
+ erase = RTE_PTR_SUB(elem->next, MALLOC_ELEM_TRAILER_LEN);
+
+ /* remove from free list, join to this one */
+ elem_free_list_remove(elem->next);
+ join_elem(elem, elem->next);
+
+ /* erase header and trailer */
+ memset(erase, 0, MALLOC_ELEM_OVERHEAD);
+ }
+
+ /*
+ * check if prev element exists, is adjacent and is free, if so join
+ * with it, need to remove from free list.
+ */
+ if (elem->prev != NULL && elem->prev->state == ELEM_FREE &&
+ prev_elem_is_adjacent(elem)) {
+ struct malloc_elem *new_elem;
+ void *erase;
+
+ /* we will want to erase trailer and header */
+ erase = RTE_PTR_SUB(elem, MALLOC_ELEM_TRAILER_LEN);
+
+ /* remove from free list, join to this one */
+ elem_free_list_remove(elem->prev);
+
+ new_elem = elem->prev;
+ join_elem(new_elem, elem);
+
+ /* erase header and trailer */
+ memset(erase, 0, MALLOC_ELEM_OVERHEAD);
+
+ elem = new_elem;
+ }
+
+ return elem;
}

/*
@@ -243,32 +381,20 @@ join_elem(struct malloc_elem *elem1, struct malloc_elem *elem2)
int
malloc_elem_free(struct malloc_elem *elem)
{
- size_t sz = elem->size - sizeof(*elem) - MALLOC_ELEM_TRAILER_LEN;
- uint8_t *ptr = (uint8_t *)&elem[1];
- struct malloc_elem *next = RTE_PTR_ADD(elem, elem->size);
- if (next->state == ELEM_FREE){
- /* remove from free list, join to this one */
- elem_free_list_remove(next);
- join_elem(elem, next);
- sz += (sizeof(*elem) + MALLOC_ELEM_TRAILER_LEN);
- }
+ void *ptr;
+ size_t data_len;
+
+ ptr = RTE_PTR_ADD(elem, sizeof(*elem));
+ data_len = elem->size - MALLOC_ELEM_OVERHEAD;
+
+ elem = elem_join_adjacent_free(elem);

- /* check if previous element is free, if so join with it and return,
- * need to re-insert in free list, as that element's size is changing
- */
- if (elem->prev != NULL && elem->prev->state == ELEM_FREE) {
- elem_free_list_remove(elem->prev);
- join_elem(elem->prev, elem);
- sz += (sizeof(*elem) + MALLOC_ELEM_TRAILER_LEN);
- ptr -= (sizeof(*elem) + MALLOC_ELEM_TRAILER_LEN);
- elem = elem->prev;
- }
malloc_elem_free_list_insert(elem);

/* decrease heap's count of allocated elements */
elem->heap->alloc_count--;

- memset(ptr, 0, sz);
+ memset(ptr, 0, data_len);

return 0;
}
@@ -281,21 +407,23 @@ int
malloc_elem_resize(struct malloc_elem *elem, size_t size)
{
const size_t new_size = size + elem->pad + MALLOC_ELEM_OVERHEAD;
+
/* if we request a smaller size, then always return ok */
if (elem->size >= new_size)
return 0;

- struct malloc_elem *next = RTE_PTR_ADD(elem, elem->size);
- if (next ->state != ELEM_FREE)
+ /* check if there is a next element, it's free and adjacent */
+ if (!elem->next || elem->next->state != ELEM_FREE ||
+ !next_elem_is_adjacent(elem))
return -1;
- if (elem->size + next->size < new_size)
+ if (elem->size + elem->next->size < new_size)
return -1;

/* we now know the element fits, so remove from free list,
* join the two
*/
- elem_free_list_remove(next);
- join_elem(elem, next);
+ elem_free_list_remove(elem->next);
+ join_elem(elem, elem->next);

if (elem->size - new_size >= MIN_DATA_SIZE + MALLOC_ELEM_OVERHEAD) {
/* now we have a big block together. Lets cut it down a bit, by splitting */
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index f4c1c7a..238e451 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -18,8 +18,12 @@ enum elem_state {

struct malloc_elem {
struct malloc_heap *heap;
- struct malloc_elem *volatile prev; /* points to prev elem in memseg */
- LIST_ENTRY(malloc_elem) free_list; /* list of free elements in heap */
+ struct malloc_elem *volatile prev;
+ /**< points to prev elem in memseg */
+ struct malloc_elem *volatile next;
+ /**< points to next elem in memseg */
+ LIST_ENTRY(malloc_elem) free_list;
+ /**< list of free elements in heap */
const struct rte_memseg *ms;
volatile enum elem_state state;
uint32_t pad;
@@ -110,12 +114,8 @@ malloc_elem_init(struct malloc_elem *elem,
const struct rte_memseg *ms,
size_t size);

-/*
- * initialise a dummy malloc_elem header for the end-of-memseg marker
- */
void
-malloc_elem_mkend(struct malloc_elem *elem,
- struct malloc_elem *prev_free);
+malloc_elem_insert(struct malloc_elem *elem);

/*
* return true if the current malloc_elem can hold a block of data
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 7d8d70a..9c95166 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -70,15 +70,11 @@ check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
static void
malloc_heap_add_memseg(struct malloc_heap *heap, struct rte_memseg *ms)
{
- /* allocate the memory block headers, one at end, one at start */
struct malloc_elem *start_elem = (struct malloc_elem *)ms->addr;
- struct malloc_elem *end_elem = RTE_PTR_ADD(ms->addr,
- ms->len - MALLOC_ELEM_OVERHEAD);
- end_elem = RTE_PTR_ALIGN_FLOOR(end_elem, RTE_CACHE_LINE_SIZE);
- const size_t elem_size = (uintptr_t)end_elem - (uintptr_t)start_elem;
+ const size_t elem_size = ms->len - MALLOC_ELEM_OVERHEAD;

malloc_elem_init(start_elem, heap, ms, elem_size);
- malloc_elem_mkend(end_elem, start_elem);
+ malloc_elem_insert(start_elem);
malloc_elem_free_list_insert(start_elem);

heap->total_size += elem_size;
--
2.7.4
Olivier Matz
2018-03-19 17:33:41 UTC
Post by Anatoly Burakov
As we are preparing for dynamic memory allocation, we need to be
able to handle holes in our malloc heap, so we are switching to a
doubly linked list and preparing the infrastructure to support it.
Since our heap is now aware of where its first and last elements are,
there is no longer any need for a dummy element at the end of each
heap, so get rid of that as well. Instead, let insert/remove/
join/split operations handle end-of-list conditions automatically.
---
lib/librte_eal/common/include/rte_malloc_heap.h | 6 +
lib/librte_eal/common/malloc_elem.c | 200 +++++++++++++++++++-----
lib/librte_eal/common/malloc_elem.h | 14 +-
lib/librte_eal/common/malloc_heap.c | 8 +-
4 files changed, 179 insertions(+), 49 deletions(-)
diff --git a/lib/librte_eal/common/include/rte_malloc_heap.h b/lib/librte_eal/common/include/rte_malloc_heap.h
index ba99ed9..9ec4b62 100644
--- a/lib/librte_eal/common/include/rte_malloc_heap.h
+++ b/lib/librte_eal/common/include/rte_malloc_heap.h
@@ -13,12 +13,18 @@
/* Number of free lists per heap, grouped by size. */
#define RTE_HEAP_NUM_FREELISTS 13
+/* dummy definition, for pointers */
+struct malloc_elem;
+
/**
* Structure to hold malloc heap
*/
struct malloc_heap {
rte_spinlock_t lock;
LIST_HEAD(, malloc_elem) free_head[RTE_HEAP_NUM_FREELISTS];
+ struct malloc_elem *first;
+ struct malloc_elem *last;
+
unsigned alloc_count;
size_t total_size;
} __rte_cache_aligned;
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index ea041e2..eb41200 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -31,6 +31,7 @@ malloc_elem_init(struct malloc_elem *elem,
elem->heap = heap;
elem->ms = ms;
elem->prev = NULL;
+ elem->next = NULL;
memset(&elem->free_list, 0, sizeof(elem->free_list));
elem->state = ELEM_FREE;
elem->size = size;
@@ -39,15 +40,56 @@ malloc_elem_init(struct malloc_elem *elem,
set_trailer(elem);
}
-/*
- * Initialize a dummy malloc_elem header for the end-of-memseg marker
- */
void
-malloc_elem_mkend(struct malloc_elem *elem, struct malloc_elem *prev)
+malloc_elem_insert(struct malloc_elem *elem)
{
- malloc_elem_init(elem, prev->heap, prev->ms, 0);
- elem->prev = prev;
- elem->state = ELEM_BUSY; /* mark busy so its never merged */
+ struct malloc_elem *prev_elem, *next_elem;
+ struct malloc_heap *heap = elem->heap;
+
+ if (heap->first == NULL && heap->last == NULL) {
+ /* if empty heap */
+ heap->first = elem;
+ heap->last = elem;
+ prev_elem = NULL;
+ next_elem = NULL;
+ } else if (elem < heap->first) {
+ /* if lower than start */
+ prev_elem = NULL;
+ next_elem = heap->first;
+ heap->first = elem;
+ } else if (elem > heap->last) {
+ /* if higher than end */
+ prev_elem = heap->last;
+ next_elem = NULL;
+ heap->last = elem;
+ } else {
+ /* the new memory is somewhere inbetween start and end */
+ uint64_t dist_from_start, dist_from_end;
+
+ dist_from_end = RTE_PTR_DIFF(heap->last, elem);
+ dist_from_start = RTE_PTR_DIFF(elem, heap->first);
+
+ /* check which is closer, and find closest list entries */
+ if (dist_from_start < dist_from_end) {
+ prev_elem = heap->first;
+ while (prev_elem->next < elem)
+ prev_elem = prev_elem->next;
+ next_elem = prev_elem->next;
+ } else {
+ next_elem = heap->last;
+ while (next_elem->prev > elem)
+ next_elem = next_elem->prev;
+ prev_elem = next_elem->prev;
+ }
+ }
+
+ /* insert new element */
+ elem->prev = prev_elem;
+ elem->next = next_elem;
+ if (prev_elem)
+ prev_elem->next = elem;
+ if (next_elem)
+ next_elem->prev = elem;
}
Would it be possible here to use a TAILQ? If yes, it could be
easier to read.
Burakov, Anatoly
2018-03-20 09:39:45 UTC
Post by Olivier Matz
Post by Anatoly Burakov
As we are preparing for dynamic memory allocation, we need to be
able to handle holes in our malloc heap, so we are switching to a
doubly linked list and preparing the infrastructure to support it.
Since our heap is now aware of where its first and last elements are,
there is no longer any need for a dummy element at the end of each
heap, so get rid of that as well. Instead, let insert/remove/
join/split operations handle end-of-list conditions automatically.
---
lib/librte_eal/common/include/rte_malloc_heap.h | 6 +
lib/librte_eal/common/malloc_elem.c | 200 +++++++++++++++++++-----
lib/librte_eal/common/malloc_elem.h | 14 +-
lib/librte_eal/common/malloc_heap.c | 8 +-
4 files changed, 179 insertions(+), 49 deletions(-)
diff --git a/lib/librte_eal/common/include/rte_malloc_heap.h b/lib/librte_eal/common/include/rte_malloc_heap.h
index ba99ed9..9ec4b62 100644
--- a/lib/librte_eal/common/include/rte_malloc_heap.h
+++ b/lib/librte_eal/common/include/rte_malloc_heap.h
@@ -13,12 +13,18 @@
/* Number of free lists per heap, grouped by size. */
#define RTE_HEAP_NUM_FREELISTS 13
+/* dummy definition, for pointers */
+struct malloc_elem;
+
/**
* Structure to hold malloc heap
*/
struct malloc_heap {
rte_spinlock_t lock;
LIST_HEAD(, malloc_elem) free_head[RTE_HEAP_NUM_FREELISTS];
+ struct malloc_elem *first;
+ struct malloc_elem *last;
+
unsigned alloc_count;
size_t total_size;
} __rte_cache_aligned;
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index ea041e2..eb41200 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -31,6 +31,7 @@ malloc_elem_init(struct malloc_elem *elem,
elem->heap = heap;
elem->ms = ms;
elem->prev = NULL;
+ elem->next = NULL;
memset(&elem->free_list, 0, sizeof(elem->free_list));
elem->state = ELEM_FREE;
elem->size = size;
@@ -39,15 +40,56 @@ malloc_elem_init(struct malloc_elem *elem,
set_trailer(elem);
}
-/*
- * Initialize a dummy malloc_elem header for the end-of-memseg marker
- */
void
-malloc_elem_mkend(struct malloc_elem *elem, struct malloc_elem *prev)
+malloc_elem_insert(struct malloc_elem *elem)
{
- malloc_elem_init(elem, prev->heap, prev->ms, 0);
- elem->prev = prev;
- elem->state = ELEM_BUSY; /* mark busy so its never merged */
+ struct malloc_elem *prev_elem, *next_elem;
+ struct malloc_heap *heap = elem->heap;
+
+ if (heap->first == NULL && heap->last == NULL) {
+ /* if empty heap */
+ heap->first = elem;
+ heap->last = elem;
+ prev_elem = NULL;
+ next_elem = NULL;
+ } else if (elem < heap->first) {
+ /* if lower than start */
+ prev_elem = NULL;
+ next_elem = heap->first;
+ heap->first = elem;
+ } else if (elem > heap->last) {
+ /* if higher than end */
+ prev_elem = heap->last;
+ next_elem = NULL;
+ heap->last = elem;
+ } else {
+ /* the new memory is somewhere inbetween start and end */
+ uint64_t dist_from_start, dist_from_end;
+
+ dist_from_end = RTE_PTR_DIFF(heap->last, elem);
+ dist_from_start = RTE_PTR_DIFF(elem, heap->first);
+
+ /* check which is closer, and find closest list entries */
+ if (dist_from_start < dist_from_end) {
+ prev_elem = heap->first;
+ while (prev_elem->next < elem)
+ prev_elem = prev_elem->next;
+ next_elem = prev_elem->next;
+ } else {
+ next_elem = heap->last;
+ while (next_elem->prev > elem)
+ next_elem = next_elem->prev;
+ prev_elem = next_elem->prev;
+ }
+ }
+
+ /* insert new element */
+ elem->prev = prev_elem;
+ elem->next = next_elem;
+ if (prev_elem)
+ prev_elem->next = elem;
+ if (next_elem)
+ next_elem->prev = elem;
}
Would it be possible here to use a TAILQ? If yes, it could be
easier to read.
Hi Olivier,

I think it would be a bit hard to make TAILQ's work with pad elements
without making the code unreadable :) I am inclined to leave it as is.
--
Thanks,
Anatoly
Anatoly Burakov
2018-03-03 13:45:50 UTC
Down the line, we will need to do everything from the heap, as any
alloc or free may trigger allocating or freeing OS memory, which
would involve growing/shrinking the heap.
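
A minimal sketch of the resulting pattern (essentially what malloc_heap_free() below does; the comment marks where later patches are expected to hook in, which is an assumption rather than part of this patch):

static int
heap_free_sketch(struct malloc_elem *elem)
{
        struct malloc_heap *heap = elem->heap;
        int ret;

        rte_spinlock_lock(&heap->lock);
        ret = malloc_elem_free(elem);
        /* later in the series, fully free pages could be returned to
         * the OS at this point, while the heap lock is still held.
         */
        rte_spinlock_unlock(&heap->lock);
        return ret;
}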

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/malloc_elem.c | 16 ++--------------
lib/librte_eal/common/malloc_heap.c | 38 +++++++++++++++++++++++++++++++++++++
lib/librte_eal/common/malloc_heap.h | 6 ++++++
lib/librte_eal/common/rte_malloc.c | 4 ++--
4 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 0cadc8a..ea041e2 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -243,10 +243,6 @@ join_elem(struct malloc_elem *elem1, struct malloc_elem *elem2)
int
malloc_elem_free(struct malloc_elem *elem)
{
- if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
- return -1;
-
- rte_spinlock_lock(&(elem->heap->lock));
size_t sz = elem->size - sizeof(*elem) - MALLOC_ELEM_TRAILER_LEN;
uint8_t *ptr = (uint8_t *)&elem[1];
struct malloc_elem *next = RTE_PTR_ADD(elem, elem->size);
@@ -274,8 +270,6 @@ malloc_elem_free(struct malloc_elem *elem)

memset(ptr, 0, sz);

- rte_spinlock_unlock(&(elem->heap->lock));
-
return 0;
}

@@ -292,11 +286,10 @@ malloc_elem_resize(struct malloc_elem *elem, size_t size)
return 0;

struct malloc_elem *next = RTE_PTR_ADD(elem, elem->size);
- rte_spinlock_lock(&elem->heap->lock);
if (next ->state != ELEM_FREE)
- goto err_return;
+ return -1;
if (elem->size + next->size < new_size)
- goto err_return;
+ return -1;

/* we now know the element fits, so remove from free list,
* join the two
@@ -311,10 +304,5 @@ malloc_elem_resize(struct malloc_elem *elem, size_t size)
split_elem(elem, split_pt);
malloc_elem_free_list_insert(split_pt);
}
- rte_spinlock_unlock(&elem->heap->lock);
return 0;
-
-err_return:
- rte_spinlock_unlock(&elem->heap->lock);
- return -1;
}
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 7aafc88..7d8d70a 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -145,6 +145,44 @@ malloc_heap_alloc(struct malloc_heap *heap,
return elem == NULL ? NULL : (void *)(&elem[1]);
}

+int
+malloc_heap_free(struct malloc_elem *elem)
+{
+ struct malloc_heap *heap;
+ int ret;
+
+ if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
+ return -1;
+
+ /* elem may be merged with previous element, so keep heap address */
+ heap = elem->heap;
+
+ rte_spinlock_lock(&(heap->lock));
+
+ ret = malloc_elem_free(elem);
+
+ rte_spinlock_unlock(&(heap->lock));
+
+ return ret;
+}
+
+int
+malloc_heap_resize(struct malloc_elem *elem, size_t size)
+{
+ int ret;
+
+ if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
+ return -1;
+
+ rte_spinlock_lock(&(elem->heap->lock));
+
+ ret = malloc_elem_resize(elem, size);
+
+ rte_spinlock_unlock(&(elem->heap->lock));
+
+ return ret;
+}
+
/*
* Function to retrieve data for heap on given socket
*/
diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h
index e0defa7..ab0005c 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -28,6 +28,12 @@ malloc_heap_alloc(struct malloc_heap *heap, const char *type, size_t size,
unsigned flags, size_t align, size_t bound);

int
+malloc_heap_free(struct malloc_elem *elem);
+
+int
+malloc_heap_resize(struct malloc_elem *elem, size_t size);
+
+int
malloc_heap_get_stats(struct malloc_heap *heap,
struct rte_malloc_socket_stats *socket_stats);

diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index e0e0d0b..970813e 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -29,7 +29,7 @@
void rte_free(void *addr)
{
if (addr == NULL) return;
- if (malloc_elem_free(malloc_elem_from_data(addr)) < 0)
+ if (malloc_heap_free(malloc_elem_from_data(addr)) < 0)
rte_panic("Fatal error: Invalid memory\n");
}

@@ -140,7 +140,7 @@ rte_realloc(void *ptr, size_t size, unsigned align)
size = RTE_CACHE_LINE_ROUNDUP(size), align = RTE_CACHE_LINE_ROUNDUP(align);
/* check alignment matches first, and if ok, see if we can resize block */
if (RTE_PTR_ALIGN(ptr,align) == ptr &&
- malloc_elem_resize(elem, size) == 0)
+ malloc_heap_resize(elem, size) == 0)
return ptr;

/* either alignment is off, or we have no room to expand,
--
2.7.4
Anatoly Burakov
2018-03-03 13:45:55 UTC
Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/malloc_elem.c | 12 ++++++------
lib/librte_eal/common/malloc_elem.h | 3 +++
2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 2291ee1..008f5a3 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -245,8 +245,8 @@ malloc_elem_free_list_insert(struct malloc_elem *elem)
/*
* Remove the specified element from its heap's free list.
*/
-static void
-elem_free_list_remove(struct malloc_elem *elem)
+void
+malloc_elem_free_list_remove(struct malloc_elem *elem)
{
LIST_REMOVE(elem, free_list);
}
@@ -266,7 +266,7 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align,
const size_t trailer_size = elem->size - old_elem_size - size -
MALLOC_ELEM_OVERHEAD;

- elem_free_list_remove(elem);
+ malloc_elem_free_list_remove(elem);

if (trailer_size > MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
/* split it, too much free space after elem */
@@ -340,7 +340,7 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem)
erase = RTE_PTR_SUB(elem->next, MALLOC_ELEM_TRAILER_LEN);

/* remove from free list, join to this one */
- elem_free_list_remove(elem->next);
+ malloc_elem_free_list_remove(elem->next);
join_elem(elem, elem->next);

/* erase header and trailer */
@@ -360,7 +360,7 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem)
erase = RTE_PTR_SUB(elem, MALLOC_ELEM_TRAILER_LEN);

/* remove from free list, join to this one */
- elem_free_list_remove(elem->prev);
+ malloc_elem_free_list_remove(elem->prev);

new_elem = elem->prev;
join_elem(new_elem, elem);
@@ -423,7 +423,7 @@ malloc_elem_resize(struct malloc_elem *elem, size_t size)
/* we now know the element fits, so remove from free list,
* join the two
*/
- elem_free_list_remove(elem->next);
+ malloc_elem_free_list_remove(elem->next);
join_elem(elem, elem->next);

if (elem->size - new_size >= MIN_DATA_SIZE + MALLOC_ELEM_OVERHEAD) {
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 99921d2..46e2383 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -151,6 +151,9 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem);
int
malloc_elem_resize(struct malloc_elem *elem, size_t size);

+void
+malloc_elem_free_list_remove(struct malloc_elem *elem);
+
/*
* dump contents of malloc elem to a file.
*/
--
2.7.4
Anatoly Burakov
2018-03-03 13:46:00 UTC
For non-legacy memory init mode, instead of looking at the generic
sysfs path, look at the sysfs paths pertaining to each NUMA node
for hugepage counts. Note that the per-NUMA-node paths do not
provide information about reserved pages, so we might not get the
best information from them, but this saves us from the whole
mapping/remapping business that was previously needed before we
could tell which page is on which socket, because we no longer
require our memory to be physically contiguous.

Legacy memory init will not use this.
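
For reference, a minimal standalone sketch of reading one such per-node counter (it assumes only the standard sysfs layout; it is not code from the patch):

#include <limits.h>
#include <stdio.h>

/* e.g. node=0, page_sz_kb=2048 reads
 * /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages
 */
static unsigned long
read_free_hugepages(unsigned int node, unsigned int page_sz_kb)
{
        char path[PATH_MAX];
        unsigned long val = 0;
        FILE *f;

        snprintf(path, sizeof(path),
                "/sys/devices/system/node/node%u/hugepages/"
                "hugepages-%ukB/free_hugepages", node, page_sz_kb);
        f = fopen(path, "r");
        if (f == NULL)
                return 0;
        if (fscanf(f, "%lu", &val) != 1)
                val = 0;
        fclose(f);
        return val;
}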

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/linuxapp/eal/eal_hugepage_info.c | 79 +++++++++++++++++++++++--
1 file changed, 73 insertions(+), 6 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 8bbf771..706b6d5 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -30,6 +30,7 @@
#include "eal_filesystem.h"

static const char sys_dir_path[] = "/sys/kernel/mm/hugepages";
+static const char sys_pages_numa_dir_path[] = "/sys/devices/system/node";

/* this function is only called from eal_hugepage_info_init which itself
* is only called from a primary process */
@@ -70,6 +71,45 @@ get_num_hugepages(const char *subdir)
return num_pages;
}

+static uint32_t
+get_num_hugepages_on_node(const char *subdir, unsigned int socket)
+{
+ char path[PATH_MAX], socketpath[PATH_MAX];
+ DIR *socketdir;
+ unsigned long num_pages = 0;
+ const char *nr_hp_file = "free_hugepages";
+
+ snprintf(socketpath, sizeof(socketpath), "%s/node%u/hugepages",
+ sys_pages_numa_dir_path, socket);
+
+ socketdir = opendir(socketpath);
+ if (socketdir) {
+ /* Keep calm and carry on */
+ closedir(socketdir);
+ } else {
+ /* Can't find socket dir, so ignore it */
+ return 0;
+ }
+
+ snprintf(path, sizeof(path), "%s/%s/%s",
+ socketpath, subdir, nr_hp_file);
+ if (eal_parse_sysfs_value(path, &num_pages) < 0)
+ return 0;
+
+ if (num_pages == 0)
+ RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n",
+ subdir);
+
+ /*
+ * we want to return a uint32_t and more than this looks suspicious
+ * anyway ...
+ */
+ if (num_pages > UINT32_MAX)
+ num_pages = UINT32_MAX;
+
+ return num_pages;
+}
+
static uint64_t
get_default_hp_size(void)
{
@@ -248,7 +288,7 @@ eal_hugepage_info_init(void)
{
const char dirent_start_text[] = "hugepages-";
const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
- unsigned i, num_sizes = 0;
+ unsigned int i, total_pages, num_sizes = 0;
DIR *dir;
struct dirent *dirent;

@@ -302,9 +342,27 @@ eal_hugepage_info_init(void)
if (clear_hugedir(hpi->hugedir) == -1)
break;

- /* for now, put all pages into socket 0,
- * later they will be sorted */
- hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
+ /*
+ * first, try to put all hugepages into relevant sockets, but
+ * if first attempts fails, fall back to collecting all pages
+ * in one socket and sorting them later
+ */
+ total_pages = 0;
+ /* we also don't want to do this for legacy init */
+ if (!internal_config.legacy_mem)
+ for (i = 0; i < rte_num_sockets(); i++) {
+ unsigned int num_pages =
+ get_num_hugepages_on_node(
+ dirent->d_name, i);
+ hpi->num_pages[i] = num_pages;
+ total_pages += num_pages;
+ }
+ /*
+ * we failed to sort memory from the get go, so fall
+ * back to old way
+ */
+ if (total_pages == 0)
+ hpi->num_pages[0] = get_num_hugepages(dirent->d_name);

#ifndef RTE_ARCH_64
/* for 32-bit systems, limit number of hugepages to
@@ -328,10 +386,19 @@ eal_hugepage_info_init(void)
sizeof(internal_config.hugepage_info[0]), compare_hpi);

/* now we have all info, check we have at least one valid size */
- for (i = 0; i < num_sizes; i++)
+ for (i = 0; i < num_sizes; i++) {
+ /* pages may no longer all be on socket 0, so check all */
+ unsigned int j, num_pages = 0;
+
+ for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
+ struct hugepage_info *hpi =
+ &internal_config.hugepage_info[i];
+ num_pages += hpi->num_pages[j];
+ }
if (internal_config.hugepage_info[i].hugedir != NULL &&
- internal_config.hugepage_info[i].num_pages[0] > 0)
+ num_pages > 0)
return 0;
+ }

/* no valid hugepage mounts available, return error */
return -1;
--
2.7.4
Anatoly Burakov
2018-03-03 13:45:56 UTC
Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/malloc_elem.c | 4 ++--
lib/librte_eal/common/malloc_elem.h | 2 +-
lib/librte_eal/common/malloc_heap.c | 4 ++--
3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 008f5a3..c18f050 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -379,7 +379,7 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem)
* blocks either immediately before or immediately after newly freed block
* are also free, the blocks are merged together.
*/
-int
+struct malloc_elem *
malloc_elem_free(struct malloc_elem *elem)
{
void *ptr;
@@ -397,7 +397,7 @@ malloc_elem_free(struct malloc_elem *elem)

memset(ptr, 0, data_len);

- return 0;
+ return elem;
}

/*
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 46e2383..9c1614c 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -138,7 +138,7 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size,
* blocks either immediately before or immediately after newly freed block
* are also free, the blocks are merged together.
*/
-int
+struct malloc_elem *
malloc_elem_free(struct malloc_elem *elem);

struct malloc_elem *
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 44538d7..a2c2e4c 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -145,7 +145,7 @@ int
malloc_heap_free(struct malloc_elem *elem)
{
struct malloc_heap *heap;
- int ret;
+ struct malloc_elem *ret;

if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
return -1;
@@ -159,7 +159,7 @@ malloc_heap_free(struct malloc_elem *elem)

rte_spinlock_unlock(&(heap->lock));

- return ret;
+ return ret != NULL ? 0 : -1;
}

int
--
2.7.4
Olivier Matz
2018-03-19 17:34:57 UTC
Post by Anatoly Burakov
---
lib/librte_eal/common/malloc_elem.c | 4 ++--
lib/librte_eal/common/malloc_elem.h | 2 +-
lib/librte_eal/common/malloc_heap.c | 4 ++--
3 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 008f5a3..c18f050 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -379,7 +379,7 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem)
* blocks either immediately before or immediately after newly freed block
* are also free, the blocks are merged together.
*/
-int
+struct malloc_elem *
malloc_elem_free(struct malloc_elem *elem)
{
void *ptr;
@@ -397,7 +397,7 @@ malloc_elem_free(struct malloc_elem *elem)
memset(ptr, 0, data_len);
- return 0;
+ return elem;
}
/*
An explanation about why this change is needed would make sense I think.

Thanks,
Olivier
Burakov, Anatoly
2018-03-20 09:40:53 UTC
Post by Olivier Matz
Post by Anatoly Burakov
---
lib/librte_eal/common/malloc_elem.c | 4 ++--
lib/librte_eal/common/malloc_elem.h | 2 +-
lib/librte_eal/common/malloc_heap.c | 4 ++--
3 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 008f5a3..c18f050 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -379,7 +379,7 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem)
* blocks either immediately before or immediately after newly freed block
* are also free, the blocks are merged together.
*/
-int
+struct malloc_elem *
malloc_elem_free(struct malloc_elem *elem)
{
void *ptr;
@@ -397,7 +397,7 @@ malloc_elem_free(struct malloc_elem *elem)
memset(ptr, 0, data_len);
- return 0;
+ return elem;
}
/*
An explanation about why this change is needed would make sense I think.
Thanks,
Olivier
Sure, I'll add this in future commits.

However, to provide some context - we need this because down the line we
will need to know which element we created/freed in order to be able to
roll back the changes, should the sync fail.
--
Thanks,
Anatoly
Anatoly Burakov
2018-03-03 13:46:03 UTC
This isn't used anywhere yet, but the support is now there. Also,
add cleanup to the allocation procedures, so that if we fail to
allocate everything we asked for, we can free all of it back.
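
A minimal sketch of exercising the new free path (hypothetical usage, assuming the allocation API added earlier in this series):

static void
alloc_and_free_one_page(void)
{
        struct rte_memseg *ms;

        /* allocate a single 2 MB page on socket 0, then give it back */
        ms = eal_memalloc_alloc_page(RTE_PGSIZE_2M, 0);
        if (ms != NULL && eal_memalloc_free_page(ms) < 0)
                RTE_LOG(ERR, EAL, "could not free page back to the system\n");
}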

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/eal_memalloc.h | 3 +
lib/librte_eal/linuxapp/eal/eal_memalloc.c | 148 ++++++++++++++++++++++++++++-
2 files changed, 146 insertions(+), 5 deletions(-)

diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
index c1076cf..adf59c4 100644
--- a/lib/librte_eal/common/eal_memalloc.h
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -16,4 +16,7 @@ int
eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n, uint64_t size,
int socket, bool exact);

+int
+eal_memalloc_free_page(struct rte_memseg *ms);
+
#endif // EAL_MEMALLOC_H
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index 1ba1201..bbeeeba 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -499,6 +499,64 @@ alloc_page(struct rte_memseg *ms, void *addr, uint64_t size, int socket_id,
return -1;
}

+static int
+free_page(struct rte_memseg *ms, struct hugepage_info *hi,
+ unsigned int list_idx, unsigned int seg_idx)
+{
+ uint64_t map_offset;
+ char path[PATH_MAX];
+ int fd, ret;
+
+ if (mmap(ms->addr, ms->hugepage_sz, PROT_READ,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
+ MAP_FAILED) {
+ RTE_LOG(DEBUG, EAL, "couldn't unmap page\n");
+ return -1;
+ }
+
+ fd = get_page_fd(path, sizeof(path), hi, list_idx, seg_idx);
+ if (fd < 0)
+ return -1;
+
+ if (internal_config.single_file_segments) {
+ map_offset = seg_idx * ms->hugepage_sz;
+ if (resize_hugefile(fd, map_offset, ms->hugepage_sz, false))
+ return -1;
+ /* if file is zero-length, we've already shrunk it, so it's
+ * safe to remove.
+ */
+ if (is_zero_length(fd)) {
+ struct msl_entry *te = get_msl_entry_by_idx(list_idx);
+ if (te != NULL && te->fd >= 0) {
+ close(te->fd);
+ te->fd = -1;
+ }
+ unlink(path);
+ }
+ ret = 0;
+ } else {
+ /* if we're able to take out a write lock, we're the last one
+ * holding onto this page.
+ */
+
+ ret = lock(fd, 0, ms->hugepage_sz, F_WRLCK);
+ if (ret >= 0) {
+ /* no one else is using this page */
+ if (ret == 1)
+ unlink(path);
+ ret = lock(fd, 0, ms->hugepage_sz, F_UNLCK);
+ if (ret != 1)
+ RTE_LOG(ERR, EAL, "%s(): unable to unlock file %s\n",
+ __func__, path);
+ }
+ close(fd);
+ }
+
+ memset(ms, 0, sizeof(*ms));
+
+ return ret;
+}
+
int
eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
uint64_t size, int socket, bool exact)
@@ -507,7 +565,7 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
struct rte_memseg_list *msl = NULL;
void *addr;
unsigned int msl_idx;
- int cur_idx, end_idx, i, ret = -1;
+ int cur_idx, start_idx, end_idx, i, j, ret = -1;
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
bool have_numa;
int oldpolicy;
@@ -557,6 +615,7 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
continue;

end_idx = cur_idx + n;
+ start_idx = cur_idx;

for (i = 0; cur_idx < end_idx; cur_idx++, i++) {
struct rte_memseg *cur;
@@ -567,25 +626,56 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,

if (alloc_page(cur, addr, size, socket, hi, msl_idx,
cur_idx)) {
+
RTE_LOG(DEBUG, EAL, "attempted to allocate %i pages, but only %i were allocated\n",
n, i);

- /* if exact number wasn't requested, stop */
- if (!exact)
+ /* if exact number of pages wasn't requested,
+ * failing to allocate is not an error. we could
+ * of course try other lists to see if there are
+ * better fits, but a bird in the hand...
+ */
+ if (!exact) {
ret = i;
- goto restore_numa;
+ goto restore_numa;
+ }
+ RTE_LOG(DEBUG, EAL, "exact amount of pages was requested, so returning %i allocated pages\n",
+ i);
+
+ /* clean up */
+ for (j = start_idx; j < cur_idx; j++) {
+ struct rte_memseg *tmp;
+ struct rte_fbarray *arr =
+ &msl->memseg_arr;
+
+ tmp = rte_fbarray_get(arr, j);
+ if (free_page(tmp, hi, msl_idx,
+ start_idx + j))
+ rte_panic("Cannot free page\n");
+
+ rte_fbarray_set_free(arr, j);
+ }
+ /* clear the list */
+ if (ms)
+ memset(ms, 0, sizeof(*ms) * n);
+
+ /* try next list */
+ goto next_list;
}
if (ms)
ms[i] = cur;

rte_fbarray_set_used(&msl->memseg_arr, cur_idx);
}
+ /* we allocated all pages */
ret = n;

break;
+next_list:
+ /* dummy semi-colon to make label work */;
}
/* we didn't break */
- if (!msl) {
+ if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
__func__);
}
@@ -607,3 +697,51 @@ eal_memalloc_alloc_page(uint64_t size, int socket)
/* return pointer to newly allocated memseg */
return ms;
}
+
+int
+eal_memalloc_free_page(struct rte_memseg *ms)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl = NULL;
+ unsigned int msl_idx, seg_idx;
+ struct hugepage_info *hi = NULL;
+ int i;
+
+ /* dynamic free not supported in legacy mode */
+ if (internal_config.legacy_mem)
+ return -1;
+
+ for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) {
+ if (ms->hugepage_sz ==
+ internal_config.hugepage_info[i].hugepage_sz) {
+ hi = &internal_config.hugepage_info[i];
+ break;
+ }
+ }
+ if (!hi) {
+ RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
+ return -1;
+ }
+
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ uintptr_t start_addr, end_addr;
+ struct rte_memseg_list *cur = &mcfg->memsegs[msl_idx];
+
+ start_addr = (uintptr_t) cur->base_va;
+ end_addr = start_addr + cur->memseg_arr.len * cur->hugepage_sz;
+
+ if ((uintptr_t) ms->addr < start_addr ||
+ (uintptr_t) ms->addr >= end_addr) {
+ continue;
+ }
+ msl = cur;
+ seg_idx = RTE_PTR_DIFF(ms->addr, start_addr) / ms->hugepage_sz;
+ break;
+ }
+ if (!msl) {
+ RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
+ return -1;
+ }
+ rte_fbarray_set_free(&msl->memseg_arr, seg_idx);
+ return free_page(ms, hi, msl_idx, seg_idx);
+}
--
2.7.4
Anatoly Burakov
2018-03-03 13:46:14 UTC
In preparation for implementing multiprocess support, we are adding
a version number and write locks to memseg lists.

There are two ways of implementing multiprocess support for memory
hotplug: either all information about mapped memory is shared
between processes, and secondary processes simply attempt to
map/unmap memory based on requests from the primary, or secondary
processes store their own maps and only check if they are in sync
with the primary process' maps.

This implementation opts for the latter: the primary process' shared
mappings will be authoritative, and each secondary process will use
its own internal view of mapped memory and will attempt to
synchronize that view with the primary's mappings using versioning.

Under this model, only the primary process decides which pages get
mapped, and secondary processes only copy the primary's page maps
and get notified of changes via an IPC mechanism (coming in later
commits).

To avoid race conditions, memseg lists will also have write locks -
that is, several secondary processes will be able to initialize
concurrently, but a process will not be able to request memory
allocation on a socket until all other allocations on that socket
are complete (allocating/freeing memory on different sockets
concurrently is fine).

In principle, it is possible for multiple processes to request
allocation/deallocation on multiple sockets, but we will only allow
one such request to be active at any one time.
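
A rough sketch of the versioning check a secondary process could perform (hypothetical; the actual synchronization logic arrives in later commits):

/* hypothetical sketch: check whether the local copy of a memseg list
 * is stale with respect to the primary's authoritative list, and
 * re-synchronize if so.
 */
static int
sync_memseg_list(struct rte_memseg_list *primary_msl,
                struct rte_memseg_list *local_msl)
{
        int ret = 0;

        rte_rwlock_read_lock(&primary_msl->mplock);
        if (local_msl->version != primary_msl->version) {
                /* walk the primary's memseg array and map/unmap pages
                 * until the local view matches, then adopt the
                 * primary's version number.
                 */
                /* ... mapping work elided ... */
                local_msl->version = primary_msl->version;
        }
        rte_rwlock_read_unlock(&primary_msl->mplock);

        return ret;
}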

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/bsdapp/eal/eal_memalloc.c | 7 +
lib/librte_eal/common/eal_memalloc.h | 4 +
lib/librte_eal/common/include/rte_eal_memconfig.h | 2 +
lib/librte_eal/linuxapp/eal/eal_memalloc.c | 288 +++++++++++++++++++++-
4 files changed, 295 insertions(+), 6 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal_memalloc.c b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
index be8340b..255aedc 100644
--- a/lib/librte_eal/bsdapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
@@ -24,3 +24,10 @@ eal_memalloc_alloc_page(uint64_t __rte_unused size, int __rte_unused socket)
RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
return NULL;
}
+
+int
+eal_memalloc_sync_with_primary(void)
+{
+ RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
+ return -1;
+}
diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
index 08ba70e..beac296 100644
--- a/lib/librte_eal/common/eal_memalloc.h
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -24,4 +24,8 @@ bool
eal_memalloc_is_contig(struct rte_memseg_list *msl, void *start,
size_t len);

+/* synchronize local memory map to primary process */
+int
+eal_memalloc_sync_with_primary(void);
+
#endif // EAL_MEMALLOC_H
diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
index b6bdb21..d653d57 100644
--- a/lib/librte_eal/common/include/rte_eal_memconfig.h
+++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
@@ -32,6 +32,8 @@ struct rte_memseg_list {
};
int socket_id; /**< Socket ID for all memsegs in this list. */
uint64_t hugepage_sz; /**< page size for all memsegs in this list. */
+ rte_rwlock_t mplock; /**< read-write lock for multiprocess sync. */
+ uint32_t version; /**< version number for multiprocess sync. */
struct rte_fbarray memseg_arr;
};

diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index c03e7bc..227d703 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -65,6 +65,9 @@ static struct msl_entry_list msl_entry_list =
TAILQ_HEAD_INITIALIZER(msl_entry_list);
static rte_spinlock_t tailq_lock = RTE_SPINLOCK_INITIALIZER;

+/** local copy of a memory map, used to synchronize memory hotplug in MP */
+static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];
+
static sigjmp_buf huge_jmpenv;

static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
@@ -619,11 +622,14 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
continue;
msl = cur_msl;

+ /* lock memseg list */
+ rte_rwlock_write_lock(&msl->mplock);
+
/* try finding space in memseg list */
cur_idx = rte_fbarray_find_next_n_free(&msl->memseg_arr, 0, n);

if (cur_idx < 0)
- continue;
+ goto next_list;

end_idx = cur_idx + n;
start_idx = cur_idx;
@@ -637,7 +643,6 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,

if (alloc_page(cur, addr, size, socket, hi, msl_idx,
cur_idx)) {
-
RTE_LOG(DEBUG, EAL, "attempted to allocate %i pages, but only %i were allocated\n",
n, i);

@@ -648,7 +653,7 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
*/
if (!exact) {
ret = i;
- goto restore_numa;
+ goto success;
}
RTE_LOG(DEBUG, EAL, "exact amount of pages was requested, so returning %i allocated pages\n",
i);
@@ -680,10 +685,13 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
}
/* we allocated all pages */
ret = n;
+success:
+ msl->version++;
+ rte_rwlock_write_unlock(&msl->mplock);

break;
next_list:
- /* dummy semi-colon to make label work */;
+ rte_rwlock_write_unlock(&msl->mplock);
}
/* we didn't break */
if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
@@ -716,7 +724,7 @@ eal_memalloc_free_page(struct rte_memseg *ms)
struct rte_memseg_list *msl = NULL;
unsigned int msl_idx, seg_idx;
struct hugepage_info *hi = NULL;
- int i;
+ int ret, i;

/* dynamic free not supported in legacy mode */
if (internal_config.legacy_mem)
@@ -753,6 +761,274 @@ eal_memalloc_free_page(struct rte_memseg *ms)
RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
return -1;
}
+ rte_rwlock_write_lock(&msl->mplock);
+
rte_fbarray_set_free(&msl->memseg_arr, seg_idx);
- return free_page(ms, hi, msl_idx, seg_idx);
+
+ /* increment version number */
+ msl->version++;
+
+ ret = free_page(ms, hi, msl_idx, seg_idx);
+
+ rte_rwlock_write_unlock(&msl->mplock);
+
+ return ret;
+}
+
+static int
+sync_chunk(struct rte_memseg_list *primary_msl,
+ struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+ unsigned int msl_idx, bool used, int start, int end)
+{
+ struct rte_fbarray *l_arr, *p_arr;
+ int i, ret, chunk_len, diff_len;
+
+ l_arr = &local_msl->memseg_arr;
+ p_arr = &primary_msl->memseg_arr;
+
+ /* we need to aggregate allocations/deallocations into bigger chunks,
+ * as we don't want to spam the user with per-page callbacks.
+ *
+ * to avoid any potential issues, we also want to trigger
+ * deallocation callbacks *before* we actually deallocate
+ * memory, so that the user application could wrap up its use
+ * before it goes away.
+ */
+
+ chunk_len = end - start;
+
+ /* find how many contiguous pages we can map/unmap for this chunk */
+ diff_len = used ?
+ rte_fbarray_find_contig_free(l_arr, start) :
+ rte_fbarray_find_contig_used(l_arr, start);
+
+ /* has to be at least one page */
+ if (diff_len < 1)
+ return -1;
+
+ diff_len = RTE_MIN(chunk_len, diff_len);
+
+ for (i = 0; i < diff_len; i++) {
+ struct rte_memseg *p_ms, *l_ms;
+ int seg_idx = start + i;
+
+ l_ms = rte_fbarray_get(l_arr, seg_idx);
+ p_ms = rte_fbarray_get(p_arr, seg_idx);
+
+ if (l_ms == NULL || p_ms == NULL)
+ return -1;
+
+ if (used) {
+ ret = alloc_page(l_ms, p_ms->addr,
+ p_ms->hugepage_sz,
+ p_ms->socket_id, hi,
+ msl_idx, seg_idx);
+ if (ret < 0)
+ return -1;
+ rte_fbarray_set_used(l_arr, seg_idx);
+ } else {
+ ret = free_page(l_ms, hi, msl_idx, seg_idx);
+ if (ret < 0)
+ return -1;
+ rte_fbarray_set_free(l_arr, seg_idx);
+ }
+ }
+
+ /* calculate how much we can advance until next chunk */
+ diff_len = used ?
+ rte_fbarray_find_contig_used(l_arr, start) :
+ rte_fbarray_find_contig_free(l_arr, start);
+ ret = RTE_MIN(chunk_len, diff_len);
+
+ return ret;
+}
+
+static int
+sync_status(struct rte_memseg_list *primary_msl,
+ struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+ unsigned int msl_idx, bool used)
+{
+ struct rte_fbarray *l_arr, *p_arr;
+ int p_idx, l_chunk_len, p_chunk_len, ret;
+ int start, end;
+
+ /* this is a little bit tricky, but the basic idea is - walk both lists
+ * and spot any places where there are discrepancies. walking both lists
+ * and noting discrepancies in a single go is a hard problem, so we do
+ * it in two passes - first we spot any places where allocated segments
+ * mismatch (i.e. ensure that everything that's allocated in the primary
+ * is also allocated in the secondary), and then we do it by looking at
+ * free segments instead.
+ *
+ * we also need to aggregate changes into chunks, as we have to call
+ * callbacks per allocation, not per page.
+ */
+ l_arr = &local_msl->memseg_arr;
+ p_arr = &primary_msl->memseg_arr;
+
+ if (used)
+ p_idx = rte_fbarray_find_next_used(p_arr, 0);
+ else
+ p_idx = rte_fbarray_find_next_free(p_arr, 0);
+
+ while (p_idx >= 0) {
+ int next_chunk_search_idx;
+
+ if (used) {
+ p_chunk_len = rte_fbarray_find_contig_used(p_arr,
+ p_idx);
+ l_chunk_len = rte_fbarray_find_contig_used(l_arr,
+ p_idx);
+ } else {
+ p_chunk_len = rte_fbarray_find_contig_free(p_arr,
+ p_idx);
+ l_chunk_len = rte_fbarray_find_contig_free(l_arr,
+ p_idx);
+ }
+ /* best case scenario - no differences (or bigger, which will be
+ * fixed during next iteration), look for next chunk
+ */
+ if (l_chunk_len >= p_chunk_len) {
+ next_chunk_search_idx = p_idx + p_chunk_len;
+ goto next_chunk;
+ }
+
+ /* if both chunks start at the same point, skip parts we know
+ * are identical, and sync the rest. each call to sync_chunk
+ * will only sync contiguous segments, so we need to call this
+ * until we are sure there are no more differences in this
+ * chunk.
+ */
+ start = p_idx + l_chunk_len;
+ end = p_idx + p_chunk_len;
+ do {
+ ret = sync_chunk(primary_msl, local_msl, hi, msl_idx,
+ used, start, end);
+ start += ret;
+ } while (start < end && ret >= 0);
+ /* if ret is negative, something went wrong */
+ if (ret < 0)
+ return -1;
+
+ next_chunk_search_idx = p_idx + p_chunk_len;
+next_chunk:
+ /* skip to end of this chunk */
+ if (used) {
+ p_idx = rte_fbarray_find_next_used(p_arr,
+ next_chunk_search_idx);
+ } else {
+ p_idx = rte_fbarray_find_next_free(p_arr,
+ next_chunk_search_idx);
+ }
+ }
+ return 0;
+}
+
+static int
+sync_existing(struct rte_memseg_list *primary_msl,
+ struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+ unsigned int msl_idx)
+{
+ int ret;
+
+ /* ensure all allocated space is the same in both lists */
+ ret = sync_status(primary_msl, local_msl, hi, msl_idx, true);
+ if (ret < 0)
+ return -1;
+
+ /* ensure all unallocated space is the same in both lists */
+ ret = sync_status(primary_msl, local_msl, hi, msl_idx, false);
+ if (ret < 0)
+ return -1;
+
+ /* update version number */
+ local_msl->version = primary_msl->version;
+
+ return 0;
+}
+
+
+int
+eal_memalloc_sync_with_primary(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *primary_msl, *local_msl;
+ struct hugepage_info *hi = NULL;
+ unsigned int msl_idx;
+ int i;
+
+ /* nothing to be done in primary */
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ return 0;
+
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ bool new_msl = false;
+ bool fail = false;
+
+ primary_msl = &mcfg->memsegs[msl_idx];
+ local_msl = &local_memsegs[msl_idx];
+
+ if (primary_msl->base_va == 0)
+ continue;
+
+ /* this is a valid memseg list, so read-lock it */
+ rte_rwlock_read_lock(&primary_msl->mplock);
+
+ /* write-lock local memseg list */
+ rte_rwlock_write_lock(&local_msl->mplock);
+
+ /* check if secondary has this memseg list set up */
+ if (local_msl->base_va == 0) {
+ char name[PATH_MAX];
+ int ret;
+ new_msl = true;
+
+ /* create distinct fbarrays for each secondary */
+ snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i",
+ primary_msl->memseg_arr.name, getpid());
+
+ ret = rte_fbarray_init(&local_msl->memseg_arr, name,
+ primary_msl->memseg_arr.len,
+ primary_msl->memseg_arr.elt_sz);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n");
+ fail = true;
+ goto endloop;
+ }
+
+ local_msl->base_va = primary_msl->base_va;
+ }
+
+ for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info);
+ i++) {
+ uint64_t cur_sz =
+ internal_config.hugepage_info[i].hugepage_sz;
+ uint64_t msl_sz = primary_msl->hugepage_sz;
+ if (msl_sz == cur_sz) {
+ hi = &internal_config.hugepage_info[i];
+ break;
+ }
+ }
+ if (!hi) {
+ RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
+ fail = true;
+ goto endloop;
+ }
+
+ /* if versions don't match or if we have just allocated a new
+ * memseg list, synchronize everything
+ */
+ if ((new_msl || local_msl->version != primary_msl->version) &&
+ sync_existing(primary_msl, local_msl, hi,
+ msl_idx)) {
+ fail = true;
+ goto endloop;
+ }
+endloop:
+ rte_rwlock_write_unlock(&local_msl->mplock);
+ rte_rwlock_read_unlock(&primary_msl->mplock);
+ if (fail)
+ return -1;
+ }
+ return 0;
}
--
2.7.4
Anatoly Burakov
2018-03-03 13:46:01 UTC
Permalink
Before, we were aggregating multiple pages into one memseg, so the
number of memsegs was small. Now, each page gets its own memseg,
so the list of memsegs is huge. To accommodate the new memseg list
size and to keep the under-the-hood workings sane, the memseg list
is now not just a single list, but multiple lists. To be precise,
each hugepage size available on the system gets one or more memseg
lists, per socket.

In order to support dynamic memory allocation, we reserve all
memory in advance. As in, we do an anonymous mmap() of the entire
maximum size of memory per hugepage size, per socket (which is
limited to either RTE_MAX_MEMSEG_PER_TYPE pages or
RTE_MAX_MEM_PER_TYPE gigabytes worth of memory, whichever is the
smaller one), split over multiple lists (which are limited to
either RTE_MAX_MEMSEG_PER_LIST memsegs or RTE_MAX_MEM_PER_LIST
gigabytes per list, whichever is the smaller one).
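
To make the reservation step concrete, a small standalone sketch
follows. It only shows the general mmap() technique of reserving
address space up front and backing it with pages later; PROT_NONE is
chosen for the example and does not mirror the exact flags the
patch's eal_get_virtual_area() uses.

#include <stdio.h>
#include <stdint.h>
#include <sys/mman.h>

int main(void)
{
	uint64_t reserve_sz = 32ULL << 30; /* e.g. one 32G memseg list */
	void *va;

	/* anonymous PROT_NONE mapping reserves VA without touching memory */
	va = mmap(NULL, reserve_sz, PROT_NONE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (va == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("reserved %llu GB of address space at %p\n",
			(unsigned long long)(reserve_sz >> 30), va);
	munmap(va, reserve_sz);
	return 0;
}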

So, for each hugepage size, we get (by default) up to 128G worth
of memory, per socket, split into chunks of up to 32G in size.
The address space is claimed at the start, in eal_common_memory.c.
The actual page allocation code is in eal_memalloc.c (Linux-only
for now), and largely consists of copied EAL memory init code.
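
A quick back-of-the-envelope check of those defaults, for two
hypothetical page sizes; this is only illustrative arithmetic, the
real computation lives in get_mem_amount()/memseg_init() in
eal_common_memory.c.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main(void)
{
	const uint64_t gb = 1ULL << 30;
	const uint64_t max_mem_per_type = 128 * gb; /* RTE_MAX_MEM_PER_TYPE */
	const uint64_t max_mem_per_list = 32 * gb;  /* RTE_MAX_MEM_PER_LIST */
	const uint64_t max_segs_per_list = 8192;    /* RTE_MAX_MEMSEG_PER_LIST */
	const uint64_t page_szs[] = { 2ULL << 20, 1ULL << 30 }; /* 2M, 1G */
	unsigned int i;

	for (i = 0; i < 2; i++) {
		uint64_t per_list = page_szs[i] * max_segs_per_list;

		/* each list is capped by both page count and total size */
		if (per_list > max_mem_per_list)
			per_list = max_mem_per_list;
		printf("page size %" PRIu64 "M: %" PRIu64 "G per list, "
				"%" PRIu64 " lists per type per socket\n",
				page_szs[i] >> 20, per_list >> 30,
				max_mem_per_type / per_list);
	}
	return 0;
}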

Pages in the list are also indexed by address. That is, in
non-legacy mode, in order to figure out where a page belongs, one
can simply look at the base address of its memseg list. Similarly,
figuring out the IOVA address of a memzone is a matter of finding
the right memseg list, getting the offset and dividing it by page
size to get the appropriate memseg. In legacy mode, the old
behavior of walking the memseg list remains.
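
That lookup reduces to simple pointer arithmetic; below is a
simplified sketch with invented type and field names (the patch's
actual version is virt2memseg() in eal_common_memory.c).

#include <stdint.h>
#include <stddef.h>

/* simplified stand-in for a memseg list, for illustration only */
struct msl_sketch {
	void *base_va;    /* start of the reserved VA area */
	uint64_t page_sz; /* all pages in the list share this size */
	size_t len;       /* number of memseg slots in the list */
};

/* return the memseg index for addr, or -1 if addr is outside the list */
static int
addr_to_seg_idx(const struct msl_sketch *msl, const void *addr)
{
	uintptr_t start = (uintptr_t)msl->base_va;
	uintptr_t end = start + msl->page_sz * msl->len;
	uintptr_t a = (uintptr_t)addr;

	if (a < start || a >= end)
		return -1;
	/* offset from the list base divided by page size gives the slot */
	return (int)((a - start) / msl->page_sz);
}

int main(void)
{
	static char area[1 << 16]; /* stand-in for a reserved VA region */
	struct msl_sketch msl = { area, 1 << 12, 16 }; /* 16 4K "pages" */

	/* an address three pages into the area lands in slot 3 */
	return addr_to_seg_idx(&msl, area + 3 * 4096) == 3 ? 0 : 1;
}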

Due to the switch to fbarray and to avoid any intrusive changes,
secondary processes are not supported in this commit. Also, one
particular API call (dump physmem layout) no longer makes sense and
was removed, per the deprecation notice [1].

In legacy mode, nothing is preallocated, and all memsegs are in
a list like before, but each segment still resides in an appropriate
memseg list.

The rest of the changes are really ripple effects from the memseg
change - heap changes, compile fixes, and rewrites to support
fbarray-backed memseg lists.

[1] http://dpdk.org/dev/patchwork/patch/34002/

Signed-off-by: Anatoly Burakov <***@intel.com>
---
config/common_base | 15 +-
drivers/bus/pci/linux/pci.c | 29 +-
drivers/net/virtio/virtio_user/vhost_kernel.c | 108 +++++---
lib/librte_eal/common/eal_common_memory.c | 322 +++++++++++++++++++---
lib/librte_eal/common/eal_common_memzone.c | 12 +-
lib/librte_eal/common/eal_hugepages.h | 2 +
lib/librte_eal/common/eal_internal_cfg.h | 2 +-
lib/librte_eal/common/include/rte_eal_memconfig.h | 22 +-
lib/librte_eal/common/include/rte_memory.h | 33 ++-
lib/librte_eal/common/include/rte_memzone.h | 1 -
lib/librte_eal/common/malloc_elem.c | 8 +-
lib/librte_eal/common/malloc_elem.h | 6 +-
lib/librte_eal/common/malloc_heap.c | 92 +++++--
lib/librte_eal/common/rte_malloc.c | 22 +-
lib/librte_eal/linuxapp/eal/eal.c | 21 +-
lib/librte_eal/linuxapp/eal/eal_memory.c | 297 +++++++++++++-------
lib/librte_eal/linuxapp/eal/eal_vfio.c | 164 +++++++----
lib/librte_eal/rte_eal_version.map | 3 +-
test/test/test_malloc.c | 29 +-
test/test/test_memory.c | 43 ++-
test/test/test_memzone.c | 17 +-
21 files changed, 917 insertions(+), 331 deletions(-)

diff --git a/config/common_base b/config/common_base
index ad03cf4..e9c1d93 100644
--- a/config/common_base
+++ b/config/common_base
@@ -61,7 +61,20 @@ CONFIG_RTE_CACHE_LINE_SIZE=64
CONFIG_RTE_LIBRTE_EAL=y
CONFIG_RTE_MAX_LCORE=128
CONFIG_RTE_MAX_NUMA_NODES=8
-CONFIG_RTE_MAX_MEMSEG=256
+CONFIG_RTE_MAX_MEMSEG_LISTS=32
+# each memseg list will be limited to either RTE_MAX_MEMSEG_PER_LIST pages
+# or RTE_MAX_MEM_PER_LIST gigabytes worth of memory, whichever is the smallest
+CONFIG_RTE_MAX_MEMSEG_PER_LIST=8192
+CONFIG_RTE_MAX_MEM_PER_LIST=32
+# a "type" is a combination of page size and NUMA node. total number of memseg
+# lists per type will be limited to either RTE_MAX_MEMSEG_PER_TYPE pages (split
+# over multiple lists of RTE_MAX_MEMSEG_PER_LIST pages), or RTE_MAX_MEM_PER_TYPE
+# gigabytes of memory (split over multiple lists of RTE_MAX_MEM_PER_LIST),
+# whichever is the smallest
+CONFIG_RTE_MAX_MEMSEG_PER_TYPE=32768
+CONFIG_RTE_MAX_MEM_PER_TYPE=128
+# legacy mem mode only
+CONFIG_RTE_MAX_LEGACY_MEMSEG=256
CONFIG_RTE_MAX_MEMZONE=2560
CONFIG_RTE_MAX_TAILQ=32
CONFIG_RTE_ENABLE_ASSERT=n
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index abde641..ec05d7c 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -119,19 +119,30 @@ rte_pci_unmap_device(struct rte_pci_device *dev)
void *
pci_find_max_end_va(void)
{
- const struct rte_memseg *seg = rte_eal_get_physmem_layout();
- const struct rte_memseg *last = seg;
- unsigned i = 0;
+ void *cur_end, *max_end = NULL;
+ int i = 0;

- for (i = 0; i < RTE_MAX_MEMSEG; i++, seg++) {
- if (seg->addr == NULL)
- break;
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ struct rte_fbarray *arr = &msl->memseg_arr;

- if (seg->addr > last->addr)
- last = seg;
+ if (arr->count == 0)
+ continue;

+ /*
+ * we need to handle legacy mem case, so don't rely on page size
+ * to calculate max VA end
+ */
+ int ms_idx = 0;
+
+ while ((ms_idx = rte_fbarray_find_next_used(arr, ms_idx)) >= 0) {
+ struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
+ cur_end = RTE_PTR_ADD(ms->addr, ms->len);
+ if (cur_end > max_end)
+ max_end = cur_end;
+ ms_idx++;
+ }
}
- return RTE_PTR_ADD(last->addr, last->len);
+ return max_end;
}

/* parse one line of the "resource" sysfs file (note that the 'line'
diff --git a/drivers/net/virtio/virtio_user/vhost_kernel.c b/drivers/net/virtio/virtio_user/vhost_kernel.c
index 8d0a1ab..23c5e1c 100644
--- a/drivers/net/virtio/virtio_user/vhost_kernel.c
+++ b/drivers/net/virtio/virtio_user/vhost_kernel.c
@@ -70,6 +70,42 @@ static uint64_t vhost_req_user_to_kernel[] = {
[VHOST_USER_SET_MEM_TABLE] = VHOST_SET_MEM_TABLE,
};

+/* returns number of segments processed */
+static int
+add_memory_region(struct vhost_memory_region *mr, struct rte_fbarray *arr,
+ int reg_start_idx, int max)
+{
+ const struct rte_memseg *ms;
+ void *start_addr, *expected_addr;
+ uint64_t len;
+ int idx;
+
+ idx = reg_start_idx;
+ len = 0;
+ start_addr = NULL;
+ expected_addr = NULL;
+
+ /* we could've relied on page size, but we have to support legacy mem */
+ while (idx < max) {
+ ms = rte_fbarray_get(arr, idx);
+ if (expected_addr == NULL) {
+ start_addr = ms->addr;
+ expected_addr = RTE_PTR_ADD(ms->addr, ms->len);
+ } else if (ms->addr != expected_addr) {
+ break;
+ }
+ len += ms->len;
+ idx++;
+ }
+
+ mr->guest_phys_addr = (uint64_t)(uintptr_t)start_addr;
+ mr->userspace_addr = (uint64_t)(uintptr_t)start_addr;
+ mr->memory_size = len;
+ mr->mmap_offset = 0;
+
+ return idx;
+}
+
/* By default, vhost kernel module allows 64 regions, but DPDK allows
* 256 segments. As a relief, below function merges those virtually
* adjacent memsegs into one region.
@@ -77,8 +113,7 @@ static uint64_t vhost_req_user_to_kernel[] = {
static struct vhost_memory_kernel *
prepare_vhost_memory_kernel(void)
{
- uint32_t i, j, k = 0;
- struct rte_memseg *seg;
+ uint32_t list_idx, region_nr = 0;
struct vhost_memory_region *mr;
struct vhost_memory_kernel *vm;

@@ -88,52 +123,41 @@ prepare_vhost_memory_kernel(void)
if (!vm)
return NULL;

- for (i = 0; i < RTE_MAX_MEMSEG; ++i) {
- seg = &rte_eal_get_configuration()->mem_config->memseg[i];
- if (!seg->addr)
- break;
-
- int new_region = 1;
+ for (list_idx = 0; list_idx < RTE_MAX_MEMSEG_LISTS; ++list_idx) {
+ struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl = &mcfg->memsegs[list_idx];
+ struct rte_fbarray *arr = &msl->memseg_arr;
+ int reg_start_idx, search_idx;

- for (j = 0; j < k; ++j) {
- mr = &vm->regions[j];
-
- if (mr->userspace_addr + mr->memory_size ==
- (uint64_t)(uintptr_t)seg->addr) {
- mr->memory_size += seg->len;
- new_region = 0;
- break;
- }
-
- if ((uint64_t)(uintptr_t)seg->addr + seg->len ==
- mr->userspace_addr) {
- mr->guest_phys_addr =
- (uint64_t)(uintptr_t)seg->addr;
- mr->userspace_addr =
- (uint64_t)(uintptr_t)seg->addr;
- mr->memory_size += seg->len;
- new_region = 0;
- break;
- }
- }
-
- if (new_region == 0)
+ /* skip empty segment lists */
+ if (arr->count == 0)
continue;

- mr = &vm->regions[k++];
- /* use vaddr here! */
- mr->guest_phys_addr = (uint64_t)(uintptr_t)seg->addr;
- mr->userspace_addr = (uint64_t)(uintptr_t)seg->addr;
- mr->memory_size = seg->len;
- mr->mmap_offset = 0;
-
- if (k >= max_regions) {
- free(vm);
- return NULL;
+ search_idx = 0;
+ while ((reg_start_idx = rte_fbarray_find_next_used(arr,
+ search_idx)) >= 0) {
+ int reg_n_pages;
+ if (region_nr >= max_regions) {
+ free(vm);
+ return NULL;
+ }
+ mr = &vm->regions[region_nr++];
+
+ /*
+ * we know memseg starts at search_idx, check how many
+ * segments there are
+ */
+ reg_n_pages = rte_fbarray_find_contig_used(arr,
+ search_idx);
+
+ /* look at at most reg_n_pages of memsegs */
+ search_idx = add_memory_region(mr, arr, reg_start_idx,
+ search_idx + reg_n_pages);
}
}

- vm->nregions = k;
+ vm->nregions = region_nr;
vm->padding = 0;
return vm;
}
diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index 042881b..457e239 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -13,6 +13,7 @@
#include <sys/mman.h>
#include <sys/queue.h>

+#include <rte_fbarray.h>
#include <rte_memory.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
@@ -30,6 +31,8 @@
* which is a multiple of hugepage size.
*/

+#define MEMSEG_LIST_FMT "memseg-%luk-%i-%i"
+
static uint64_t baseaddr_offset;
static uint64_t system_page_sz;

@@ -120,15 +123,245 @@ eal_get_virtual_area(void *requested_addr, uint64_t *size,
return aligned_addr;
}

-/*
- * Return a pointer to a read-only table of struct rte_physmem_desc
- * elements, containing the layout of all addressable physical
- * memory. The last element of the table contains a NULL address.
- */
-const struct rte_memseg *
-rte_eal_get_physmem_layout(void)
+static uint64_t
+get_mem_amount(uint64_t page_sz)
+{
+ uint64_t area_sz, max_pages;
+
+ max_pages = internal_config.legacy_mem || internal_config.no_hugetlbfs ?
+ RTE_MAX_LEGACY_MEMSEG : RTE_MAX_MEMSEG_PER_LIST;
+
+ /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_PER_LIST GB */
+ area_sz = RTE_MIN(page_sz * max_pages,
+ (uint64_t) RTE_MAX_MEM_PER_LIST << 30);
+ /* make sure the list isn't smaller than the page size */
+ area_sz = RTE_MAX(area_sz, page_sz);
+
+ return rte_align64pow2(area_sz);
+}
+
+static int
+alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
+ int socket_id, int type_msl_idx)
+{
+ char name[RTE_FBARRAY_NAME_LEN];
+ int max_pages;
+ uint64_t mem_amount;
+ void *addr;
+
+ if (!internal_config.legacy_mem) {
+ mem_amount = get_mem_amount(page_sz);
+ max_pages = mem_amount / page_sz;
+
+ addr = eal_get_virtual_area(NULL, &mem_amount, page_sz, 0, 0);
+ if (addr == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
+ return -1;
+ }
+ } else {
+ addr = NULL;
+ /* number of memsegs in each list; these will not be single-page
+ * segments, so RTE_MAX_LEGACY_MEMSEG is like the old default.
+ */
+ max_pages = RTE_MAX_LEGACY_MEMSEG;
+ }
+
+ snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
+ type_msl_idx);
+ if (rte_fbarray_init(&msl->memseg_arr, name, max_pages,
+ sizeof(struct rte_memseg))) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
+ rte_strerror(rte_errno));
+ return -1;
+ }
+
+ msl->hugepage_sz = page_sz;
+ msl->socket_id = socket_id;
+ msl->base_va = addr;
+
+ RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n",
+ page_sz >> 10, socket_id);
+
+ return 0;
+}
+
+static int
+memseg_init(void)
{
- return rte_eal_get_configuration()->mem_config->memseg;
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int socket_id, hpi_idx, msl_idx = 0;
+ struct rte_memseg_list *msl;
+
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+ RTE_LOG(ERR, EAL, "Secondary process not supported\n");
+ return -1;
+ }
+
+ /* create memseg lists */
+ for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;
+ hpi_idx++) {
+ struct hugepage_info *hpi;
+ uint64_t hugepage_sz;
+
+ hpi = &internal_config.hugepage_info[hpi_idx];
+ hugepage_sz = hpi->hugepage_sz;
+
+ for (socket_id = 0; socket_id < (int) rte_num_sockets();
+ socket_id++) {
+ uint64_t max_mem, total_mem = 0;
+ int type_msl_idx, max_segs, total_segs = 0;
+
+ max_mem = (uint64_t)RTE_MAX_MEM_PER_TYPE << 30;
+ /* no-huge behaves the same as legacy */
+ max_segs = internal_config.legacy_mem ||
+ internal_config.no_hugetlbfs ?
+ RTE_MAX_LEGACY_MEMSEG :
+ RTE_MAX_MEMSEG_PER_TYPE;
+
+ type_msl_idx = 0;
+ while (total_mem < max_mem && total_segs < max_segs) {
+ if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL,
+ "No more space in memseg lists, please increase CONFIG_RTE_MAX_MEMSEG_LISTS\n");
+ return -1;
+ }
+
+ msl = &mcfg->memsegs[msl_idx++];
+
+ if (alloc_memseg_list(msl, hugepage_sz,
+ socket_id, type_msl_idx))
+ return -1;
+
+ total_segs += msl->memseg_arr.len;
+ total_mem = total_segs * msl->hugepage_sz;
+ type_msl_idx++;
+ }
+ }
+ }
+ return 0;
+}
+
+static struct rte_memseg *
+virt2memseg(const void *addr, const struct rte_memseg_list *msl)
+{
+ const struct rte_fbarray *arr;
+ int ms_idx;
+
+ /* a memseg list was specified, check if it's the right one */
+ void *start, *end;
+ start = msl->base_va;
+ end = RTE_PTR_ADD(start, msl->hugepage_sz *
+ msl->memseg_arr.len);
+
+ if (addr < start || addr >= end)
+ return NULL;
+
+ /* now, calculate index */
+ arr = &msl->memseg_arr;
+ ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->hugepage_sz;
+ return rte_fbarray_get(arr, ms_idx);
+}
+
+static struct rte_memseg_list *
+virt2memseg_list(const void *addr)
+{
+ struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl;
+ int msl_idx;
+
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ void *start, *end;
+ msl = &mcfg->memsegs[msl_idx];
+
+ start = msl->base_va;
+ end = RTE_PTR_ADD(start, msl->hugepage_sz *
+ msl->memseg_arr.len);
+ if (addr >= start && addr < end)
+ break;
+ }
+ /* if we didn't find our memseg list */
+ if (msl_idx == RTE_MAX_MEMSEG_LISTS)
+ return NULL;
+ return msl;
+}
+
+static struct rte_memseg_list *
+virt2memseg_list_legacy(const void *addr)
+{
+ struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl;
+ struct rte_fbarray *arr;
+ int msl_idx, ms_idx;
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ msl = &mcfg->memsegs[msl_idx];
+ arr = &msl->memseg_arr;
+
+ ms_idx = 0;
+ while ((ms_idx =
+ rte_fbarray_find_next_used(arr, ms_idx)) >= 0) {
+ const struct rte_memseg *ms;
+ void *start, *end;
+ ms = rte_fbarray_get(arr, ms_idx);
+ start = ms->addr;
+ end = RTE_PTR_ADD(start, ms->len);
+ if (addr >= start && addr < end)
+ return msl;
+ ms_idx++;
+ }
+ }
+ return NULL;
+}
+
+static struct rte_memseg *
+virt2memseg_legacy(const void *addr)
+{
+ struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl;
+ struct rte_fbarray *arr;
+ int msl_idx, ms_idx;
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ msl = &mcfg->memsegs[msl_idx];
+ arr = &msl->memseg_arr;
+
+ ms_idx = 0;
+ while ((ms_idx =
+ rte_fbarray_find_next_used(arr, ms_idx)) >= 0) {
+ struct rte_memseg *ms;
+ void *start, *end;
+ ms = rte_fbarray_get(arr, ms_idx);
+ start = ms->addr;
+ end = RTE_PTR_ADD(start, ms->len);
+ if (addr >= start && addr < end)
+ return ms;
+ ms_idx++;
+ }
+ }
+ return NULL;
+}
+
+struct rte_memseg_list *
+rte_mem_virt2memseg_list(const void *addr)
+{
+ /* for legacy memory, we just walk the list, like in the old days. */
+ if (internal_config.legacy_mem)
+ return virt2memseg_list_legacy(addr);
+ else
+ return virt2memseg_list(addr);
+}
+
+struct rte_memseg *
+rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl)
+{
+ /* for legacy memory, we just walk the list, like in the old days. */
+ if (internal_config.legacy_mem)
+ /* ignore msl value */
+ return virt2memseg_legacy(addr);
+
+ return virt2memseg(addr, msl != NULL ? msl :
+ rte_mem_virt2memseg_list(addr));
}


@@ -136,18 +369,32 @@ rte_eal_get_physmem_layout(void)
uint64_t
rte_eal_get_physmem_size(void)
{
- const struct rte_mem_config *mcfg;
+ struct rte_mem_config *mcfg;
unsigned i = 0;
uint64_t total_len = 0;

/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;

- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if (mcfg->memseg[i].addr == NULL)
- break;
-
- total_len += mcfg->memseg[i].len;
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+
+ if (msl->memseg_arr.count == 0)
+ continue;
+
+ /* for legacy mem mode, walk the memsegs */
+ if (internal_config.legacy_mem) {
+ struct rte_fbarray *arr = &msl->memseg_arr;
+ int ms_idx = 0;
+
+ while ((ms_idx = rte_fbarray_find_next_used(arr,
+ ms_idx)) >= 0) {
+ const struct rte_memseg *ms =
+ rte_fbarray_get(arr, ms_idx);
+ total_len += ms->len;
+ ms_idx++;
+ }
+ } else
+ total_len += msl->hugepage_sz * msl->memseg_arr.count;
}

return total_len;
@@ -157,27 +404,35 @@ rte_eal_get_physmem_size(void)
void
rte_dump_physmem_layout(FILE *f)
{
- const struct rte_mem_config *mcfg;
+ struct rte_mem_config *mcfg;
unsigned i = 0;

/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;

- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if (mcfg->memseg[i].addr == NULL)
- break;
-
- fprintf(f, "Segment %u: IOVA:0x%"PRIx64", len:%zu, "
- "virt:%p, socket_id:%"PRId32", "
- "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
- "nrank:%"PRIx32"\n", i,
- mcfg->memseg[i].iova,
- mcfg->memseg[i].len,
- mcfg->memseg[i].addr,
- mcfg->memseg[i].socket_id,
- mcfg->memseg[i].hugepage_sz,
- mcfg->memseg[i].nchannel,
- mcfg->memseg[i].nrank);
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ struct rte_fbarray *arr = &msl->memseg_arr;
+ int m_idx = 0;
+
+ if (arr->count == 0)
+ continue;
+
+ while ((m_idx = rte_fbarray_find_next_used(arr, m_idx)) >= 0) {
+ struct rte_memseg *ms = rte_fbarray_get(arr, m_idx);
+ fprintf(f, "Page %u-%u: iova:0x%"PRIx64", len:%zu, "
+ "virt:%p, socket_id:%"PRId32", "
+ "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
+ "nrank:%"PRIx32"\n", i, m_idx,
+ ms->iova,
+ ms->len,
+ ms->addr,
+ ms->socket_id,
+ ms->hugepage_sz,
+ ms->nchannel,
+ ms->nrank);
+ m_idx++;
+ }
}
}

@@ -222,9 +477,14 @@ rte_mem_lock_page(const void *virt)
int
rte_eal_memory_init(void)
{
+ int retval;
RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n");

- const int retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
+ retval = memseg_init();
+ if (retval < 0)
+ return -1;
+
+ retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
rte_eal_hugepage_init() :
rte_eal_hugepage_attach();
if (retval < 0)
diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index 1ab3ade..ed36174 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -226,10 +226,9 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
mz->iova = rte_malloc_virt2iova(mz_addr);
mz->addr = mz_addr;
mz->len = (requested_len == 0 ? elem->size : requested_len);
- mz->hugepage_sz = elem->ms->hugepage_sz;
- mz->socket_id = elem->ms->socket_id;
+ mz->hugepage_sz = elem->msl->hugepage_sz;
+ mz->socket_id = elem->msl->socket_id;
mz->flags = 0;
- mz->memseg_id = elem->ms - rte_eal_get_configuration()->mem_config->memseg;

return mz;
}
@@ -382,7 +381,6 @@ int
rte_eal_memzone_init(void)
{
struct rte_mem_config *mcfg;
- const struct rte_memseg *memseg;

/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
@@ -391,12 +389,6 @@ rte_eal_memzone_init(void)
if (rte_eal_process_type() == RTE_PROC_SECONDARY)
return 0;

- memseg = rte_eal_get_physmem_layout();
- if (memseg == NULL) {
- RTE_LOG(ERR, EAL, "%s(): Cannot get physical layout\n", __func__);
- return -1;
- }
-
rte_rwlock_write_lock(&mcfg->mlock);

/* delete all zones */
diff --git a/lib/librte_eal/common/eal_hugepages.h b/lib/librte_eal/common/eal_hugepages.h
index 1d519bb..f963ae5 100644
--- a/lib/librte_eal/common/eal_hugepages.h
+++ b/lib/librte_eal/common/eal_hugepages.h
@@ -23,6 +23,8 @@ struct hugepage_file {
int socket_id; /**< NUMA socket ID */
int file_id; /**< the '%d' in HUGEFILE_FMT */
int memseg_id; /**< the memory segment to which page belongs */
+ int memseg_list_id;
+ /**< the memory segment list to which page belongs */
char filepath[MAX_HUGEPAGE_PATH]; /**< path to backing file on filesystem */
};

diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index c8a0676..eea8b66 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -23,7 +23,7 @@ struct hugepage_info {
uint64_t hugepage_sz; /**< size of a huge page */
const char *hugedir; /**< dir where hugetlbfs is mounted */
uint32_t num_pages[RTE_MAX_NUMA_NODES];
- /**< number of hugepages of that size on each socket */
+ /**< number of hugepages of that size on each socket */
int lock_descriptor; /**< file descriptor for hugepage dir */
};

diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
index 29fa0b6..31fc8e7 100644
--- a/lib/librte_eal/common/include/rte_eal_memconfig.h
+++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
@@ -12,12 +12,30 @@
#include <rte_malloc_heap.h>
#include <rte_rwlock.h>
#include <rte_pause.h>
+#include <rte_fbarray.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
+ * memseg list is a special case as we need to store a bunch of other data
+ * together with the array itself.
+ */
+struct rte_memseg_list {
+ RTE_STD_C11
+ union {
+ void *base_va;
+ /**< Base virtual address for this memseg list. */
+ uint64_t addr_64;
+ /**< Makes sure addr is always 64-bits */
+ };
+ int socket_id; /**< Socket ID for all memsegs in this list. */
+ uint64_t hugepage_sz; /**< page size for all memsegs in this list. */
+ struct rte_fbarray memseg_arr;
+};
+
+/**
* the structure for the memory configuration for the RTE.
* Used by the rte_config structure. It is separated out, as for multi-process
* support, the memory details should be shared across instances
@@ -43,9 +61,11 @@ struct rte_mem_config {
uint32_t memzone_cnt; /**< Number of allocated memzones */

/* memory segments and zones */
- struct rte_memseg memseg[RTE_MAX_MEMSEG]; /**< Physmem descriptors. */
struct rte_memzone memzone[RTE_MAX_MEMZONE]; /**< Memzone descriptors. */

+ struct rte_memseg_list memsegs[RTE_MAX_MEMSEG_LISTS];
+ /**< list of dynamic arrays holding memsegs */
+
struct rte_tailq_head tailq_head[RTE_MAX_TAILQ]; /**< Tailqs for objects */

/* Heaps of Malloc per socket */
diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index 302f865..674d4cb 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -22,6 +22,9 @@ extern "C" {
#include <rte_common.h>
#include <rte_config.h>

+/* forward declaration for pointers */
+struct rte_memseg_list;
+
__extension__
enum rte_page_sizes {
RTE_PGSIZE_4K = 1ULL << 12,
@@ -130,21 +133,27 @@ phys_addr_t rte_mem_virt2phy(const void *virt);
rte_iova_t rte_mem_virt2iova(const void *virt);

/**
- * Get the layout of the available physical memory.
+ * Get memseg corresponding to virtual memory address.
*
- * It can be useful for an application to have the full physical
- * memory layout to decide the size of a memory zone to reserve. This
- * table is stored in rte_config (see rte_eal_get_configuration()).
+ * @param virt
+ * The virtual address.
+ * @param msl
+ * Memseg list in which to look for memsegs (can be NULL).
+ * @return
+ * Memseg to which this virtual address belongs.
+ */
+struct rte_memseg *rte_mem_virt2memseg(const void *virt,
+ const struct rte_memseg_list *msl);
+
+/**
+ * Get memseg list corresponding to virtual memory address.
*
+ * @param virt
+ * The virtual address.
* @return
- * - On success, return a pointer to a read-only table of struct
- * rte_physmem_desc elements, containing the layout of all
- * addressable physical memory. The last element of the table
- * contains a NULL address.
- * - On error, return NULL. This should not happen since it is a fatal
- * error that will probably cause the entire system to panic.
- */
-const struct rte_memseg *rte_eal_get_physmem_layout(void);
+ * Memseg list to which this virtual address belongs.
+ */
+struct rte_memseg_list *rte_mem_virt2memseg_list(const void *virt);

/**
* Dump the physical memory layout to a file.
diff --git a/lib/librte_eal/common/include/rte_memzone.h b/lib/librte_eal/common/include/rte_memzone.h
index 2bfb273..a69f068 100644
--- a/lib/librte_eal/common/include/rte_memzone.h
+++ b/lib/librte_eal/common/include/rte_memzone.h
@@ -66,7 +66,6 @@ struct rte_memzone {
int32_t socket_id; /**< NUMA socket ID. */

uint32_t flags; /**< Characteristics of this memzone. */
- uint32_t memseg_id; /**< Memseg it belongs. */
} __attribute__((__packed__));

/**
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index c18f050..701bffd 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -26,11 +26,11 @@
* Initialize a general malloc_elem header structure
*/
void
-malloc_elem_init(struct malloc_elem *elem,
- struct malloc_heap *heap, const struct rte_memseg *ms, size_t size)
+malloc_elem_init(struct malloc_elem *elem, struct malloc_heap *heap,
+ struct rte_memseg_list *msl, size_t size)
{
elem->heap = heap;
- elem->ms = ms;
+ elem->msl = msl;
elem->prev = NULL;
elem->next = NULL;
memset(&elem->free_list, 0, sizeof(elem->free_list));
@@ -145,7 +145,7 @@ split_elem(struct malloc_elem *elem, struct malloc_elem *split_pt)
const size_t old_elem_size = (uintptr_t)split_pt - (uintptr_t)elem;
const size_t new_elem_size = elem->size - old_elem_size;

- malloc_elem_init(split_pt, elem->heap, elem->ms, new_elem_size);
+ malloc_elem_init(split_pt, elem->heap, elem->msl, new_elem_size);
split_pt->prev = elem;
split_pt->next = next_elem;
if (next_elem)
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 9c1614c..388c16f 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -5,7 +5,7 @@
#ifndef MALLOC_ELEM_H_
#define MALLOC_ELEM_H_

-#include <rte_memory.h>
+#include <rte_eal_memconfig.h>

/* dummy definition of struct so we can use pointers to it in malloc_elem struct */
struct malloc_heap;
@@ -24,7 +24,7 @@ struct malloc_elem {
/**< points to next elem in memseg */
LIST_ENTRY(malloc_elem) free_list;
/**< list of free elements in heap */
- const struct rte_memseg *ms;
+ struct rte_memseg_list *msl;
volatile enum elem_state state;
uint32_t pad;
size_t size;
@@ -111,7 +111,7 @@ malloc_elem_from_data(const void *data)
void
malloc_elem_init(struct malloc_elem *elem,
struct malloc_heap *heap,
- const struct rte_memseg *ms,
+ struct rte_memseg_list *msl,
size_t size);

void
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index a2c2e4c..058ad75 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -21,6 +21,7 @@
#include <rte_memcpy.h>
#include <rte_atomic.h>

+#include "eal_internal_cfg.h"
#include "malloc_elem.h"
#include "malloc_heap.h"

@@ -62,22 +63,25 @@ check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
}

/*
- * Expand the heap with a memseg.
- * This reserves the zone and sets a dummy malloc_elem header at the end
- * to prevent overflow. The rest of the zone is added to free list as a single
- * large free block
+ * Expand the heap with a memory area.
*/
-static void
-malloc_heap_add_memseg(struct malloc_heap *heap, struct rte_memseg *ms)
+static struct malloc_elem *
+malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl,
+ void *start, size_t len)
{
- struct malloc_elem *start_elem = (struct malloc_elem *)ms->addr;
- const size_t elem_size = ms->len - MALLOC_ELEM_OVERHEAD;
+ struct malloc_elem *elem = start;
+
+ malloc_elem_init(elem, heap, msl, len);
+
+ malloc_elem_insert(elem);
+
+ elem = malloc_elem_join_adjacent_free(elem);

- malloc_elem_init(start_elem, heap, ms, elem_size);
- malloc_elem_insert(start_elem);
- malloc_elem_free_list_insert(start_elem);
+ malloc_elem_free_list_insert(elem);

- heap->total_size += elem_size;
+ heap->total_size += len;
+
+ return elem;
}

/*
@@ -98,7 +102,8 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
for (elem = LIST_FIRST(&heap->free_head[idx]);
!!elem; elem = LIST_NEXT(elem, free_list)) {
if (malloc_elem_can_hold(elem, size, align, bound)) {
- if (check_hugepage_sz(flags, elem->ms->hugepage_sz))
+ if (check_hugepage_sz(flags,
+ elem->msl->hugepage_sz))
return elem;
if (alt_elem == NULL)
alt_elem = elem;
@@ -243,16 +248,65 @@ int
rte_eal_malloc_heap_init(void)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- unsigned ms_cnt;
- struct rte_memseg *ms;
+ int msl_idx;
+ struct rte_memseg_list *msl;

if (mcfg == NULL)
return -1;

- for (ms = &mcfg->memseg[0], ms_cnt = 0;
- (ms_cnt < RTE_MAX_MEMSEG) && (ms->len > 0);
- ms_cnt++, ms++) {
- malloc_heap_add_memseg(&mcfg->malloc_heaps[ms->socket_id], ms);
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ int start;
+ struct rte_fbarray *arr;
+ struct malloc_heap *heap;
+
+ msl = &mcfg->memsegs[msl_idx];
+ arr = &msl->memseg_arr;
+ heap = &mcfg->malloc_heaps[msl->socket_id];
+
+ if (arr->count == 0)
+ continue;
+
+ /* for legacy mode, just walk the list */
+ if (internal_config.legacy_mem) {
+ int ms_idx = 0;
+ while ((ms_idx = rte_fbarray_find_next_used(arr,
+ ms_idx)) >= 0) {
+ struct rte_memseg *ms =
+ rte_fbarray_get(arr, ms_idx);
+ malloc_heap_add_memory(heap, msl,
+ ms->addr, ms->len);
+ ms_idx++;
+ RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
+ msl->socket_id, ms->len >> 20ULL);
+ }
+ continue;
+ }
+
+ /* find first segment */
+ start = rte_fbarray_find_next_used(arr, 0);
+
+ while (start >= 0) {
+ int contig_segs;
+ struct rte_memseg *start_seg;
+ size_t len, hugepage_sz = msl->hugepage_sz;
+
+ /* find how many pages we can lump in together */
+ contig_segs = rte_fbarray_find_contig_used(arr, start);
+ start_seg = rte_fbarray_get(arr, start);
+ len = contig_segs * hugepage_sz;
+
+ /*
+ * we've found (hopefully) a bunch of contiguous
+ * segments, so add them to the heap.
+ */
+ malloc_heap_add_memory(heap, msl, start_seg->addr, len);
+
+ RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
+ msl->socket_id, len >> 20ULL);
+
+ start = rte_fbarray_find_next_used(arr,
+ start + contig_segs);
+ }
}

return 0;
diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index 80fb6cc..bd7e757 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -238,17 +238,21 @@ rte_malloc_set_limit(__rte_unused const char *type,
rte_iova_t
rte_malloc_virt2iova(const void *addr)
{
- rte_iova_t iova;
- const struct malloc_elem *elem = malloc_elem_from_data(addr);
+ const struct rte_memseg *ms;
+ struct malloc_elem *elem = malloc_elem_from_data(addr);
+
if (elem == NULL)
return RTE_BAD_IOVA;
- if (elem->ms->iova == RTE_BAD_IOVA)
- return RTE_BAD_IOVA;

if (rte_eal_iova_mode() == RTE_IOVA_VA)
- iova = (uintptr_t)addr;
- else
- iova = elem->ms->iova +
- RTE_PTR_DIFF(addr, elem->ms->addr);
- return iova;
+ return (uintptr_t) addr;
+
+ ms = rte_mem_virt2memseg(addr, elem->msl);
+ if (ms == NULL)
+ return RTE_BAD_IOVA;
+
+ if (ms->iova == RTE_BAD_IOVA)
+ return RTE_BAD_IOVA;
+
+ return ms->iova + RTE_PTR_DIFF(addr, ms->addr);
}
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 5207713..7851a7d 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -74,8 +74,8 @@ static int mem_cfg_fd = -1;
static struct flock wr_lock = {
.l_type = F_WRLCK,
.l_whence = SEEK_SET,
- .l_start = offsetof(struct rte_mem_config, memseg),
- .l_len = sizeof(early_mem_config.memseg),
+ .l_start = offsetof(struct rte_mem_config, memsegs),
+ .l_len = sizeof(early_mem_config.memsegs),
};

/* Address of global and public configuration */
@@ -643,17 +643,20 @@ eal_parse_args(int argc, char **argv)
static void
eal_check_mem_on_local_socket(void)
{
- const struct rte_memseg *ms;
+ const struct rte_memseg_list *msl;
int i, socket_id;

socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);

- ms = rte_eal_get_physmem_layout();
-
- for (i = 0; i < RTE_MAX_MEMSEG; i++)
- if (ms[i].socket_id == socket_id &&
- ms[i].len > 0)
- return;
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ msl = &rte_eal_get_configuration()->mem_config->memsegs[i];
+ if (msl->socket_id != socket_id)
+ continue;
+ /* for legacy memory, check if there's anything allocated */
+ if (internal_config.legacy_mem && msl->memseg_arr.count == 0)
+ continue;
+ return;
+ }

RTE_LOG(WARNING, EAL, "WARNING: Master core has no "
"memory on local socket!\n");
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index b9bcb75..9512da9 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -908,6 +908,28 @@ huge_recover_sigbus(void)
}
}

+/* in legacy mode, each combination of socket and pagesize directly map to a
+ * single memseg list.
+ */
+static struct rte_memseg_list *
+get_memseg_list(int socket, uint64_t page_sz)
+{
+ struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl;
+ int msl_idx;
+
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ msl = &mcfg->memsegs[msl_idx];
+ if (msl->hugepage_sz != page_sz)
+ continue;
+ if (msl->socket_id != socket)
+ continue;
+ return msl;
+ }
+ return NULL;
+}
+
/*
* Prepare physical memory mapping: fill configuration structure with
* these infos, return 0 on success.
@@ -925,11 +947,14 @@ eal_legacy_hugepage_init(void)
struct rte_mem_config *mcfg;
struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
+ struct rte_fbarray *arr;
+ struct rte_memseg *ms;

uint64_t memory[RTE_MAX_NUMA_NODES];

unsigned hp_offset;
int i, j, new_memseg;
+ int ms_idx, msl_idx;
int nr_hugefiles, nr_hugepages = 0;
void *addr;

@@ -942,6 +967,12 @@ eal_legacy_hugepage_init(void)

/* hugetlbfs can be disabled */
if (internal_config.no_hugetlbfs) {
+ /* nohuge mode is legacy mode */
+ internal_config.legacy_mem = 1;
+
+ arr = &mcfg->memsegs[0].memseg_arr;
+ ms = rte_fbarray_get(arr, 0);
+
addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
if (addr == MAP_FAILED) {
@@ -949,14 +980,15 @@ eal_legacy_hugepage_init(void)
strerror(errno));
return -1;
}
+ rte_fbarray_set_used(arr, 0);
if (rte_eal_iova_mode() == RTE_IOVA_VA)
- mcfg->memseg[0].iova = (uintptr_t)addr;
+ ms->iova = (uintptr_t)addr;
else
- mcfg->memseg[0].iova = RTE_BAD_IOVA;
- mcfg->memseg[0].addr = addr;
- mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
- mcfg->memseg[0].len = internal_config.memory;
- mcfg->memseg[0].socket_id = 0;
+ ms->iova = RTE_BAD_IOVA;
+ ms->addr = addr;
+ ms->hugepage_sz = RTE_PGSIZE_4K;
+ ms->len = internal_config.memory;
+ ms->socket_id = 0;
return 0;
}

@@ -1197,27 +1229,51 @@ eal_legacy_hugepage_init(void)
#endif

if (new_memseg) {
- j += 1;
- if (j == RTE_MAX_MEMSEG)
+ struct rte_memseg_list *msl;
+ int socket;
+ uint64_t page_sz;
+
+ socket = hugepage[i].socket_id;
+ page_sz = hugepage[i].size;
+
+ if (page_sz == 0)
+ continue;
+
+ /* figure out where to put this memseg */
+ msl = get_memseg_list(socket, page_sz);
+ if (!msl)
+ rte_panic("Unknown socket or page sz: %i %lx\n",
+ socket, page_sz);
+ msl_idx = msl - &mcfg->memsegs[0];
+ arr = &msl->memseg_arr;
+
+ ms_idx = rte_fbarray_find_next_free(arr, arr->count);
+ if (ms_idx < 0) {
+ RTE_LOG(ERR, EAL, "No space in memseg list\n");
break;
+ }
+ ms = rte_fbarray_get(arr, ms_idx);
+
+ ms->iova = hugepage[i].physaddr;
+ ms->addr = hugepage[i].final_va;
+ ms->len = page_sz;
+ ms->socket_id = socket;
+ ms->hugepage_sz = page_sz;

- mcfg->memseg[j].iova = hugepage[i].physaddr;
- mcfg->memseg[j].addr = hugepage[i].final_va;
- mcfg->memseg[j].len = hugepage[i].size;
- mcfg->memseg[j].socket_id = hugepage[i].socket_id;
- mcfg->memseg[j].hugepage_sz = hugepage[i].size;
+ rte_fbarray_set_used(arr, ms_idx);
}
/* continuation of previous memseg */
else {
#ifdef RTE_ARCH_PPC_64
/* Use the phy and virt address of the last page as segment
* address for IBM Power architecture */
- mcfg->memseg[j].iova = hugepage[i].physaddr;
- mcfg->memseg[j].addr = hugepage[i].final_va;
+ ms->iova = hugepage[i].physaddr;
+ ms->addr = hugepage[i].final_va;
#endif
- mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz;
+ ms->len += ms->hugepage_sz;
}
- hugepage[i].memseg_id = j;
+ hugepage[i].memseg_id = ms_idx;
+ hugepage[i].memseg_list_id = msl_idx;
}

if (i < nr_hugefiles) {
@@ -1227,7 +1283,7 @@ eal_legacy_hugepage_init(void)
"Please either increase it or request less amount "
"of memory.\n",
i, nr_hugefiles, RTE_STR(CONFIG_RTE_MAX_MEMSEG),
- RTE_MAX_MEMSEG);
+ RTE_MAX_MEMSEG_PER_LIST);
goto fail;
}

@@ -1265,11 +1321,12 @@ getFileSize(int fd)
static int
eal_legacy_hugepage_attach(void)
{
- const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct hugepage_file *hp = NULL;
- unsigned num_hp = 0;
- unsigned i, s = 0; /* s used to track the segment number */
- unsigned max_seg = RTE_MAX_MEMSEG;
+ unsigned int num_hp = 0;
+ unsigned int i;
+ int ms_idx, msl_idx;
+ unsigned int cur_seg, max_seg;
off_t size = 0;
int fd, fd_hugepage = -1;

@@ -1289,46 +1346,57 @@ eal_legacy_hugepage_attach(void)
}

/* map all segments into memory to make sure we get the addrs */
- for (s = 0; s < RTE_MAX_MEMSEG; ++s) {
- void *base_addr;
+ max_seg = 0;
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[msl_idx];
+ struct rte_fbarray *arr = &msl->memseg_arr;
uint64_t mmap_sz;
int mmap_flags = 0;

- /*
- * the first memory segment with len==0 is the one that
- * follows the last valid segment.
- */
- if (mcfg->memseg[s].len == 0)
- break;
+ ms_idx = rte_fbarray_find_next_used(arr, 0);
+ while (ms_idx >= 0) {
+ struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
+ void *base_addr;

- /* get identical addresses as the primary process.
- */
+ ms = rte_fbarray_get(arr, ms_idx);
+
+ /*
+ * the first memory segment with len==0 is the one that
+ * follows the last valid segment.
+ */
+ if (ms->len == 0)
+ break;
+
+ /* get identical addresses as the primary process.
+ */
#ifdef RTE_ARCH_PPC_64
- mmap_flags |= MAP_HUGETLB;
+ mmap_flags |= MAP_HUGETLB;
#endif
- mmap_sz = mcfg->memseg[s].len;
- base_addr = eal_get_virtual_area(mcfg->memseg[s].addr,
- &mmap_sz, mcfg->memseg[s].hugepage_sz, 0,
- mmap_flags);
- if (base_addr == NULL) {
- max_seg = s;
- if (rte_errno == EADDRNOTAVAIL) {
- RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n",
- (unsigned long long)mcfg->memseg[s].len,
- mcfg->memseg[s].addr);
- } else {
- RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p]: '%s'\n",
- (unsigned long long)mcfg->memseg[s].len,
- mcfg->memseg[s].addr,
- rte_strerror(rte_errno));
- }
- if (aslr_enabled() > 0) {
- RTE_LOG(ERR, EAL, "It is recommended to "
- "disable ASLR in the kernel "
- "and retry running both primary "
- "and secondary processes\n");
+ mmap_sz = ms->len;
+ base_addr = eal_get_virtual_area(ms->addr, &mmap_sz,
+ ms->hugepage_sz, 0, mmap_flags);
+ if (base_addr == NULL) {
+ if (rte_errno == EADDRNOTAVAIL) {
+ RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n",
+ (unsigned long long)ms->len,
+ ms->addr);
+ } else {
+ RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p]: '%s'\n",
+ (unsigned long long)ms->len,
+ ms->addr, strerror(errno));
+ }
+ if (aslr_enabled() > 0) {
+ RTE_LOG(ERR, EAL, "It is recommended to "
+ "disable ASLR in the kernel "
+ "and retry running both primary "
+ "and secondary processes\n");
+ }
+ goto error;
}
- goto error;
+ max_seg++;
+ ms_idx++;
+
+ ms_idx = rte_fbarray_find_next_used(arr, ms_idx);
}
}

@@ -1342,46 +1410,67 @@ eal_legacy_hugepage_attach(void)
num_hp = size / sizeof(struct hugepage_file);
RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp);

- s = 0;
- while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0){
- void *addr, *base_addr;
- uintptr_t offset = 0;
- size_t mapping_size;
- /*
- * free previously mapped memory so we can map the
- * hugepages into the space
- */
- base_addr = mcfg->memseg[s].addr;
- munmap(base_addr, mcfg->memseg[s].len);
-
- /* find the hugepages for this segment and map them
- * we don't need to worry about order, as the server sorted the
- * entries before it did the second mmap of them */
- for (i = 0; i < num_hp && offset < mcfg->memseg[s].len; i++){
- if (hp[i].memseg_id == (int)s){
- fd = open(hp[i].filepath, O_RDWR);
- if (fd < 0) {
- RTE_LOG(ERR, EAL, "Could not open %s\n",
- hp[i].filepath);
- goto error;
- }
- mapping_size = hp[i].size;
- addr = mmap(RTE_PTR_ADD(base_addr, offset),
- mapping_size, PROT_READ | PROT_WRITE,
- MAP_SHARED, fd, 0);
- close(fd); /* close file both on success and on failure */
- if (addr == MAP_FAILED ||
- addr != RTE_PTR_ADD(base_addr, offset)) {
- RTE_LOG(ERR, EAL, "Could not mmap %s\n",
- hp[i].filepath);
- goto error;
+ /* map all segments into memory to make sure we get the addrs */
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[msl_idx];
+ struct rte_fbarray *arr = &msl->memseg_arr;
+
+ ms_idx = rte_fbarray_find_next_used(arr, 0);
+ while (ms_idx >= 0) {
+ struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
+ void *addr, *base_addr;
+ uintptr_t offset = 0;
+ size_t mapping_size;
+
+ ms = rte_fbarray_get(arr, ms_idx);
+ /*
+ * free previously mapped memory so we can map the
+ * hugepages into the space
+ */
+ base_addr = ms->addr;
+ munmap(base_addr, ms->len);
+
+ /*
+ * find the hugepages for this segment and map them
+ * we don't need to worry about order, as the server
+ * sorted the entries before it did the second mmap of
+ * them
+ */
+ for (i = 0; i < num_hp && offset < ms->len; i++) {
+ if (hp[i].memseg_id == ms_idx &&
+ hp[i].memseg_list_id ==
+ msl_idx) {
+ fd = open(hp[i].filepath, O_RDWR);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "Could not open %s\n",
+ hp[i].filepath);
+ goto error;
+ }
+ mapping_size = hp[i].size;
+ addr = mmap(RTE_PTR_ADD(base_addr,
+ offset),
+ mapping_size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+ /*
+ * close file both on success and on
+ * failure
+ */
+ close(fd);
+ if (addr == MAP_FAILED ||
+ addr != RTE_PTR_ADD(
+ base_addr, offset)) {
+ RTE_LOG(ERR, EAL, "Could not mmap %s\n",
+ hp[i].filepath);
+ goto error;
+ }
+ offset += mapping_size;
}
- offset+=mapping_size;
}
+ RTE_LOG(DEBUG, EAL, "Mapped segment of size 0x%llx\n",
+ (unsigned long long)ms->len);
+ ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
}
- RTE_LOG(DEBUG, EAL, "Mapped segment %u of size 0x%llx\n", s,
- (unsigned long long)mcfg->memseg[s].len);
- s++;
}
/* unmap the hugepage config file, since we are done using it */
munmap(hp, size);
@@ -1389,8 +1478,28 @@ eal_legacy_hugepage_attach(void)
return 0;

error:
- for (i = 0; i < max_seg && mcfg->memseg[i].len > 0; i++)
- munmap(mcfg->memseg[i].addr, mcfg->memseg[i].len);
+ /* unmap all segments we managed to map so far */
+ cur_seg = 0;
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[msl_idx];
+ struct rte_fbarray *arr = &msl->memseg_arr;
+
+ if (cur_seg >= max_seg)
+ break;
+
+ ms_idx = rte_fbarray_find_next_used(arr, 0);
+ while (ms_idx >= 0) {
+ struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
+
+ if (cur_seg >= max_seg)
+ break;
+ munmap(ms->addr, ms->len);
+
+ cur_seg++;
+ ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
+ }
+ }
if (hp != NULL && hp != MAP_FAILED)
munmap(hp, size);
if (fd_hugepage >= 0)
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index e44ae4d..5192763 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -667,33 +667,53 @@ vfio_get_group_no(const char *sysfs_base,
static int
vfio_type1_dma_map(int vfio_container_fd)
{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
int i, ret;

/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
struct vfio_iommu_type1_dma_map dma_map;
+ struct rte_memseg_list *msl;
+ struct rte_fbarray *arr;
+ int ms_idx, next_idx;

- if (ms[i].addr == NULL)
- break;
+ msl = &rte_eal_get_configuration()->mem_config->memsegs[i];
+ arr = &msl->memseg_arr;

- memset(&dma_map, 0, sizeof(dma_map));
- dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
- dma_map.vaddr = ms[i].addr_64;
- dma_map.size = ms[i].len;
- if (rte_eal_iova_mode() == RTE_IOVA_VA)
- dma_map.iova = dma_map.vaddr;
- else
- dma_map.iova = ms[i].iova;
- dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+ /* skip empty memseg lists */
+ if (arr->count == 0)
+ continue;

- ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+ next_idx = 0;

- if (ret) {
- RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
- "error %i (%s)\n", errno,
- strerror(errno));
- return -1;
+ while ((ms_idx = rte_fbarray_find_next_used(arr,
+ next_idx)) >= 0) {
+ uint64_t addr, len, hw_addr;
+ const struct rte_memseg *ms;
+ next_idx = ms_idx + 1;
+
+ ms = rte_fbarray_get(arr, ms_idx);
+
+ addr = ms->addr_64;
+ len = ms->hugepage_sz;
+ hw_addr = ms->iova;
+
+ memset(&dma_map, 0, sizeof(dma_map));
+ dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+ dma_map.vaddr = addr;
+ dma_map.size = len;
+ dma_map.iova = hw_addr;
+ dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+ VFIO_DMA_MAP_FLAG_WRITE;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA,
+ &dma_map);
+
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
+ "error %i (%s)\n", errno,
+ strerror(errno));
+ return -1;
+ }
}
}

@@ -703,8 +723,8 @@ vfio_type1_dma_map(int vfio_container_fd)
static int
vfio_spapr_dma_map(int vfio_container_fd)
{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
int i, ret;
+ uint64_t hugepage_sz = 0;

struct vfio_iommu_spapr_register_memory reg = {
.argsz = sizeof(reg),
@@ -738,17 +758,31 @@ vfio_spapr_dma_map(int vfio_container_fd)
}

/* create DMA window from 0 to max(phys_addr + len) */
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if (ms[i].addr == NULL)
- break;
-
- create.window_size = RTE_MAX(create.window_size,
- ms[i].iova + ms[i].len);
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ struct rte_fbarray *arr = &msl->memseg_arr;
+ int idx, next_idx;
+
+ if (msl->base_va == NULL)
+ continue;
+ if (msl->memseg_arr.count == 0)
+ continue;
+
+ next_idx = 0;
+ while ((idx = rte_fbarray_find_next_used(arr, next_idx)) >= 0) {
+ const struct rte_memseg *ms = rte_fbarray_get(arr, idx);
+ hugepage_sz = RTE_MAX(hugepage_sz, ms->hugepage_sz);
+ create.window_size = RTE_MAX(create.window_size,
+ ms->iova + ms->len);
+ next_idx = idx + 1;
+ }
}

/* sPAPR requires window size to be a power of 2 */
create.window_size = rte_align64pow2(create.window_size);
- create.page_shift = __builtin_ctzll(ms->hugepage_sz);
+ create.page_shift = __builtin_ctzll(hugepage_sz);
create.levels = 1;

ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
@@ -764,41 +798,61 @@ vfio_spapr_dma_map(int vfio_container_fd)
}

/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
struct vfio_iommu_type1_dma_map dma_map;
+ struct rte_memseg_list *msl;
+ struct rte_fbarray *arr;
+ int ms_idx, next_idx;

- if (ms[i].addr == NULL)
- break;
+ msl = &rte_eal_get_configuration()->mem_config->memsegs[i];
+ arr = &msl->memseg_arr;

- reg.vaddr = (uintptr_t) ms[i].addr;
- reg.size = ms[i].len;
- ret = ioctl(vfio_container_fd,
- VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, "
- "error %i (%s)\n", errno, strerror(errno));
- return -1;
- }
+ /* skip empty memseg lists */
+ if (arr->count == 0)
+ continue;

- memset(&dma_map, 0, sizeof(dma_map));
- dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
- dma_map.vaddr = ms[i].addr_64;
- dma_map.size = ms[i].len;
- if (rte_eal_iova_mode() == RTE_IOVA_VA)
- dma_map.iova = dma_map.vaddr;
- else
- dma_map.iova = ms[i].iova;
- dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
- VFIO_DMA_MAP_FLAG_WRITE;
+ next_idx = 0;

- ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+ while ((ms_idx = rte_fbarray_find_next_used(arr,
+ next_idx)) >= 0) {
+ uint64_t addr, len, hw_addr;
+ const struct rte_memseg *ms;
+ next_idx = ms_idx + 1;

- if (ret) {
- RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
- "error %i (%s)\n", errno, strerror(errno));
- return -1;
- }
+ ms = rte_fbarray_get(arr, ms_idx);
+
+ addr = ms->addr_64;
+ len = ms->hugepage_sz;
+ hw_addr = ms->iova;

+ reg.vaddr = (uintptr_t) addr;
+ reg.size = len;
+ ret = ioctl(vfio_container_fd,
+ VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ memset(&dma_map, 0, sizeof(dma_map));
+ dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+ dma_map.vaddr = addr;
+ dma_map.size = len;
+ dma_map.iova = hw_addr;
+ dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+ VFIO_DMA_MAP_FLAG_WRITE;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA,
+ &dma_map);
+
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
+ "error %i (%s)\n", errno,
+ strerror(errno));
+ return -1;
+ }
+ }
}

return 0;
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index a938a2f..4c2e959 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -25,7 +25,6 @@ DPDK_2.0 {
rte_eal_devargs_type_count;
rte_eal_get_configuration;
rte_eal_get_lcore_state;
- rte_eal_get_physmem_layout;
rte_eal_get_physmem_size;
rte_eal_has_hugepages;
rte_eal_hpet_init;
@@ -215,6 +214,8 @@ DPDK_18.05 {
global:

rte_num_sockets;
+ rte_mem_virt2memseg;
+ rte_mem_virt2memseg_list;
rte_malloc_dump_heaps;
rte_fbarray_init;
rte_fbarray_destroy;
diff --git a/test/test/test_malloc.c b/test/test/test_malloc.c
index d23192c..8484fb6 100644
--- a/test/test/test_malloc.c
+++ b/test/test/test_malloc.c
@@ -12,6 +12,7 @@

#include <rte_common.h>
#include <rte_memory.h>
+#include <rte_eal_memconfig.h>
#include <rte_per_lcore.h>
#include <rte_launch.h>
#include <rte_eal.h>
@@ -705,15 +706,23 @@ test_malloc_bad_params(void)
return -1;
}

-/* Check if memory is available on a specific socket */
+/* Check if memory is available on a specific socket */
static int
is_mem_on_socket(int32_t socket)
{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+ const struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
unsigned i;

- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if (socket == ms[i].socket_id)
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ const struct rte_memseg_list *msl =
+ &mcfg->memsegs[i];
+ const struct rte_fbarray *arr = &msl->memseg_arr;
+
+ if (msl->socket_id != socket)
+ continue;
+
+ if (arr->count)
return 1;
}
return 0;
@@ -726,16 +735,8 @@ is_mem_on_socket(int32_t socket)
static int32_t
addr_to_socket(void * addr)
{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
- unsigned i;
-
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if ((ms[i].addr <= addr) &&
- ((uintptr_t)addr <
- ((uintptr_t)ms[i].addr + (uintptr_t)ms[i].len)))
- return ms[i].socket_id;
- }
- return -1;
+ const struct rte_memseg *ms = rte_mem_virt2memseg(addr, NULL);
+ return ms == NULL ? -1 : ms->socket_id;
}

/* Test using rte_[c|m|zm]alloc_socket() on a specific socket */
diff --git a/test/test/test_memory.c b/test/test/test_memory.c
index 972321f..8cb52d7 100644
--- a/test/test/test_memory.c
+++ b/test/test/test_memory.c
@@ -5,8 +5,11 @@
#include <stdio.h>
#include <stdint.h>

+#include <rte_eal.h>
+#include <rte_eal_memconfig.h>
#include <rte_memory.h>
#include <rte_common.h>
+#include <rte_memzone.h>

#include "test.h"

@@ -25,10 +28,12 @@
static int
test_memory(void)
{
+ const struct rte_memzone *mz = NULL;
uint64_t s;
unsigned i;
size_t j;
- const struct rte_memseg *mem;
+ struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;

/*
* dump the mapped memory: the python-expect script checks
@@ -40,20 +45,42 @@ test_memory(void)
/* check that memory size is != 0 */
s = rte_eal_get_physmem_size();
if (s == 0) {
- printf("No memory detected\n");
- return -1;
+ printf("No memory detected, attempting to allocate\n");
+ mz = rte_memzone_reserve("tmp", 1000, SOCKET_ID_ANY, 0);
+
+ if (!mz) {
+ printf("Failed to allocate a memzone\n");
+ return -1;
+ }
}

/* try to read memory (should not segfault) */
- mem = rte_eal_get_physmem_layout();
- for (i = 0; i < RTE_MAX_MEMSEG && mem[i].addr != NULL ; i++) {
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ struct rte_fbarray *arr = &msl->memseg_arr;
+ int search_idx, cur_idx;
+
+ if (arr->count == 0)
+ continue;
+
+ search_idx = 0;

- /* check memory */
- for (j = 0; j<mem[i].len; j++) {
- *((volatile uint8_t *) mem[i].addr + j);
+ while ((cur_idx = rte_fbarray_find_next_used(arr,
+ search_idx)) >= 0) {
+ const struct rte_memseg *ms;
+
+ ms = rte_fbarray_get(arr, cur_idx);
+
+ /* check memory */
+ for (j = 0; j < ms->len; j++)
+ *((volatile uint8_t *) ms->addr + j);
+ search_idx = cur_idx + 1;
}
}

+ if (mz)
+ rte_memzone_free(mz);
+
return 0;
}

diff --git a/test/test/test_memzone.c b/test/test/test_memzone.c
index 8ece1ac..47f4de8 100644
--- a/test/test/test_memzone.c
+++ b/test/test/test_memzone.c
@@ -108,22 +108,25 @@ static int
test_memzone_reserve_flags(void)
{
const struct rte_memzone *mz;
- const struct rte_memseg *ms;
int hugepage_2MB_avail = 0;
int hugepage_1GB_avail = 0;
int hugepage_16MB_avail = 0;
int hugepage_16GB_avail = 0;
const size_t size = 100;
int i = 0;
- ms = rte_eal_get_physmem_layout();
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if (ms[i].hugepage_sz == RTE_PGSIZE_2M)
+
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+
+ if (msl->hugepage_sz == RTE_PGSIZE_2M)
hugepage_2MB_avail = 1;
- if (ms[i].hugepage_sz == RTE_PGSIZE_1G)
+ if (msl->hugepage_sz == RTE_PGSIZE_1G)
hugepage_1GB_avail = 1;
- if (ms[i].hugepage_sz == RTE_PGSIZE_16M)
+ if (msl->hugepage_sz == RTE_PGSIZE_16M)
hugepage_16MB_avail = 1;
- if (ms[i].hugepage_sz == RTE_PGSIZE_16G)
+ if (msl->hugepage_sz == RTE_PGSIZE_16G)
hugepage_16GB_avail = 1;
}
/* Display the availability of 2MB ,1GB, 16MB, 16GB pages */
--
2.7.4
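
The memseg iteration pattern that recurs throughout this patch (walk each
memseg list, then each used slot in its fbarray) can be distilled into a
standalone sketch like the one below. It uses the internal mem_config access
shown in the patch; locking and error handling are omitted, and the helper
name is made up for illustration.

#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_fbarray.h>
#include <rte_memory.h>

/* illustrative helper, not part of the patch: visit every used memseg */
static void
walk_memsegs(void (*fn)(const struct rte_memseg *ms))
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i, ms_idx;

	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];
		struct rte_fbarray *arr = &msl->memseg_arr;

		/* skip empty memseg lists, as the patch does */
		if (arr->count == 0)
			continue;

		ms_idx = rte_fbarray_find_next_used(arr, 0);
		while (ms_idx >= 0) {
			fn(rte_fbarray_get(arr, ms_idx));
			ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
		}
	}
}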
Olivier Matz
2018-03-19 17:39:04 UTC
Permalink
On Sat, Mar 03, 2018 at 01:46:01PM +0000, Anatoly Burakov wrote:

[...]
Post by Anatoly Burakov
--- a/config/common_base
+++ b/config/common_base
@@ -61,7 +61,20 @@ CONFIG_RTE_CACHE_LINE_SIZE=64
CONFIG_RTE_LIBRTE_EAL=y
CONFIG_RTE_MAX_LCORE=128
CONFIG_RTE_MAX_NUMA_NODES=8
-CONFIG_RTE_MAX_MEMSEG=256
+CONFIG_RTE_MAX_MEMSEG_LISTS=32
+# each memseg list will be limited to either RTE_MAX_MEMSEG_PER_LIST pages
+# or RTE_MAX_MEM_PER_LIST gigabytes worth of memory, whichever is the smallest
+CONFIG_RTE_MAX_MEMSEG_PER_LIST=8192
+CONFIG_RTE_MAX_MEM_PER_LIST=32
+# a "type" is a combination of page size and NUMA node. total number of memseg
+# lists per type will be limited to either RTE_MAX_MEMSEG_PER_TYPE pages (split
+# over multiple lists of RTE_MAX_MEMSEG_PER_LIST pages), or RTE_MAX_MEM_PER_TYPE
+# gigabytes of memory (split over multiple lists of RTE_MAX_MEM_PER_LIST),
+# whichever is the smallest
+CONFIG_RTE_MAX_MEMSEG_PER_TYPE=32768
+CONFIG_RTE_MAX_MEM_PER_TYPE=128
+# legacy mem mode only
+CONFIG_RTE_MAX_LEGACY_MEMSEG=256
Would it be possible to suffix CONFIG_RTE_MAX_MEM_PER_LIST and
CONFIG_RTE_MAX_MEM_PER_TYPE with _GB? It's not that obvious that it is
gigabytes.

What is the impact of changing one of these values on the ABI? And what
would be the impact on performance? The underlying question is: shall we
increase these values to avoid changing them later?
Burakov, Anatoly
2018-03-20 09:47:43 UTC
Permalink
Post by Olivier Matz
[...]
Post by Anatoly Burakov
--- a/config/common_base
+++ b/config/common_base
@@ -61,7 +61,20 @@ CONFIG_RTE_CACHE_LINE_SIZE=64
CONFIG_RTE_LIBRTE_EAL=y
CONFIG_RTE_MAX_LCORE=128
CONFIG_RTE_MAX_NUMA_NODES=8
-CONFIG_RTE_MAX_MEMSEG=256
+CONFIG_RTE_MAX_MEMSEG_LISTS=32
+# each memseg list will be limited to either RTE_MAX_MEMSEG_PER_LIST pages
+# or RTE_MAX_MEM_PER_LIST gigabytes worth of memory, whichever is the smallest
+CONFIG_RTE_MAX_MEMSEG_PER_LIST=8192
+CONFIG_RTE_MAX_MEM_PER_LIST=32
+# a "type" is a combination of page size and NUMA node. total number of memseg
+# lists per type will be limited to either RTE_MAX_MEMSEG_PER_TYPE pages (split
+# over multiple lists of RTE_MAX_MEMSEG_PER_LIST pages), or RTE_MAX_MEM_PER_TYPE
+# gigabytes of memory (split over multiple lists of RTE_MAX_MEM_PER_LIST),
+# whichever is the smallest
+CONFIG_RTE_MAX_MEMSEG_PER_TYPE=32768
+CONFIG_RTE_MAX_MEM_PER_TYPE=128
+# legacy mem mode only
+CONFIG_RTE_MAX_LEGACY_MEMSEG=256
Would it be possible to suffix CONFIG_RTE_MAX_MEM_PER_LIST and
CONFIG_RTE_MAX_MEM_PER_TYPE with _GB? It's not that obvious that it is
gigabytes.
Sure, will add this.
Post by Olivier Matz
What is the impact of changing one of these values on the ABI?
Some of them will change the ABI, some won't. MAX_MEMSEG_LISTS will
change the ABI because it's part of rte_eal_memconfig, but the other
values are not part of it and are only used during init (and
LEGACY_MEMSEG is already removed in the GitHub code).
Post by Olivier Matz
And what would be the impact on performance?
That depends on what you mean by performance. Generally, no impact on
performance will be noticeable, because we're not really doing anything
differently - a page is a page, no matter how or when it is mapped.
These changes might also speed up some lookup operations on memseg lists
themselves.
Post by Olivier Matz
The underlying question is: shall we increase these values to avoid changing them later?
I do plan to increase the MAX_MEMSEG_LISTS value to at least 64.
--
Thanks,
Anatoly
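
To make the interaction of the two per-list limits discussed above concrete,
here is a rough sketch (not part of the patch; the macro names simply mirror
the config options, and the actual init code may differ): whichever limit is
hit first bounds the list.

#include <stdio.h>
#include <stdint.h>

#define MAX_MEMSEG_PER_LIST 8192ULL	/* RTE_MAX_MEMSEG_PER_LIST */
#define MAX_MEM_PER_LIST_GB 32ULL	/* RTE_MAX_MEM_PER_LIST, in gigabytes */

/* illustrative only: how many segments a single list can hold */
static uint64_t
segs_per_list(uint64_t page_sz)
{
	uint64_t by_mem = (MAX_MEM_PER_LIST_GB << 30) / page_sz;

	return by_mem < MAX_MEMSEG_PER_LIST ? by_mem : MAX_MEMSEG_PER_LIST;
}

int
main(void)
{
	/* 2MB pages: the page-count cap wins - 8192 segments (16GB per list) */
	printf("2MB pages: %llu segs/list\n",
		(unsigned long long)segs_per_list(2ULL << 20));
	/* 1GB pages: the memory cap wins - 32 segments (32GB per list) */
	printf("1GB pages: %llu segs/list\n",
		(unsigned long long)segs_per_list(1ULL << 30));
	return 0;
}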
Anatoly Burakov
2018-03-03 13:45:52 UTC
Permalink
Malloc heap is now a doubly linked list, so it's now possible to
iterate over each malloc element regardless of its state.

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/include/rte_malloc.h | 9 +++++++++
lib/librte_eal/common/malloc_elem.c | 24 ++++++++++++++++++++++++
lib/librte_eal/common/malloc_elem.h | 6 ++++++
lib/librte_eal/common/malloc_heap.c | 22 ++++++++++++++++++++++
lib/librte_eal/common/malloc_heap.h | 3 +++
lib/librte_eal/common/rte_malloc.c | 16 ++++++++++++++++
lib/librte_eal/rte_eal_version.map | 1 +
7 files changed, 81 insertions(+)

diff --git a/lib/librte_eal/common/include/rte_malloc.h b/lib/librte_eal/common/include/rte_malloc.h
index f02a8ba..a3fc83e 100644
--- a/lib/librte_eal/common/include/rte_malloc.h
+++ b/lib/librte_eal/common/include/rte_malloc.h
@@ -278,6 +278,15 @@ void
rte_malloc_dump_stats(FILE *f, const char *type);

/**
+ * Dump contents of all malloc heaps to a file.
+ *
+ * @param f
+ * A pointer to a file for output
+ */
+void
+rte_malloc_dump_heaps(FILE *f);
+
+/**
* Set the maximum amount of allocated memory for this type.
*
* This is not yet implemented
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index eb41200..e02ed88 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2010-2014 Intel Corporation
*/
+#include <inttypes.h>
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
@@ -434,3 +435,26 @@ malloc_elem_resize(struct malloc_elem *elem, size_t size)
}
return 0;
}
+
+static inline const char *
+elem_state_to_str(enum elem_state state)
+{
+ switch (state) {
+ case ELEM_PAD:
+ return "PAD";
+ case ELEM_BUSY:
+ return "BUSY";
+ case ELEM_FREE:
+ return "FREE";
+ }
+ return "ERROR";
+}
+
+void
+malloc_elem_dump(const struct malloc_elem *elem, FILE *f)
+{
+ fprintf(f, "Malloc element at %p (%s)\n", elem,
+ elem_state_to_str(elem->state));
+ fprintf(f, " len: 0x%zx pad: 0x%" PRIx32 "\n", elem->size, elem->pad);
+ fprintf(f, " prev: %p next: %p\n", elem->prev, elem->next);
+}
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 238e451..40e8eb5 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -149,6 +149,12 @@ int
malloc_elem_resize(struct malloc_elem *elem, size_t size);

/*
+ * dump contents of malloc elem to a file.
+ */
+void
+malloc_elem_dump(const struct malloc_elem *elem, FILE *f);
+
+/*
* Given an element size, compute its freelist index.
*/
size_t
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 9c95166..44538d7 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -217,6 +217,28 @@ malloc_heap_get_stats(struct malloc_heap *heap,
return 0;
}

+/*
+ * Function to retrieve data for heap on given socket
+ */
+void
+malloc_heap_dump(struct malloc_heap *heap, FILE *f)
+{
+ struct malloc_elem *elem;
+
+ rte_spinlock_lock(&heap->lock);
+
+ fprintf(f, "Heap size: 0x%zx\n", heap->total_size);
+ fprintf(f, "Heap alloc count: %u\n", heap->alloc_count);
+
+ elem = heap->first;
+ while (elem) {
+ malloc_elem_dump(elem, f);
+ elem = elem->next;
+ }
+
+ rte_spinlock_unlock(&heap->lock);
+}
+
int
rte_eal_malloc_heap_init(void)
{
diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h
index ab0005c..bb28422 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -37,6 +37,9 @@ int
malloc_heap_get_stats(struct malloc_heap *heap,
struct rte_malloc_socket_stats *socket_stats);

+void
+malloc_heap_dump(struct malloc_heap *heap, FILE *f);
+
int
rte_eal_malloc_heap_init(void);

diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index 970813e..80fb6cc 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -182,6 +182,22 @@ rte_malloc_get_socket_stats(int socket,
}

/*
+ * Function to dump contents of all heaps
+ */
+void
+rte_malloc_dump_heaps(FILE *f)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ unsigned int socket;
+
+ for (socket = 0; socket < rte_num_sockets(); socket++) {
+ fprintf(f, "Heap on socket %i:\n", socket);
+ malloc_heap_dump(&mcfg->malloc_heaps[socket], f);
+ }
+
+}
+
+/*
* Print stats on memory type. If type is NULL, info on all types is printed
*/
void
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 52f5940..18b8bf5 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -215,6 +215,7 @@ DPDK_18.05 {
global:

rte_num_sockets;
+ rte_malloc_dump_heaps;

} DPDK_18.02;
--
2.7.4
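
A minimal usage sketch for the dump API added by this patch (assuming the
headers as modified by this series; EAL arguments come from the command line
as usual):

#include <stdio.h>
#include <rte_eal.h>
#include <rte_malloc.h>

int
main(int argc, char **argv)
{
	if (rte_eal_init(argc, argv) < 0)
		return -1;

	/* walks every heap and prints each malloc element (FREE/BUSY/PAD) */
	rte_malloc_dump_heaps(stdout);

	return 0;
}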
Anatoly Burakov
2018-03-03 13:45:59 UTC
Permalink
This adds a "--legacy-mem" command-line switch. It will be used to
go back to the old memory behavior, one where we can't dynamically
allocate/free memory (the downside), but one where the user can
get physically contiguous memory, like before (the upside).

For now, nothing but the legacy behavior exists, non-legacy
memory init sequence will be added later.

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/bsdapp/eal/eal.c | 3 +++
lib/librte_eal/common/eal_common_options.c | 4 ++++
lib/librte_eal/common/eal_internal_cfg.h | 4 ++++
lib/librte_eal/common/eal_options.h | 2 ++
lib/librte_eal/linuxapp/eal/eal.c | 1 +
lib/librte_eal/linuxapp/eal/eal_memory.c | 24 ++++++++++++++++++++----
6 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 4eafcb5..45e5670 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -531,6 +531,9 @@ rte_eal_init(int argc, char **argv)
return -1;
}

+ /* FreeBSD always uses legacy memory model */
+ internal_config.legacy_mem = true;
+
if (eal_plugins_init() < 0) {
rte_eal_init_alert("Cannot init plugins\n");
rte_errno = EINVAL;
diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index dbc3fb5..3e92551 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -74,6 +74,7 @@ eal_long_options[] = {
{OPT_VFIO_INTR, 1, NULL, OPT_VFIO_INTR_NUM },
{OPT_VMWARE_TSC_MAP, 0, NULL, OPT_VMWARE_TSC_MAP_NUM },
{OPT_SINGLE_FILE_SEGMENTS, 0, NULL, OPT_SINGLE_FILE_SEGMENTS_NUM},
+ {OPT_LEGACY_MEM, 0, NULL, OPT_LEGACY_MEM_NUM },
{0, 0, NULL, 0 }
};

@@ -1165,6 +1166,9 @@ eal_parse_common_option(int opt, const char *optarg,
case OPT_SINGLE_FILE_SEGMENTS_NUM:
conf->single_file_segments = 1;
break;
+ case OPT_LEGACY_MEM_NUM:
+ conf->legacy_mem = 1;
+ break;

/* don't know what to do, leave this to caller */
default:
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 3e31ac6..c8a0676 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -51,6 +51,10 @@ struct internal_config {
/**< true if storing all pages within single files (per-page-size,
* per-node).
*/
+ volatile unsigned legacy_mem;
+ /**< true to enable legacy memory behavior (no dynamic allocation,
+ * contiguous segments).
+ */
volatile int syslog_facility; /**< facility passed to openlog() */
/** default interrupt mode for VFIO */
volatile enum rte_intr_mode vfio_intr_mode;
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index a4b80d5..f9a679d 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -57,6 +57,8 @@ enum {
OPT_VMWARE_TSC_MAP_NUM,
#define OPT_SINGLE_FILE_SEGMENTS "single-file-segments"
OPT_SINGLE_FILE_SEGMENTS_NUM,
+#define OPT_LEGACY_MEM "legacy-mem"
+ OPT_LEGACY_MEM_NUM,
OPT_LONG_MAX_NUM
};

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index c84e6bf..5207713 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -349,6 +349,7 @@ eal_usage(const char *prgname)
" --"OPT_CREATE_UIO_DEV" Create /dev/uioX (usually done by hotplug)\n"
" --"OPT_VFIO_INTR" Interrupt mode for VFIO (legacy|msi|msix)\n"
" --"OPT_SINGLE_FILE_SEGMENTS" Put all hugepage memory in single files\n"
+ " --"OPT_LEGACY_MEM" Legacy memory mode (no dynamic allocation, contiguous segments)\n"
"\n");
/* Allow the application to print its usage message too if hook is set */
if ( rte_application_usage_hook ) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5c11d77..b9bcb75 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -919,8 +919,8 @@ huge_recover_sigbus(void)
* 6. unmap the first mapping
* 7. fill memsegs in configuration with contiguous zones
*/
-int
-rte_eal_hugepage_init(void)
+static int
+eal_legacy_hugepage_init(void)
{
struct rte_mem_config *mcfg;
struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
@@ -1262,8 +1262,8 @@ getFileSize(int fd)
* configuration and finds the hugepages which form that segment, mapping them
* in order to form a contiguous block in the virtual memory space
*/
-int
-rte_eal_hugepage_attach(void)
+static int
+eal_legacy_hugepage_attach(void)
{
const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct hugepage_file *hp = NULL;
@@ -1399,6 +1399,22 @@ rte_eal_hugepage_attach(void)
}

int
+rte_eal_hugepage_init(void)
+{
+ if (internal_config.legacy_mem)
+ return eal_legacy_hugepage_init();
+ return -1;
+}
+
+int
+rte_eal_hugepage_attach(void)
+{
+ if (internal_config.legacy_mem)
+ return eal_legacy_hugepage_attach();
+ return -1;
+}
+
+int
rte_eal_using_phys_addrs(void)
{
return phys_addrs_available;
--
2.7.4
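
For reference, a sketch of how an application would opt back into the old
behavior using the switch added above (the core mask and argument list are
made up for illustration):

#include <rte_eal.h>

int
main(void)
{
	/* request the legacy (static, physically contiguous) memory model */
	char *eal_args[] = { "app", "-c", "0x1", "--legacy-mem" };
	int eal_argc = 4;

	if (rte_eal_init(eal_argc, eal_args) < 0)
		return -1;
	/* ... application code ... */
	return 0;
}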
Anatoly Burakov
2018-03-03 13:46:07 UTC
Permalink
This will be helpful down the line when we implement support for
allocating physically contiguous memory. We can no longer guarantee
physically contiguous memory unless we're in IOVA_AS_VA mode, but
we can certainly try and see if we succeed. In addition, this would
be useful for e.g. PMDs that may allocate chunks smaller than the
page size but that must not cross a page boundary; in that case we
will be able to accommodate that request.

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/bsdapp/eal/Makefile | 1 +
lib/librte_eal/common/eal_common_memalloc.c | 49 +++++++++++++++++++++++++++++
lib/librte_eal/common/eal_memalloc.h | 5 +++
lib/librte_eal/common/meson.build | 1 +
lib/librte_eal/linuxapp/eal/Makefile | 1 +
5 files changed, 57 insertions(+)
create mode 100644 lib/librte_eal/common/eal_common_memalloc.c

diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile
index 19f9322..907e30d 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -41,6 +41,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_timer.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_memzone.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_log.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_launch.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_memalloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_memory.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_tailqs.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_errno.c
diff --git a/lib/librte_eal/common/eal_common_memalloc.c b/lib/librte_eal/common/eal_common_memalloc.c
new file mode 100644
index 0000000..62e8c16
--- /dev/null
+++ b/lib/librte_eal/common/eal_common_memalloc.c
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#include <rte_lcore.h>
+#include <rte_fbarray.h>
+#include <rte_memzone.h>
+#include <rte_memory.h>
+#include <rte_eal_memconfig.h>
+
+#include "eal_private.h"
+#include "eal_internal_cfg.h"
+#include "eal_memalloc.h"
+
+bool
+eal_memalloc_is_contig(struct rte_memseg_list *msl, void *start,
+ size_t len)
+{
+ const struct rte_memseg *ms;
+ uint64_t page_sz;
+ void *end;
+ int start_page, end_page, cur_page;
+ rte_iova_t expected;
+
+ /* for legacy memory, it's always contiguous */
+ if (internal_config.legacy_mem)
+ return true;
+
+ /* figure out how many pages we need to fit in current data */
+ page_sz = msl->hugepage_sz;
+ end = RTE_PTR_ADD(start, len);
+
+ start_page = RTE_PTR_DIFF(start, msl->base_va) / page_sz;
+ end_page = RTE_PTR_DIFF(end, msl->base_va) / page_sz;
+
+ /* now, look for contiguous memory */
+ ms = rte_fbarray_get(&msl->memseg_arr, start_page);
+ expected = ms->iova + page_sz;
+
+ for (cur_page = start_page + 1; cur_page < end_page;
+ cur_page++, expected += page_sz) {
+ ms = rte_fbarray_get(&msl->memseg_arr, cur_page);
+
+ if (ms->iova != expected)
+ return false;
+ }
+
+ return true;
+}
diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
index adf59c4..08ba70e 100644
--- a/lib/librte_eal/common/eal_memalloc.h
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -8,6 +8,7 @@
#include <stdbool.h>

#include <rte_memory.h>
+#include <rte_eal_memconfig.h>

struct rte_memseg *
eal_memalloc_alloc_page(uint64_t size, int socket);
@@ -19,4 +20,8 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n, uint64_t size,
int
eal_memalloc_free_page(struct rte_memseg *ms);

+bool
+eal_memalloc_is_contig(struct rte_memseg_list *msl, void *start,
+ size_t len);
+
#endif // EAL_MEMALLOC_H
diff --git a/lib/librte_eal/common/meson.build b/lib/librte_eal/common/meson.build
index 7d02191..a1ada24 100644
--- a/lib/librte_eal/common/meson.build
+++ b/lib/librte_eal/common/meson.build
@@ -16,6 +16,7 @@ common_sources = files(
'eal_common_launch.c',
'eal_common_lcore.c',
'eal_common_log.c',
+ 'eal_common_memalloc.c',
'eal_common_memory.c',
'eal_common_memzone.c',
'eal_common_options.c',
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index af6b9be..5380ba8 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -49,6 +49,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_timer.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memzone.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_log.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_launch.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memalloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memory.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_tailqs.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_errno.c
--
2.7.4
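
As a side note on the "must not cross the page boundary" use case mentioned
in the commit message: for a buffer smaller than the page size, the check
reduces to something like the sketch below (the helper name is made up;
page_sz stands for the page size of the backing segment).

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* illustrative only: true if [addr, addr + len) stays within one page */
static bool
fits_in_one_page(uintptr_t addr, size_t len, uint64_t page_sz)
{
	uintptr_t first_page = addr & ~(page_sz - 1);
	uintptr_t last_page = (addr + len - 1) & ~(page_sz - 1);

	return first_page == last_page;
}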
Anatoly Burakov
2018-03-03 13:46:09 UTC
Permalink
This adds a new set of _contig API's to rte_memzone.

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/eal_common_memzone.c | 44 ++++++++
lib/librte_eal/common/include/rte_memzone.h | 154 ++++++++++++++++++++++++++++
2 files changed, 198 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index 75c7dd9..8c9aa28 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -170,6 +170,12 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
socket_id = SOCKET_ID_ANY;

if (len == 0) {
+ /* len == 0 is only allowed for non-contiguous zones */
+ if (contig) {
+ RTE_LOG(DEBUG, EAL, "Reserving zero-length contiguous memzones is not supported\n");
+ rte_errno = EINVAL;
+ return NULL;
+ }
if (bound != 0)
requested_len = bound;
else {
@@ -251,6 +257,19 @@ rte_memzone_reserve_bounded(const char *name, size_t len, int socket_id,

/*
* Return a pointer to a correctly filled memzone descriptor (with a
+ * specified alignment and boundary). If the allocation cannot be done,
+ * return NULL.
+ */
+const struct rte_memzone *
+rte_memzone_reserve_bounded_contig(const char *name, size_t len, int socket_id,
+ unsigned int flags, unsigned int align, unsigned int bound)
+{
+ return rte_memzone_reserve_thread_safe(name, len, socket_id, flags,
+ align, bound, true);
+}
+
+/*
+ * Return a pointer to a correctly filled memzone descriptor (with a
* specified alignment). If the allocation cannot be done, return NULL.
*/
const struct rte_memzone *
@@ -262,6 +281,18 @@ rte_memzone_reserve_aligned(const char *name, size_t len, int socket_id,
}

/*
+ * Return a pointer to a correctly filled memzone descriptor (with a
+ * specified alignment). If the allocation cannot be done, return NULL.
+ */
+const struct rte_memzone *
+rte_memzone_reserve_aligned_contig(const char *name, size_t len, int socket_id,
+ unsigned int flags, unsigned int align)
+{
+ return rte_memzone_reserve_thread_safe(name, len, socket_id, flags,
+ align, 0, true);
+}
+
+/*
* Return a pointer to a correctly filled memzone descriptor. If the
* allocation cannot be done, return NULL.
*/
@@ -274,6 +305,19 @@ rte_memzone_reserve(const char *name, size_t len, int socket_id,
false);
}

+/*
+ * Return a pointer to a correctly filled memzone descriptor. If the
+ * allocation cannot be done, return NULL.
+ */
+const struct rte_memzone *
+rte_memzone_reserve_contig(const char *name, size_t len, int socket_id,
+ unsigned int flags)
+{
+ return rte_memzone_reserve_thread_safe(name, len, socket_id,
+ flags, RTE_CACHE_LINE_SIZE, 0,
+ true);
+}
+
int
rte_memzone_free(const struct rte_memzone *mz)
{
diff --git a/lib/librte_eal/common/include/rte_memzone.h b/lib/librte_eal/common/include/rte_memzone.h
index a69f068..5f1293f 100644
--- a/lib/librte_eal/common/include/rte_memzone.h
+++ b/lib/librte_eal/common/include/rte_memzone.h
@@ -227,6 +227,160 @@ const struct rte_memzone *rte_memzone_reserve_bounded(const char *name,
unsigned flags, unsigned align, unsigned bound);

/**
+ * Reserve an IOVA-contiguous portion of physical memory.
+ *
+ * This function reserves some IOVA-contiguous memory and returns a pointer to a
+ * correctly filled memzone descriptor. If the allocation cannot be
+ * done, return NULL.
+ *
+ * @param name
+ * The name of the memzone. If it already exists, the function will
+ * fail and return NULL.
+ * @param len
+ * The size of the memory to be reserved.
+ * @param socket_id
+ * The socket identifier in the case of
+ * NUMA. The value can be SOCKET_ID_ANY if there is no NUMA
+ * constraint for the reserved zone.
+ * @param flags
+ * The flags parameter is used to request memzones to be
+ * taken from specifically sized hugepages.
+ * - RTE_MEMZONE_2MB - Reserved from 2MB pages
+ * - RTE_MEMZONE_1GB - Reserved from 1GB pages
+ * - RTE_MEMZONE_16MB - Reserved from 16MB pages
+ * - RTE_MEMZONE_16GB - Reserved from 16GB pages
+ * - RTE_MEMZONE_256KB - Reserved from 256KB pages
+ * - RTE_MEMZONE_256MB - Reserved from 256MB pages
+ * - RTE_MEMZONE_512MB - Reserved from 512MB pages
+ * - RTE_MEMZONE_4GB - Reserved from 4GB pages
+ * - RTE_MEMZONE_SIZE_HINT_ONLY - Allow alternative page size to be used if
+ * the requested page size is unavailable.
+ * If this flag is not set, the function
+ * will return error on an unavailable size
+ * request.
+ * @return
+ * A pointer to a correctly-filled read-only memzone descriptor, or NULL
+ * on error.
+ * On error case, rte_errno will be set appropriately:
+ * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure
+ * - E_RTE_SECONDARY - function was called from a secondary process instance
+ * - ENOSPC - the maximum number of memzones has already been allocated
+ * - EEXIST - a memzone with the same name already exists
+ * - ENOMEM - no appropriate memory area found in which to create memzone
+ * - EINVAL - invalid parameters
+ */
+const struct rte_memzone *rte_memzone_reserve_contig(const char *name,
+ size_t len, int socket_id, unsigned int flags);
+
+/**
+ * Reserve an IOVA-contiguous portion of physical memory with alignment on a
+ * specified boundary.
+ *
+ * This function reserves some IOVA-contiguous memory with alignment on a
+ * specified boundary, and returns a pointer to a correctly filled memzone
+ * descriptor. If the allocation cannot be done or if the alignment
+ * is not a power of 2, returns NULL.
+ *
+ * @param name
+ * The name of the memzone. If it already exists, the function will
+ * fail and return NULL.
+ * @param len
+ * The size of the memory to be reserved.
+ * @param socket_id
+ * The socket identifier in the case of
+ * NUMA. The value can be SOCKET_ID_ANY if there is no NUMA
+ * constraint for the reserved zone.
+ * @param flags
+ * The flags parameter is used to request memzones to be
+ * taken from specifically sized hugepages.
+ * - RTE_MEMZONE_2MB - Reserved from 2MB pages
+ * - RTE_MEMZONE_1GB - Reserved from 1GB pages
+ * - RTE_MEMZONE_16MB - Reserved from 16MB pages
+ * - RTE_MEMZONE_16GB - Reserved from 16GB pages
+ * - RTE_MEMZONE_256KB - Reserved from 256KB pages
+ * - RTE_MEMZONE_256MB - Reserved from 256MB pages
+ * - RTE_MEMZONE_512MB - Reserved from 512MB pages
+ * - RTE_MEMZONE_4GB - Reserved from 4GB pages
+ * - RTE_MEMZONE_SIZE_HINT_ONLY - Allow alternative page size to be used if
+ * the requested page size is unavailable.
+ * If this flag is not set, the function
+ * will return error on an unavailable size
+ * request.
+ * @param align
+ * Alignment for resulting memzone. Must be a power of 2.
+ * @return
+ * A pointer to a correctly-filled read-only memzone descriptor, or NULL
+ * on error.
+ * On error case, rte_errno will be set appropriately:
+ * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure
+ * - E_RTE_SECONDARY - function was called from a secondary process instance
+ * - ENOSPC - the maximum number of memzones has already been allocated
+ * - EEXIST - a memzone with the same name already exists
+ * - ENOMEM - no appropriate memory area found in which to create memzone
+ * - EINVAL - invalid parameters
+ */
+const struct rte_memzone *rte_memzone_reserve_aligned_contig(const char *name,
+ size_t len, int socket_id, unsigned int flags,
+ unsigned int align);
+
+/**
+ * Reserve an IOVA-contiguous portion of physical memory with specified
+ * alignment and boundary.
+ *
+ * This function reserves some IOVA-contiguous memory with specified alignment
+ * and boundary, and returns a pointer to a correctly filled memzone
+ * descriptor. If the allocation cannot be done or if the alignment
+ * or boundary are not a power of 2, returns NULL.
+ * The memory buffer is reserved in such a way that it won't cross the
+ * specified boundary. That implies that the requested length should be
+ * less than or equal to the boundary.
+ *
+ * @param name
+ * The name of the memzone. If it already exists, the function will
+ * fail and return NULL.
+ * @param len
+ * The size of the memory to be reserved.
+ * @param socket_id
+ * The socket identifier in the case of
+ * NUMA. The value can be SOCKET_ID_ANY if there is no NUMA
+ * constraint for the reserved zone.
+ * @param flags
+ * The flags parameter is used to request memzones to be
+ * taken from specifically sized hugepages.
+ * - RTE_MEMZONE_2MB - Reserved from 2MB pages
+ * - RTE_MEMZONE_1GB - Reserved from 1GB pages
+ * - RTE_MEMZONE_16MB - Reserved from 16MB pages
+ * - RTE_MEMZONE_16GB - Reserved from 16GB pages
+ * - RTE_MEMZONE_256KB - Reserved from 256KB pages
+ * - RTE_MEMZONE_256MB - Reserved from 256MB pages
+ * - RTE_MEMZONE_512MB - Reserved from 512MB pages
+ * - RTE_MEMZONE_4GB - Reserved from 4GB pages
+ * - RTE_MEMZONE_SIZE_HINT_ONLY - Allow alternative page size to be used if
+ * the requested page size is unavailable.
+ * If this flag is not set, the function
+ * will return error on an unavailable size
+ * request.
+ * @param align
+ * Alignment for resulting memzone. Must be a power of 2.
+ * @param bound
+ * Boundary for resulting memzone. Must be a power of 2 or zero.
+ * Zero value implies no boundary condition.
+ * @return
+ * A pointer to a correctly-filled read-only memzone descriptor, or NULL
+ * on error.
+ * On error case, rte_errno will be set appropriately:
+ * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure
+ * - E_RTE_SECONDARY - function was called from a secondary process instance
+ * - ENOSPC - the maximum number of memzones has already been allocated
+ * - EEXIST - a memzone with the same name already exists
+ * - ENOMEM - no appropriate memory area found in which to create memzone
+ * - EINVAL - invalid parameters
+ */
+const struct rte_memzone *rte_memzone_reserve_bounded_contig(const char *name,
+ size_t len, int socket_id, unsigned int flags,
+ unsigned int align, unsigned int bound);
+
+/**
* Free a memzone.
*
* @param mz
--
2.7.4
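
A usage sketch for the new _contig variants declared above, e.g. for a driver
that needs an IOVA-contiguous ring (the zone name, size and flags are made up
for illustration):

#include <rte_memory.h>
#include <rte_memzone.h>

static const struct rte_memzone *
reserve_hw_ring(void)
{
	/* 64KB, IOVA-contiguous, default cache-line alignment */
	return rte_memzone_reserve_contig("hw_ring_0", 64 * 1024,
			SOCKET_ID_ANY,
			RTE_MEMZONE_2MB | RTE_MEMZONE_SIZE_HINT_ONLY);
}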
Anatoly Burakov
2018-03-03 13:46:18 UTC
Permalink
Also, rewrite VFIO to rely on memory callbacks instead of manually
registering memory with VFIO. Callbacks will only be registered if
VFIO is enabled.

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/malloc_heap.c | 21 +++++++++++++++++
lib/librte_eal/linuxapp/eal/eal_memalloc.c | 37 +++++++++++++++++++++---------
lib/librte_eal/linuxapp/eal/eal_vfio.c | 35 ++++++++++++++++++++++++++++
3 files changed, 82 insertions(+), 11 deletions(-)

diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 9109555..9d055c8 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -223,6 +223,7 @@ try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz,
void *map_addr;
size_t map_len;
int n_pages;
+ bool callback_triggered = false;

map_len = RTE_ALIGN_CEIL(align + elt_size +
MALLOC_ELEM_TRAILER_LEN, pg_sz);
@@ -242,14 +243,25 @@ try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz,

map_addr = ms[0]->addr;

+ /* notify user about changes in memory map */
+ eal_memalloc_notify(RTE_MEM_EVENT_ALLOC, map_addr, map_len);
+
/* notify other processes that this has happened */
if (request_sync()) {
/* we couldn't ensure all processes have mapped memory,
* so free it back and notify everyone that it's been
* freed back.
+ *
+ * technically, we could've avoided adding memory addresses to
+ * the map, but that would've led to inconsistent behavior
+ * between primary and secondary processes, as those get
+ * callbacks during sync. therefore, force primary process to
+ * do alloc-and-rollback syncs as well.
*/
+ callback_triggered = true;
goto free_elem;
}
+
heap->total_size += map_len;

RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
@@ -260,6 +272,9 @@ try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz,
return 0;

free_elem:
+ if (callback_triggered)
+ eal_memalloc_notify(RTE_MEM_EVENT_FREE, map_addr, map_len);
+
rollback_expand_heap(ms, n_pages, elem, map_addr, map_len);

request_sync();
@@ -615,6 +630,10 @@ malloc_heap_free(struct malloc_elem *elem)
heap->total_size -= n_pages * msl->hugepage_sz;

if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ /* notify user about changes in memory map */
+ eal_memalloc_notify(RTE_MEM_EVENT_FREE,
+ aligned_start, aligned_len);
+
/* don't care if any of this fails */
malloc_heap_free_pages(aligned_start, aligned_len);

@@ -637,6 +656,8 @@ malloc_heap_free(struct malloc_elem *elem)
* already removed from the heap, so it is, for all intents and
* purposes, hidden from the rest of DPDK even if some other
* process (including this one) may have these pages mapped.
+ *
+ * notifications about deallocated memory happen during sync.
*/
request_to_primary(&req);
}
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index 227d703..1008fae 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -34,7 +34,6 @@
#include <rte_eal.h>
#include <rte_memory.h>
#include <rte_spinlock.h>
-#include <rte_vfio.h>

#include "eal_filesystem.h"
#include "eal_internal_cfg.h"
@@ -480,10 +479,6 @@ alloc_page(struct rte_memseg *ms, void *addr, uint64_t size, int socket_id,
ms->iova = iova;
ms->socket_id = socket_id;

- /* map the segment so that VFIO has access to it */
- if (rte_eal_iova_mode() == RTE_IOVA_VA &&
- rte_vfio_dma_map(ms->addr_64, iova, size))
- RTE_LOG(DEBUG, EAL, "Cannot register segment with VFIO\n");
return 0;

mapped:
@@ -515,12 +510,6 @@ free_page(struct rte_memseg *ms, struct hugepage_info *hi,
char path[PATH_MAX];
int fd, ret;

- /* unmap the segment from VFIO */
- if (rte_eal_iova_mode() == RTE_IOVA_VA &&
- rte_vfio_dma_unmap(ms->addr_64, ms->iova, ms->len)) {
- RTE_LOG(DEBUG, EAL, "Cannot unregister segment with VFIO\n");
- }
-
if (mmap(ms->addr, ms->hugepage_sz, PROT_READ,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
MAP_FAILED) {
@@ -808,6 +797,19 @@ sync_chunk(struct rte_memseg_list *primary_msl,

diff_len = RTE_MIN(chunk_len, diff_len);

+ /* if we are freeing memory, notify the application */
+ if (!used) {
+ struct rte_memseg *ms;
+ void *start_va;
+ size_t len;
+
+ ms = rte_fbarray_get(l_arr, start);
+ start_va = ms->addr;
+ len = ms->len * diff_len;
+
+ eal_memalloc_notify(RTE_MEM_EVENT_FREE, start_va, len);
+ }
+
for (i = 0; i < diff_len; i++) {
struct rte_memseg *p_ms, *l_ms;
int seg_idx = start + i;
@@ -834,6 +836,19 @@ sync_chunk(struct rte_memseg_list *primary_msl,
}
}

+ /* if we just allocated memory, notify the application */
+ if (used) {
+ struct rte_memseg *ms;
+ void *start_va;
+ size_t len;
+
+ ms = rte_fbarray_get(l_arr, start);
+ start_va = ms->addr;
+ len = ms->len * diff_len;
+
+ eal_memalloc_notify(RTE_MEM_EVENT_ALLOC, start_va, len);
+ }
+
/* calculate how much we can advance until next chunk */
diff_len = used ?
rte_fbarray_find_contig_used(l_arr, start) :
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 8fe8984..d3c3b70 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -214,6 +214,37 @@ vfio_group_device_count(int vfio_group_fd)
return vfio_cfg.vfio_groups[i].devices;
}

+static void
+vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len)
+{
+ struct rte_memseg_list *msl;
+ struct rte_memseg *ms;
+ size_t cur_len = 0;
+ uint64_t pgsz;
+
+ msl = rte_mem_virt2memseg_list(addr);
+ pgsz = msl->hugepage_sz;
+
+ while (cur_len < len) {
+ const void *va = RTE_PTR_ADD(addr, cur_len);
+ uint64_t vfio_va, iova;
+
+ ms = rte_mem_virt2memseg(va, msl);
+ vfio_va = (uint64_t) (uintptr_t) va;
+ iova = ms->iova;
+
+ /* this never gets called in legacy mode, so we can be sure that
+ * each segment is a single page.
+ */
+ if (type == RTE_MEM_EVENT_ALLOC)
+ rte_vfio_dma_map(vfio_va, iova, pgsz);
+ else
+ rte_vfio_dma_unmap(vfio_va, iova, pgsz);
+
+ cur_len += pgsz;
+ }
+}
+
int
rte_vfio_clear_group(int vfio_group_fd)
{
@@ -507,6 +538,10 @@ rte_vfio_enable(const char *modname)
if (vfio_cfg.vfio_container_fd != -1) {
RTE_LOG(NOTICE, EAL, "VFIO support initialized\n");
vfio_cfg.vfio_enabled = 1;
+
+ /* register callback for mem events */
+ rte_mem_event_register_callback("vfio_mem_event_clb",
+ vfio_mem_event_callback);
} else {
RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n");
}
--
2.7.4
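
Outside of VFIO, an application or driver could hook the same notifications.
A sketch, assuming the rte_mem_event_register_callback() / rte_mem_event API
introduced earlier in this series (the callback name and header placement are
illustrative):

#include <stdio.h>
#include <stddef.h>
#include <rte_memory.h>

/* same signature as vfio_mem_event_callback() in the patch above */
static void
app_mem_event_cb(enum rte_mem_event type, const void *addr, size_t len)
{
	printf("%s: %p, len %zu\n",
		type == RTE_MEM_EVENT_ALLOC ? "alloc" : "free", addr, len);
}

static void
app_register_mem_cb(void)
{
	rte_mem_event_register_callback("app_mem_event_cb",
			app_mem_event_cb);
}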
Anatoly Burakov
2018-03-03 13:46:08 UTC
Permalink
No major changes, just add some checks in a few key places, and
a new parameter to pass around.

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/eal_common_memzone.c | 20 +++---
lib/librte_eal/common/malloc_elem.c | 101 ++++++++++++++++++++++-------
lib/librte_eal/common/malloc_elem.h | 4 +-
lib/librte_eal/common/malloc_heap.c | 57 ++++++++++------
lib/librte_eal/common/malloc_heap.h | 4 +-
lib/librte_eal/common/rte_malloc.c | 6 +-
6 files changed, 134 insertions(+), 58 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index 718dee8..75c7dd9 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -98,7 +98,8 @@ find_heap_max_free_elem(int *s, unsigned align)

static const struct rte_memzone *
memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
- int socket_id, unsigned flags, unsigned align, unsigned bound)
+ int socket_id, unsigned int flags, unsigned int align,
+ unsigned int bound, bool contig)
{
struct rte_memzone *mz;
struct rte_mem_config *mcfg;
@@ -182,7 +183,7 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,

/* allocate memory on heap */
void *mz_addr = malloc_heap_alloc(NULL, requested_len, socket_id, flags,
- align, bound);
+ align, bound, contig);

if (mz_addr == NULL) {
rte_errno = ENOMEM;
@@ -215,9 +216,9 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
}

static const struct rte_memzone *
-rte_memzone_reserve_thread_safe(const char *name, size_t len,
- int socket_id, unsigned flags, unsigned align,
- unsigned bound)
+rte_memzone_reserve_thread_safe(const char *name, size_t len, int socket_id,
+ unsigned int flags, unsigned int align, unsigned int bound,
+ bool contig)
{
struct rte_mem_config *mcfg;
const struct rte_memzone *mz = NULL;
@@ -228,7 +229,7 @@ rte_memzone_reserve_thread_safe(const char *name, size_t len,
rte_rwlock_write_lock(&mcfg->mlock);

mz = memzone_reserve_aligned_thread_unsafe(
- name, len, socket_id, flags, align, bound);
+ name, len, socket_id, flags, align, bound, contig);

rte_rwlock_write_unlock(&mcfg->mlock);

@@ -245,7 +246,7 @@ rte_memzone_reserve_bounded(const char *name, size_t len, int socket_id,
unsigned flags, unsigned align, unsigned bound)
{
return rte_memzone_reserve_thread_safe(name, len, socket_id, flags,
- align, bound);
+ align, bound, false);
}

/*
@@ -257,7 +258,7 @@ rte_memzone_reserve_aligned(const char *name, size_t len, int socket_id,
unsigned flags, unsigned align)
{
return rte_memzone_reserve_thread_safe(name, len, socket_id, flags,
- align, 0);
+ align, 0, false);
}

/*
@@ -269,7 +270,8 @@ rte_memzone_reserve(const char *name, size_t len, int socket_id,
unsigned flags)
{
return rte_memzone_reserve_thread_safe(name, len, socket_id,
- flags, RTE_CACHE_LINE_SIZE, 0);
+ flags, RTE_CACHE_LINE_SIZE, 0,
+ false);
}

int
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index eabad66..d2dba35 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -17,6 +17,7 @@
#include <rte_common.h>
#include <rte_spinlock.h>

+#include "eal_memalloc.h"
#include "malloc_elem.h"
#include "malloc_heap.h"

@@ -94,33 +95,88 @@ malloc_elem_insert(struct malloc_elem *elem)
}

/*
+ * Attempt to find enough physically contiguous memory in this block to store
+ * our data. Assume that element has at least enough space to fit in the data,
+ * so we just check the page addresses.
+ */
+static bool
+elem_check_phys_contig(struct rte_memseg_list *msl, void *start, size_t size)
+{
+ uint64_t page_sz;
+ void *aligned_start, *end, *aligned_end;
+ size_t aligned_len;
+
+ /* figure out how many pages we need to fit in current data */
+ page_sz = msl->hugepage_sz;
+ aligned_start = RTE_PTR_ALIGN_FLOOR(start, page_sz);
+ end = RTE_PTR_ADD(start, size);
+ aligned_end = RTE_PTR_ALIGN_CEIL(end, page_sz);
+
+ aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start);
+
+ return eal_memalloc_is_contig(msl, aligned_start, aligned_len);
+}
+
+/*
* calculate the starting point of where data of the requested size
* and alignment would fit in the current element. If the data doesn't
* fit, return NULL.
*/
static void *
elem_start_pt(struct malloc_elem *elem, size_t size, unsigned align,
- size_t bound)
+ size_t bound, bool contig)
{
- const size_t bmask = ~(bound - 1);
- uintptr_t end_pt = (uintptr_t)elem +
- elem->size - MALLOC_ELEM_TRAILER_LEN;
- uintptr_t new_data_start = RTE_ALIGN_FLOOR((end_pt - size), align);
- uintptr_t new_elem_start;
-
- /* check boundary */
- if ((new_data_start & bmask) != ((end_pt - 1) & bmask)) {
- end_pt = RTE_ALIGN_FLOOR(end_pt, bound);
- new_data_start = RTE_ALIGN_FLOOR((end_pt - size), align);
- end_pt = new_data_start + size;
- if (((end_pt - 1) & bmask) != (new_data_start & bmask))
- return NULL;
- }
+ size_t elem_size = elem->size;

- new_elem_start = new_data_start - MALLOC_ELEM_HEADER_LEN;
+ /*
+ * we're allocating from the end, so adjust the size of element by page
+ * size each time
+ */
+ while (elem_size >= size) {
+ const size_t bmask = ~(bound - 1);
+ uintptr_t end_pt = (uintptr_t)elem +
+ elem_size - MALLOC_ELEM_TRAILER_LEN;
+ uintptr_t new_data_start = RTE_ALIGN_FLOOR((end_pt - size),
+ align);
+ uintptr_t new_elem_start;
+
+ /* check boundary */
+ if ((new_data_start & bmask) != ((end_pt - 1) & bmask)) {
+ end_pt = RTE_ALIGN_FLOOR(end_pt, bound);
+ new_data_start = RTE_ALIGN_FLOOR((end_pt - size),
+ align);
+ end_pt = new_data_start + size;
+
+ if (((end_pt - 1) & bmask) != (new_data_start & bmask))
+ return NULL;
+ }

- /* if the new start point is before the exist start, it won't fit */
- return (new_elem_start < (uintptr_t)elem) ? NULL : (void *)new_elem_start;
+ new_elem_start = new_data_start - MALLOC_ELEM_HEADER_LEN;
+
+ /* if the new start point is before the exist start,
+ * it won't fit
+ */
+ if (new_elem_start < (uintptr_t)elem)
+ return NULL;
+
+ if (contig) {
+ size_t new_data_size = end_pt - new_data_start;
+
+ /*
+ * if physical contiguousness was requested and we
+ * couldn't fit all data into one physically contiguous
+ * block, try again with lower addresses.
+ */
+ if (!elem_check_phys_contig(elem->msl,
+ (void *) new_data_start,
+ new_data_size)) {
+ elem_size -= align;
+ continue;
+ }
+ }
+ return (void *) new_elem_start;
+ }
+ return NULL;
}

/*
@@ -129,9 +185,9 @@ elem_start_pt(struct malloc_elem *elem, size_t size, unsigned align,
*/
int
malloc_elem_can_hold(struct malloc_elem *elem, size_t size, unsigned align,
- size_t bound)
+ size_t bound, bool contig)
{
- return elem_start_pt(elem, size, align, bound) != NULL;
+ return elem_start_pt(elem, size, align, bound, contig) != NULL;
}

/*
@@ -259,9 +315,10 @@ malloc_elem_free_list_remove(struct malloc_elem *elem)
*/
struct malloc_elem *
malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align,
- size_t bound)
+ size_t bound, bool contig)
{
- struct malloc_elem *new_elem = elem_start_pt(elem, size, align, bound);
+ struct malloc_elem *new_elem = elem_start_pt(elem, size, align, bound,
+ contig);
const size_t old_elem_size = (uintptr_t)new_elem - (uintptr_t)elem;
const size_t trailer_size = elem->size - old_elem_size - size -
MALLOC_ELEM_OVERHEAD;
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 6d979d2..798472e 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -123,7 +123,7 @@ malloc_elem_insert(struct malloc_elem *elem);
*/
int
malloc_elem_can_hold(struct malloc_elem *elem, size_t size,
- unsigned align, size_t bound);
+ unsigned int align, size_t bound, bool contig);

/*
* reserve a block of data in an existing malloc_elem. If the malloc_elem
@@ -131,7 +131,7 @@ malloc_elem_can_hold(struct malloc_elem *elem, size_t size,
*/
struct malloc_elem *
malloc_elem_alloc(struct malloc_elem *elem, size_t size,
- unsigned align, size_t bound);
+ unsigned int align, size_t bound, bool contig);

/*
* free a malloc_elem block by adding it to the free list. If the
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 87dc9ad..984e027 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -94,7 +94,7 @@ malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl,
*/
static struct malloc_elem *
find_suitable_element(struct malloc_heap *heap, size_t size,
- unsigned flags, size_t align, size_t bound)
+ unsigned int flags, size_t align, size_t bound, bool contig)
{
size_t idx;
struct malloc_elem *elem, *alt_elem = NULL;
@@ -103,7 +103,8 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
idx < RTE_HEAP_NUM_FREELISTS; idx++) {
for (elem = LIST_FIRST(&heap->free_head[idx]);
!!elem; elem = LIST_NEXT(elem, free_list)) {
- if (malloc_elem_can_hold(elem, size, align, bound)) {
+ if (malloc_elem_can_hold(elem, size, align, bound,
+ contig)) {
if (check_hugepage_sz(flags,
elem->msl->hugepage_sz))
return elem;
@@ -127,16 +128,16 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
*/
static void *
heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,
- unsigned int flags, size_t align, size_t bound)
+ unsigned int flags, size_t align, size_t bound, bool contig)
{
struct malloc_elem *elem;

size = RTE_CACHE_LINE_ROUNDUP(size);
align = RTE_CACHE_LINE_ROUNDUP(align);

- elem = find_suitable_element(heap, size, flags, align, bound);
+ elem = find_suitable_element(heap, size, flags, align, bound, contig);
if (elem != NULL) {
- elem = malloc_elem_alloc(elem, size, align, bound);
+ elem = malloc_elem_alloc(elem, size, align, bound, contig);

/* increase heap's count of allocated elements */
heap->alloc_count++;
@@ -147,14 +148,15 @@ heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,

static int
try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
- int socket, unsigned int flags, size_t align, size_t bound)
+ int socket, unsigned int flags, size_t align, size_t bound,
+ bool contig)
{
+ size_t map_len, data_start_offset;
struct rte_memseg_list *msl;
struct rte_memseg **ms;
struct malloc_elem *elem;
- size_t map_len;
int i, n_pages, allocd_pages;
- void *ret, *map_addr;
+ void *ret, *map_addr, *data_start;

align = RTE_MAX(align, MALLOC_ELEM_HEADER_LEN);
map_len = RTE_ALIGN_CEIL(align + elt_size + MALLOC_ELEM_TRAILER_LEN,
@@ -175,11 +177,22 @@ try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
map_addr = ms[0]->addr;
msl = rte_mem_virt2memseg_list(map_addr);

+ /* check if we wanted contiguous memory but didn't get it */
+ data_start_offset = RTE_ALIGN(MALLOC_ELEM_HEADER_LEN, align);
+ data_start = RTE_PTR_ADD(ms[0]->addr, data_start_offset);
+ if (contig && !eal_memalloc_is_contig(msl, data_start,
+ n_pages * msl->hugepage_sz)) {
+ RTE_LOG(DEBUG, EAL, "%s(): couldn't allocate physically contiguous space\n",
+ __func__);
+ goto free_pages;
+ }
+
/* add newly minted memsegs to malloc heap */
elem = malloc_heap_add_memory(heap, msl, map_addr, map_len);

/* try once more, as now we have allocated new memory */
- ret = find_suitable_element(heap, elt_size, flags, align, bound);
+ ret = find_suitable_element(heap, elt_size, flags, align, bound,
+ contig);

if (ret == NULL)
goto free_elem;
@@ -196,6 +209,7 @@ try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
malloc_elem_hide_region(elem, map_addr, map_len);
heap->total_size -= map_len;

+free_pages:
for (i = 0; i < n_pages; i++)
eal_memalloc_free_page(ms[i]);
free_ms:
@@ -223,7 +237,7 @@ compare_pagesz(const void *a, const void *b)

static int
alloc_mem_on_socket(size_t size, int socket, unsigned int flags, size_t align,
- size_t bound)
+ size_t bound, bool contig)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
@@ -304,14 +318,14 @@ alloc_mem_on_socket(size_t size, int socket, unsigned int flags, size_t align,
* sizes first, before resorting to best effort allocation.
*/
if (!try_expand_heap(heap, pg_sz, size, socket, size_flags,
- align, bound))
+ align, bound, contig))
return 0;
}
if (n_other_pg_sz == 0)
return -1;

/* now, check if we can reserve anything with size hint */
- ret = find_suitable_element(heap, size, flags, align, bound);
+ ret = find_suitable_element(heap, size, flags, align, bound, contig);
if (ret != NULL)
return 0;

@@ -323,7 +337,7 @@ alloc_mem_on_socket(size_t size, int socket, unsigned int flags, size_t align,
uint64_t pg_sz = other_pg_sz[i];

if (!try_expand_heap(heap, pg_sz, size, socket, flags,
- align, bound))
+ align, bound, contig))
return 0;
}
return -1;
@@ -332,7 +346,7 @@ alloc_mem_on_socket(size_t size, int socket, unsigned int flags, size_t align,
/* this will try lower page sizes first */
static void *
heap_alloc_on_socket(const char *type, size_t size, int socket,
- unsigned int flags, size_t align, size_t bound)
+ unsigned int flags, size_t align, size_t bound, bool contig)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
@@ -345,7 +359,7 @@ heap_alloc_on_socket(const char *type, size_t size, int socket,

/* for legacy mode, try once and with all flags */
if (internal_config.legacy_mem) {
- ret = heap_alloc(heap, type, size, flags, align, bound);
+ ret = heap_alloc(heap, type, size, flags, align, bound, contig);
goto alloc_unlock;
}

@@ -354,12 +368,12 @@ heap_alloc_on_socket(const char *type, size_t size, int socket,
* we may still be able to allocate memory from appropriate page sizes,
* we just need to request more memory first.
*/
- ret = heap_alloc(heap, type, size, size_flags, align, bound);
+ ret = heap_alloc(heap, type, size, size_flags, align, bound, contig);
if (ret != NULL)
goto alloc_unlock;

- if (!alloc_mem_on_socket(size, socket, flags, align, bound)) {
- ret = heap_alloc(heap, type, size, flags, align, bound);
+ if (!alloc_mem_on_socket(size, socket, flags, align, bound, contig)) {
+ ret = heap_alloc(heap, type, size, flags, align, bound, contig);

/* this should have succeeded */
if (ret == NULL)
@@ -372,7 +386,7 @@ heap_alloc_on_socket(const char *type, size_t size, int socket,

void *
malloc_heap_alloc(const char *type, size_t size, int socket_arg,
- unsigned int flags, size_t align, size_t bound)
+ unsigned int flags, size_t align, size_t bound, bool contig)
{
int socket, i;
void *ret;
@@ -393,7 +407,8 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg,
if (socket >= RTE_MAX_NUMA_NODES)
return NULL;

- ret = heap_alloc_on_socket(type, size, socket, flags, align, bound);
+ ret = heap_alloc_on_socket(type, size, socket, flags, align, bound,
+ contig);
if (ret != NULL || socket_arg != SOCKET_ID_ANY)
return ret;

@@ -402,7 +417,7 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg,
if (i == socket)
continue;
ret = heap_alloc_on_socket(type, size, i, flags,
- align, bound);
+ align, bound, contig);
if (ret != NULL)
return ret;
}
diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h
index 292d578..03b8014 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -5,6 +5,8 @@
#ifndef MALLOC_HEAP_H_
#define MALLOC_HEAP_H_

+#include <stdbool.h>
+
#include <rte_malloc.h>
#include <rte_malloc_heap.h>

@@ -25,7 +27,7 @@ malloc_get_numa_socket(void)

void *
malloc_heap_alloc(const char *type, size_t size, int socket, unsigned int flags,
- size_t align, size_t bound);
+ size_t align, size_t bound, bool contig);

int
malloc_heap_free(struct malloc_elem *elem);
diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index b0fe11c..5cd92d1 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -37,7 +37,8 @@ void rte_free(void *addr)
* Allocate memory on specified heap.
*/
void *
-rte_malloc_socket(const char *type, size_t size, unsigned align, int socket_arg)
+rte_malloc_socket(const char *type, size_t size, unsigned int align,
+ int socket_arg)
{
/* return NULL if size is 0 or alignment is not power-of-2 */
if (size == 0 || (align && !rte_is_power_of_2(align)))
@@ -50,8 +51,7 @@ rte_malloc_socket(const char *type, size_t size, unsigned align, int socket_arg)
if (socket_arg >= RTE_MAX_NUMA_NODES)
return NULL;

- return malloc_heap_alloc(type, size, socket_arg, 0,
- align == 0 ? 1 : align, 0);
+ return malloc_heap_alloc(type, size, socket_arg, 0, align, 0, false);
}

/*
--
2.7.4
Anatoly Burakov
2018-03-03 13:46:15 UTC
Permalink
For legacy memory mode, attach to the primary process' memseg list
and map hugepages as before.

For non-legacy mode, preallocate all VA space and then synchronize
the local memory map with the primary process.
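
A condensed sketch of the non-legacy secondary flow described above
(roughly what memseg_secondary_init() and eal_hugepage_attach() below
end up doing, with stages from different points of EAL init compressed
into one function, error logging trimmed, and EAL-internal headers
assumed):

static int
secondary_memory_attach_sketch(struct rte_mem_config *mcfg)
{
	int msl_idx;

	for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[msl_idx];
		uint64_t mem_sz;

		/* skip memseg lists the primary never populated */
		if (msl->memseg_arr.len == 0)
			continue;

		/* attach to the page descriptor array of the primary */
		if (rte_fbarray_attach(&msl->memseg_arr) != 0)
			return -1;

		/* reserve the same VA range the primary is using */
		mem_sz = msl->hugepage_sz * msl->memseg_arr.len;
		if (eal_get_virtual_area(msl->base_va, &mem_sz,
				msl->hugepage_sz, 0, 0) == NULL)
			return -1;
	}
	/* finally, map the pages the primary has already allocated */
	return eal_memalloc_sync_with_primary();
}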

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/bsdapp/eal/eal_hugepage_info.c | 7 ++
lib/librte_eal/common/eal_common_memory.c | 99 +++++++++++++++++++++----
lib/librte_eal/common/eal_hugepages.h | 5 ++
lib/librte_eal/linuxapp/eal/eal.c | 18 +++--
lib/librte_eal/linuxapp/eal/eal_hugepage_info.c | 53 ++++++++-----
lib/librte_eal/linuxapp/eal/eal_memory.c | 24 ++++--
6 files changed, 159 insertions(+), 47 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c b/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c
index be2dbf0..18e6e5e 100644
--- a/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c
@@ -103,3 +103,10 @@ eal_hugepage_info_init(void)

return 0;
}
+
+/* memory hotplug is not supported in FreeBSD, so no need to implement this */
+int
+eal_hugepage_info_read(void)
+{
+ return 0;
+}
diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index 457e239..a571e24 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -20,6 +20,7 @@
#include <rte_errno.h>
#include <rte_log.h>

+#include "eal_memalloc.h"
#include "eal_private.h"
#include "eal_internal_cfg.h"

@@ -147,19 +148,11 @@ alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
char name[RTE_FBARRAY_NAME_LEN];
int max_pages;
uint64_t mem_amount;
- void *addr;

if (!internal_config.legacy_mem) {
mem_amount = get_mem_amount(page_sz);
max_pages = mem_amount / page_sz;
-
- addr = eal_get_virtual_area(NULL, &mem_amount, page_sz, 0, 0);
- if (addr == NULL) {
- RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
- return -1;
- }
} else {
- addr = NULL;
/* numer of memsegs in each list, these will not be single-page
* segments, so RTE_MAX_LEGACY_MEMSEG is like old default.
*/
@@ -177,7 +170,7 @@ alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,

msl->hugepage_sz = page_sz;
msl->socket_id = socket_id;
- msl->base_va = addr;
+ msl->base_va = NULL;

RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n",
page_sz >> 10, socket_id);
@@ -186,16 +179,46 @@ alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
}

static int
-memseg_init(void)
+alloc_va_space(struct rte_memseg_list *msl)
+{
+ uint64_t mem_sz, page_sz;
+ void *addr;
+ int flags = 0;
+
+#ifdef RTE_ARCH_PPC_64
+ flags |= MAP_HUGETLB;
+#endif
+
+ page_sz = msl->hugepage_sz;
+ mem_sz = page_sz * msl->memseg_arr.len;
+
+ addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags);
+ if (addr == NULL) {
+ if (rte_errno == EADDRNOTAVAIL)
+ RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n",
+ (unsigned long long)mem_sz, msl->base_va);
+ else
+ RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
+ return -1;
+ }
+ msl->base_va = addr;
+
+ return 0;
+}
+
+
+static int
+memseg_primary_init(void)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
int socket_id, hpi_idx, msl_idx = 0;
struct rte_memseg_list *msl;

- if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
- RTE_LOG(ERR, EAL, "Secondary process not supported\n");
- return -1;
- }
+ /* if we start allocating memory segments for pages straight away, VA
+ * space will become fragmented, reducing chances of success when
+ * secondary process maps the same addresses. to fix this, allocate
+ * fbarrays first, and then allocate VA space for them.
+ */

/* create memseg lists */
for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;
@@ -235,12 +258,55 @@ memseg_init(void)
total_segs += msl->memseg_arr.len;
total_mem = total_segs * msl->hugepage_sz;
type_msl_idx++;
+
+ /* no need to preallocate VA in legacy mode */
+ if (internal_config.legacy_mem)
+ continue;
+
+ if (alloc_va_space(msl)) {
+ RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
+ return -1;
+ }
}
}
}
return 0;
}

+static int
+memseg_secondary_init(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int msl_idx = 0;
+ struct rte_memseg_list *msl;
+
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+
+ msl = &mcfg->memsegs[msl_idx];
+
+ /* skip empty memseg lists */
+ if (msl->memseg_arr.len == 0)
+ continue;
+
+ if (rte_fbarray_attach(&msl->memseg_arr)) {
+ RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n");
+ return -1;
+ }
+
+ /* no need to preallocate VA space in legacy mode */
+ if (internal_config.legacy_mem)
+ continue;
+
+ /* preallocate VA space */
+ if (alloc_va_space(msl)) {
+ RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n");
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
static struct rte_memseg *
virt2memseg(const void *addr, const struct rte_memseg_list *msl)
{
@@ -480,7 +546,10 @@ rte_eal_memory_init(void)
int retval;
RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n");

- retval = memseg_init();
+ retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
+ memseg_primary_init() :
+ memseg_secondary_init();
+
if (retval < 0)
return -1;

diff --git a/lib/librte_eal/common/eal_hugepages.h b/lib/librte_eal/common/eal_hugepages.h
index f963ae5..38d0b04 100644
--- a/lib/librte_eal/common/eal_hugepages.h
+++ b/lib/librte_eal/common/eal_hugepages.h
@@ -34,4 +34,9 @@ struct hugepage_file {
*/
int eal_hugepage_info_init(void);

+/**
+ * Read information about hugepages on Linux, but don't clear them out.
+ */
+int eal_hugepage_info_read(void);
+
#endif /* EAL_HUGEPAGES_H */
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index d336c96..7a0d742 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -805,13 +805,17 @@ rte_eal_init(int argc, char **argv)
"KNI module inserted\n");
}

- if (internal_config.no_hugetlbfs == 0 &&
- internal_config.process_type != RTE_PROC_SECONDARY &&
- eal_hugepage_info_init() < 0) {
- rte_eal_init_alert("Cannot get hugepage information.");
- rte_errno = EACCES;
- rte_atomic32_clear(&run_once);
- return -1;
+ if (internal_config.no_hugetlbfs == 0) {
+ /* rte_config isn't initialized yet */
+ ret = internal_config.process_type == RTE_PROC_PRIMARY ?
+ eal_hugepage_info_init() :
+ eal_hugepage_info_read();
+ if (ret < 0) {
+ rte_eal_init_alert("Cannot get hugepage information.");
+ rte_errno = EACCES;
+ rte_atomic32_clear(&run_once);
+ return -1;
+ }
}

if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 7e2475f..7a4adce 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -6,6 +6,7 @@
#include <sys/types.h>
#include <sys/file.h>
#include <dirent.h>
+#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
@@ -299,15 +300,9 @@ compare_hpi(const void *a, const void *b)
return hpi_b->hugepage_sz - hpi_a->hugepage_sz;
}

-/*
- * when we initialize the hugepage info, everything goes
- * to socket 0 by default. it will later get sorted by memory
- * initialization procedure.
- */
-int
-eal_hugepage_info_init(void)
-{
- const char dirent_start_text[] = "hugepages-";
+static int
+hugepage_info_init(bool clear_hugepages)
+{
+ const char dirent_start_text[] = "hugepages-";
const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
unsigned int i, total_pages, num_sizes = 0;
DIR *dir;
@@ -350,18 +345,20 @@ eal_hugepage_info_init(void)
continue;
}

- /* try to obtain a writelock */
- hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY);
+ if (clear_hugepages) {
+ /* try to obtain a writelock */
+ hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY);

- /* if blocking lock failed */
- if (flock(hpi->lock_descriptor, LOCK_EX) == -1) {
- RTE_LOG(CRIT, EAL,
- "Failed to lock hugepage directory!\n");
- break;
+ /* if blocking lock failed */
+ if (flock(hpi->lock_descriptor, LOCK_EX) == -1) {
+ RTE_LOG(CRIT, EAL,
+ "Failed to lock hugepage directory!\n");
+ break;
+ }
+ /* clear out the hugepages dir from unused pages */
+ if (clear_hugedir(hpi->hugedir) == -1)
+ break;
}
- /* clear out the hugepages dir from unused pages */
- if (clear_hugedir(hpi->hugedir) == -1)
- break;

/*
* first, try to put all hugepages into relevant sockets, but
@@ -417,10 +414,26 @@ eal_hugepage_info_init(void)
num_pages += hpi->num_pages[j];
}
if (internal_config.hugepage_info[i].hugedir != NULL &&
- num_pages > 0)
+ (num_pages > 0 || !clear_hugepages))
return 0;
}

/* no valid hugepage mounts available, return error */
return -1;
}
+
+int eal_hugepage_info_read(void)
+{
+ return hugepage_info_init(false);
+}
+
+/*
+ * when we initialize the hugepage info, everything goes
+ * to socket 0 by default. it will later get sorted by memory
+ * initialization procedure.
+ */
+int
+eal_hugepage_info_init(void)
+{
+ return hugepage_info_init(true);
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index e0b4988..f74291f 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -1569,6 +1569,22 @@ eal_legacy_hugepage_attach(void)
return -1;
}

+static int
+eal_hugepage_attach(void)
+{
+ if (eal_memalloc_sync_with_primary()) {
+ RTE_LOG(ERR, EAL, "Could not map memory from primary process\n");
+ if (aslr_enabled() > 0) {
+ RTE_LOG(ERR, EAL, "It is recommended to "
+ "disable ASLR in the kernel "
+ "and retry running both primary "
+ "and secondary processes\n");
+ }
+ return -1;
+ }
+ return 0;
+}
+
int
rte_eal_hugepage_init(void)
{
@@ -1580,11 +1596,9 @@ rte_eal_hugepage_init(void)
int
rte_eal_hugepage_attach(void)
{
- if (internal_config.legacy_mem)
- return eal_legacy_hugepage_attach();
- else
- RTE_LOG(ERR, EAL, "Secondary processes aren't supported yet\n");
- return -1;
+ return internal_config.legacy_mem ?
+ eal_legacy_hugepage_attach() :
+ eal_hugepage_attach();
}

int
--
2.7.4
Anatoly Burakov
2018-03-03 13:46:16 UTC
Permalink
This enables multiprocess synchronization for memory hotplug
requests at runtime (as opposed to initialization).

The basic workflow is as follows. The primary process always does the
initial mapping and unmapping, and secondary processes always follow
the primary's page map. Only one allocation request can be active at
any one time.

When the primary allocates memory, it ensures that all other processes
have mapped the same set of hugepages successfully; otherwise, any
allocations made are rolled back and the memory is freed back. The
heap is locked throughout the process, so no race conditions can
happen.

When the primary frees memory, it removes the area from the heap,
deallocates the affected pages, and notifies other processes of the
deallocation. Since the heap no longer references that memory chunk,
the area becomes invisible to other processes even if they happen to
fail to unmap that specific set of pages, so it is completely safe to
ignore the results of sync requests.

When a secondary allocates memory, it does not do so by itself.
Instead, it sends a request to the primary process to try to allocate
pages of the specified size on the specified socket, such that the
heap allocation request in question could complete. The primary
process then sends all secondaries (including the requestor) a
separate notification of the allocated pages, and expects all
secondary processes to report success before considering the pages
as "allocated".

Only after the primary process ensures that the memory has been
successfully allocated in all secondary processes will it respond
positively to the initial request and let the secondary proceed with
the allocation. Since the heap now has memory that can satisfy the
allocation request, and it was locked all this time (so no other
allocations could take place), the secondary process will be able to
allocate memory from the heap.
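
A condensed view of this secondary-side allocation path, using the
malloc_mp_req layout and the request_to_primary() helper introduced
further down in this patch (it is essentially what
try_expand_heap_secondary() below does, shown here for reference):

static int
secondary_request_alloc_sketch(struct malloc_heap *heap, uint64_t pg_sz,
		size_t elt_size, int socket, unsigned int flags,
		size_t align, size_t bound, bool contig)
{
	struct malloc_mp_req req;

	req.t = REQ_TYPE_ALLOC;
	req.alloc_req.heap = heap; /* the heap lives in shared memory */
	req.alloc_req.page_sz = pg_sz;
	req.alloc_req.elt_size = elt_size;
	req.alloc_req.socket = socket;
	req.alloc_req.flags = flags;
	req.alloc_req.align = align;
	req.alloc_req.bound = bound;
	req.alloc_req.contig = contig;

	/* blocks until the primary confirms that every process has
	 * mapped the new pages, or until the request times out
	 */
	if (request_to_primary(&req) != 0)
		return -1;
	if (req.result != REQ_RESULT_SUCCESS)
		return -1;

	/* the still-locked heap now has enough free space to satisfy
	 * the original allocation request
	 */
	return 0;
}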

When a secondary frees memory, it first hides the pages to be
deallocated from the heap. It then sends a deallocation request to
the primary process, which deallocates the pages itself and sends a
separate sync request to all other processes (including the
requestor) to unmap the same pages. This way, even if the secondary
fails to notify other processes of this deallocation, that memory
becomes invisible to other processes and will not be allocated from
again.
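
The matching secondary-side free path, condensed from the
malloc_heap_free() changes below; by the time this runs, the element
has already been hidden from the shared heap:

static void
secondary_request_free_sketch(void *aligned_start, size_t aligned_len)
{
	struct malloc_mp_req req;

	req.t = REQ_TYPE_FREE;
	req.free_req.addr = aligned_start;
	req.free_req.len = aligned_len;

	/* best effort only: even if this never reaches the primary,
	 * the memory is already gone from the shared heap and cannot
	 * be handed out again
	 */
	request_to_primary(&req);
}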

So, to summarize: address space will only become part of the heap
if the primary process can ensure that all other processes have
mapped this memory successfully. If anything goes wrong, the worst
thing that could happen is that a page will "leak" and will not be
available to either DPDK or the system, as some process will still
hold onto it. It's not an actual leak, as we can account for the
page - it's just that none of the processes will be able to use
this page for anything useful until the primary allocates it again.

Due to the underlying DPDK IPC implementation being single-threaded,
some asynchronous magic had to be done, as we need to complete
several requests before we can definitively allow the secondary
process to use the allocated memory (namely, it has to be present in
all other secondary processes before it can be used). Additionally,
only one allocation request is allowed to be submitted at once.

Memory allocation requests are only allowed when there are no
secondary processes currently initializing. To enforce that, a
shared rwlock is used: it is taken for reading on init (so that
several secondaries can initialize concurrently) and for writing
when making allocation requests (so that either secondary init has
to wait, or the allocation request has to wait until all processes
have initialized).
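
The gating pattern described above, shown in isolation (a sketch
using the memory_hotplug_lock field added to rte_mem_config in this
patch; assumes <rte_rwlock.h> and <rte_eal_memconfig.h>):

/* init path: taken for reading, so several secondaries may run it
 * concurrently
 */
static void
init_gate_sketch(struct rte_mem_config *mcfg)
{
	rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
	/* ... rest of EAL initialization ... */
	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
}

/* hotplug path: taken for writing, waits until no process is in the
 * middle of init (and blocks new init while it runs)
 */
static int
hotplug_gate_sketch(struct rte_mem_config *mcfg)
{
	int ret = 0;

	rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);
	/* ... allocate or free pages here ... */
	rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
	return ret;
}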

To reduce the possibility of not releasing the lock on init failure,
replace all rte_panic's with an init alert followed by a return of -1.

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
This problem is evidently complex to solve without a multithreaded
IPC implementation. An alternative approach would be to process
each individual message in its own thread (or at least spawn a
thread per incoming request) - that way, we can send requests
while responding to another request, and this problem becomes
trivial to solve (and in fact it was solved that way initially,
before my aversion to certain other programming languages kicked
in).

Is the added complexity worth saving a couple of thread spin-ups
here and there?

lib/librte_eal/bsdapp/eal/Makefile | 1 +
lib/librte_eal/common/include/rte_eal_memconfig.h | 3 +
lib/librte_eal/common/malloc_heap.c | 250 ++++++--
lib/librte_eal/common/malloc_mp.c | 723 ++++++++++++++++++++++
lib/librte_eal/common/malloc_mp.h | 86 +++
lib/librte_eal/common/meson.build | 1 +
lib/librte_eal/linuxapp/eal/Makefile | 1 +
lib/librte_eal/linuxapp/eal/eal.c | 50 +-
8 files changed, 1054 insertions(+), 61 deletions(-)
create mode 100644 lib/librte_eal/common/malloc_mp.c
create mode 100644 lib/librte_eal/common/malloc_mp.h

diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile
index 907e30d..250d5c1 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -59,6 +59,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_fbarray.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_malloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_elem.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_heap.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_mp.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_keepalive.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_service.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_reciprocal.c
diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
index d653d57..c4b36f6 100644
--- a/lib/librte_eal/common/include/rte_eal_memconfig.h
+++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
@@ -60,6 +60,9 @@ struct rte_mem_config {
rte_rwlock_t qlock; /**< used for tailq operation for thread safe. */
rte_rwlock_t mplock; /**< only used by mempool LIB for thread-safe. */

+ rte_rwlock_t memory_hotplug_lock;
+ /**< indicates whether memory hotplug request is in progress. */
+
/* memory segments and zones */
struct rte_fbarray memzones; /**< Memzone descriptors. */

diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 7a3d0f3..9109555 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -10,6 +10,7 @@
#include <sys/queue.h>

#include <rte_memory.h>
+#include <rte_errno.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_launch.h>
@@ -26,6 +27,7 @@
#include "eal_memalloc.h"
#include "malloc_elem.h"
#include "malloc_heap.h"
+#include "malloc_mp.h"

static unsigned
check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
@@ -81,8 +83,6 @@ malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl,

malloc_elem_free_list_insert(elem);

- heap->total_size += len;
-
return elem;
}

@@ -146,33 +146,42 @@ heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,
return elem == NULL ? NULL : (void *)(&elem[1]);
}

-static int
-try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
+/* this function is exposed in malloc_mp.h */
+void
+rollback_expand_heap(struct rte_memseg **ms, int n_pages,
+ struct malloc_elem *elem, void *map_addr, size_t map_len)
+{
+ int i;
+
+ if (elem != NULL) {
+ malloc_elem_free_list_remove(elem);
+ malloc_elem_hide_region(elem, map_addr, map_len);
+ }
+
+ for (i = 0; i < n_pages; i++)
+ eal_memalloc_free_page(ms[i]);
+}
+
+/* this function is exposed in malloc_mp.h */
+struct malloc_elem *
+alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
int socket, unsigned int flags, size_t align, size_t bound,
- bool contig)
+ bool contig, struct rte_memseg **ms, int n_pages)
{
size_t map_len, data_start_offset;
struct rte_memseg_list *msl;
- struct rte_memseg **ms;
- struct malloc_elem *elem;
- int i, n_pages, allocd_pages;
+ struct malloc_elem *elem = NULL;
+ int allocd_pages;
void *ret, *map_addr, *data_start;

- align = RTE_MAX(align, MALLOC_ELEM_HEADER_LEN);
- map_len = RTE_ALIGN_CEIL(align + elt_size + MALLOC_ELEM_TRAILER_LEN,
- pg_sz);
-
- n_pages = map_len / pg_sz;
+ map_len = n_pages * pg_sz;

- /* we can't know in advance how many pages we'll need, so malloc */
- ms = malloc(sizeof(*ms) * n_pages);
-
- allocd_pages = eal_memalloc_alloc_page_bulk(ms, n_pages, pg_sz, socket,
- true);
+ allocd_pages = eal_memalloc_alloc_page_bulk(ms, n_pages, pg_sz,
+ socket, true);

/* make sure we've allocated our pages... */
if (allocd_pages != n_pages)
- goto free_ms;
+ return NULL;

map_addr = ms[0]->addr;
msl = rte_mem_virt2memseg_list(map_addr);
@@ -184,7 +193,7 @@ try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
n_pages * msl->hugepage_sz)) {
RTE_LOG(DEBUG, EAL, "%s(): couldn't allocate physically contiguous space\n",
__func__);
- goto free_pages;
+ goto fail;
}

/* add newly minted memsegs to malloc heap */
@@ -195,7 +204,53 @@ try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
contig);

if (ret == NULL)
+ goto fail;
+
+ return elem;
+
+fail:
+ rollback_expand_heap(ms, n_pages, elem, map_addr, map_len);
+ return NULL;
+}
+
+static int
+try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz,
+ size_t elt_size, int socket, unsigned int flags, size_t align,
+ size_t bound, bool contig)
+{
+ struct malloc_elem *elem;
+ struct rte_memseg **ms;
+ void *map_addr;
+ size_t map_len;
+ int n_pages;
+
+ map_len = RTE_ALIGN_CEIL(align + elt_size +
+ MALLOC_ELEM_TRAILER_LEN, pg_sz);
+ n_pages = map_len / pg_sz;
+
+ /* we can't know in advance how many pages we'll need, so we malloc */
+ ms = malloc(sizeof(*ms) * n_pages);
+
+ if (ms == NULL)
+ return -1;
+
+ elem = alloc_pages_on_heap(heap, pg_sz, elt_size, socket, flags, align,
+ bound, contig, ms, n_pages);
+
+ if (elem == NULL)
+ goto free_ms;
+
+ map_addr = ms[0]->addr;
+
+ /* notify other processes that this has happened */
+ if (request_sync()) {
+ /* we couldn't ensure all processes have mapped memory,
+ * so free it back and notify everyone that it's been
+ * freed back.
+ */
goto free_elem;
+ }
+ heap->total_size += map_len;

RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
socket, map_len >> 20ULL);
@@ -205,13 +260,9 @@ try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
return 0;

free_elem:
- malloc_elem_free_list_remove(elem);
- malloc_elem_hide_region(elem, map_addr, map_len);
- heap->total_size -= map_len;
+ rollback_expand_heap(ms, n_pages, elem, map_addr, map_len);

-free_pages:
- for (i = 0; i < n_pages; i++)
- eal_memalloc_free_page(ms[i]);
+ request_sync();
free_ms:
free(ms);

@@ -219,6 +270,57 @@ try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
}

static int
+try_expand_heap_secondary(struct malloc_heap *heap, uint64_t pg_sz,
+ size_t elt_size, int socket, unsigned int flags, size_t align,
+ size_t bound, bool contig)
+{
+ struct malloc_mp_req req;
+ int req_result;
+
+ req.t = REQ_TYPE_ALLOC;
+ req.alloc_req.align = align;
+ req.alloc_req.bound = bound;
+ req.alloc_req.contig = contig;
+ req.alloc_req.flags = flags;
+ req.alloc_req.elt_size = elt_size;
+ req.alloc_req.page_sz = pg_sz;
+ req.alloc_req.socket = socket;
+ req.alloc_req.heap = heap; /* it's in shared memory */
+
+ req_result = request_to_primary(&req);
+
+ if (req_result != 0)
+ return -1;
+
+ if (req.result != REQ_RESULT_SUCCESS)
+ return -1;
+
+ return 0;
+}
+
+static int
+try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
+ int socket, unsigned int flags, size_t align, size_t bound,
+ bool contig)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int ret;
+
+ rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ ret = try_expand_heap_primary(heap, pg_sz, elt_size, socket,
+ flags, align, bound, contig);
+ } else {
+ ret = try_expand_heap_secondary(heap, pg_sz, elt_size, socket,
+ flags, align, bound, contig);
+ }
+
+ rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
+ return ret;
+}
+
+static int
compare_pagesz(const void *a, const void *b)
{
const struct rte_memseg_list * const*mpa = a;
@@ -236,11 +338,10 @@ compare_pagesz(const void *a, const void *b)
}

static int
-alloc_mem_on_socket(size_t size, int socket, unsigned int flags, size_t align,
- size_t bound, bool contig)
+alloc_more_mem_on_socket(struct malloc_heap *heap, size_t size, int socket,
+ unsigned int flags, size_t align, size_t bound, bool contig)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
struct rte_memseg_list *requested_msls[RTE_MAX_MEMSEG_LISTS];
struct rte_memseg_list *other_msls[RTE_MAX_MEMSEG_LISTS];
uint64_t requested_pg_sz[RTE_MAX_MEMSEG_LISTS];
@@ -355,7 +456,7 @@ heap_alloc_on_socket(const char *type, size_t size, int socket,

rte_spinlock_lock(&(heap->lock));

- align = align == 0 ? 1 : align;
+ align = RTE_MAX(align == 0 ? 1 : align, MALLOC_ELEM_HEADER_LEN);

/* for legacy mode, try once and with all flags */
if (internal_config.legacy_mem) {
@@ -372,7 +473,8 @@ heap_alloc_on_socket(const char *type, size_t size, int socket,
if (ret != NULL)
goto alloc_unlock;

- if (!alloc_mem_on_socket(size, socket, flags, align, bound, contig)) {
+ if (!alloc_more_mem_on_socket(heap, size, socket, flags, align, bound,
+ contig)) {
ret = heap_alloc(heap, type, size, flags, align, bound, contig);

/* this should have succeeded */
@@ -424,14 +526,40 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg,
return NULL;
}

+/* this function is exposed in malloc_mp.h */
+int
+malloc_heap_free_pages(void *aligned_start, size_t aligned_len)
+{
+ int n_pages, page_idx, max_page_idx;
+ struct rte_memseg_list *msl;
+
+ msl = rte_mem_virt2memseg_list(aligned_start);
+ if (msl == NULL)
+ return -1;
+
+ n_pages = aligned_len / msl->hugepage_sz;
+ page_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) /
+ msl->hugepage_sz;
+ max_page_idx = page_idx + n_pages;
+
+ for (; page_idx < max_page_idx; page_idx++) {
+ struct rte_memseg *ms;
+
+ ms = rte_fbarray_get(&msl->memseg_arr, page_idx);
+ eal_memalloc_free_page(ms);
+ }
+ return 0;
+}
+
int
malloc_heap_free(struct malloc_elem *elem)
{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct malloc_heap *heap;
void *start, *aligned_start, *end, *aligned_end;
size_t len, aligned_len;
struct rte_memseg_list *msl;
- int n_pages, page_idx, max_page_idx, ret;
+ int n_pages, ret;

if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
return -1;
@@ -463,30 +591,60 @@ malloc_heap_free(struct malloc_elem *elem)
aligned_end = RTE_PTR_ALIGN_FLOOR(end, msl->hugepage_sz);

aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start);
+ n_pages = aligned_len / msl->hugepage_sz;

/* can't free anything */
- if (aligned_len < msl->hugepage_sz)
+ if (n_pages == 0)
goto free_unlock;

+ rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);
+
+ /*
+ * we allow secondary processes to clear the heap of this allocated
+ * memory because it is safe to do so, as even if notifications about
+ * unmapped pages don't make it to other processes, heap is shared
+ * across all processes, and will become empty of this memory anyway,
+ * and nothing can allocate it back unless primary process will be able
+ * to deliver allocation message to every single running process.
+ */
+
malloc_elem_free_list_remove(elem);

malloc_elem_hide_region(elem, (void *) aligned_start, aligned_len);

- /* we don't really care if we fail to deallocate memory */
- n_pages = aligned_len / msl->hugepage_sz;
- page_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) / msl->hugepage_sz;
- max_page_idx = page_idx + n_pages;
+ heap->total_size -= n_pages * msl->hugepage_sz;

- for (; page_idx < max_page_idx; page_idx++) {
- struct rte_memseg *ms;
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ /* don't care if any of this fails */
+ malloc_heap_free_pages(aligned_start, aligned_len);

- ms = rte_fbarray_get(&msl->memseg_arr, page_idx);
- eal_memalloc_free_page(ms);
- heap->total_size -= msl->hugepage_sz;
+ request_sync();
+ } else {
+ struct malloc_mp_req req;
+
+ req.t = REQ_TYPE_FREE;
+ req.free_req.addr = aligned_start;
+ req.free_req.len = aligned_len;
+
+ /*
+ * we request primary to deallocate pages, but we don't do it
+ * in this thread. instead, we notify primary that we would like
+ * to deallocate pages, and this process will receive another
+ * request (in parallel) that will do it for us on another
+ * thread.
+ *
+ * we also don't really care if this succeeds - the data is
+ * already removed from the heap, so it is, for all intents and
+ * purposes, hidden from the rest of DPDK even if some other
+ * process (including this one) may have these pages mapped.
+ */
+ request_to_primary(&req);
}

RTE_LOG(DEBUG, EAL, "Heap on socket %d was shrunk by %zdMB\n",
msl->socket_id, aligned_len >> 20ULL);
+
+ rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
free_unlock:
rte_spinlock_unlock(&(heap->lock));
return ret;
@@ -579,6 +737,11 @@ rte_eal_malloc_heap_init(void)
if (mcfg == NULL)
return -1;

+ if (register_mp_requests()) {
+ RTE_LOG(ERR, EAL, "Couldn't register malloc multiprocess actions\n");
+ return -1;
+ }
+
/* secondary processes don't need to initialize heap */
if (rte_eal_process_type() == RTE_PROC_SECONDARY)
return 0;
@@ -604,6 +767,7 @@ rte_eal_malloc_heap_init(void)
rte_fbarray_get(arr, ms_idx);
malloc_heap_add_memory(heap, msl,
ms->addr, ms->len);
+ heap->total_size += ms->len;
ms_idx++;
RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
msl->socket_id, ms->len >> 20ULL);
@@ -630,6 +794,8 @@ rte_eal_malloc_heap_init(void)
*/
malloc_heap_add_memory(heap, msl, start_seg->addr, len);

+ heap->total_size += len;
+
RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
msl->socket_id, len >> 20ULL);

diff --git a/lib/librte_eal/common/malloc_mp.c b/lib/librte_eal/common/malloc_mp.c
new file mode 100644
index 0000000..8052680
--- /dev/null
+++ b/lib/librte_eal/common/malloc_mp.c
@@ -0,0 +1,723 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <string.h>
+#include <sys/time.h>
+
+#include <rte_alarm.h>
+#include <rte_errno.h>
+
+#include "eal_memalloc.h"
+
+#include "malloc_elem.h"
+#include "malloc_mp.h"
+
+#define MP_ACTION_SYNC "mp_malloc_sync"
+/**< request sent by primary process to notify of changes in memory map */
+#define MP_ACTION_ROLLBACK "mp_malloc_rollback"
+/**< request sent by primary process to notify of changes in memory map. this is
+ * essentially a regular sync request, but we cannot send sync requests while
+ * another one is in progress, and we might have to - therefore, we do this as
+ * a separate callback.
+ */
+#define MP_ACTION_REQUEST "mp_malloc_request"
+/**< request sent by secondary process to ask for allocation/deallocation */
+#define MP_ACTION_RESPONSE "mp_malloc_response"
+/**< response sent to secondary process to indicate result of request */
+
+#define MP_TIMEOUT_S 5 /**< 5 seconds timeouts */
+
+/* when we're allocating, we need to store some state to ensure that we can
+ * roll back later
+ */
+struct primary_alloc_req_state {
+ struct malloc_heap *heap;
+ struct rte_memseg **ms;
+ int ms_len;
+ struct malloc_elem *elem;
+ void *map_addr;
+ size_t map_len;
+};
+
+enum req_state {
+ REQ_STATE_INACTIVE = 0,
+ REQ_STATE_ACTIVE,
+ REQ_STATE_COMPLETE
+};
+
+struct mp_request {
+ TAILQ_ENTRY(mp_request) next;
+ struct malloc_mp_req user_req; /**< contents of request */
+ pthread_cond_t cond; /**< variable we use to time out on this request */
+ enum req_state state; /**< indicate status of this request */
+ struct primary_alloc_req_state alloc_state;
+};
+
+/*
+ * We could've used just a single request, but it may be possible for
+ * secondaries to timeout earlier than the primary, and send a new request while
+ * primary is still expecting replies to the old one. Therefore, each new
+ * request will get assigned a new ID, which is how we will distinguish between
+ * expected and unexpected messages.
+ */
+TAILQ_HEAD(mp_request_list, mp_request);
+static struct {
+ struct mp_request_list list;
+ pthread_mutex_t lock;
+} mp_request_list = {
+ .list = TAILQ_HEAD_INITIALIZER(mp_request_list.list),
+ .lock = PTHREAD_MUTEX_INITIALIZER
+};
+
+/**
+ * General workflow is the following:
+ *
+ * Allocation:
+ * S: send request to primary
+ * P: attempt to allocate memory
+ * if failed, sendmsg failure
+ * if success, send sync request
+ * S: if received msg of failure, quit
+ * if received sync request, synchronize memory map and reply with result
+ * P: if received sync request result
+ * if success, sendmsg success
+ * if failure, roll back allocation and send a rollback request
+ * S: if received msg of success, quit
+ * if received rollback request, synchronize memory map and reply with result
+ * P: if received sync request result
+ * sendmsg sync request result
+ * S: if received msg, quit
+ *
+ * Aside from timeouts, there are three points where we can quit:
+ * - if allocation failed straight away
+ * - if allocation and sync request succeeded
+ * - if allocation succeeded, sync request failed, allocation rolled back and
+ * rollback request received (irrespective of whether it succeeded or failed)
+ *
+ * Deallocation:
+ * S: send request to primary
+ * P: attempt to deallocate memory
+ * if failed, sendmsg failure
+ * if success, send sync request
+ * S: if received msg of failure, quit
+ * if received sync request, synchronize memory map and reply with result
+ * P: if received sync request result
+ * sendmsg sync request result
+ * S: if received msg, quit
+ *
+ * There is no "rollback" from deallocation, as it's safe to have some memory
+ * mapped in some processes - it's absent from the heap, so it won't get used.
+ */
+
+static struct mp_request *
+find_request_by_id(uint64_t id)
+{
+ struct mp_request *req;
+ TAILQ_FOREACH(req, &mp_request_list.list, next) {
+ if (req->user_req.id == id)
+ break;
+ }
+ return req;
+}
+
+/* this ID is, like, totally guaranteed to be absolutely unique. pinky swear. */
+static uint64_t
+get_unique_id(void)
+{
+ uint64_t id;
+ do {
+ id = rte_rand();
+ } while (find_request_by_id(id) != NULL);
+ return id;
+}
+
+/* secondary will respond to sync requests thusly */
+static int
+handle_sync(const struct rte_mp_msg *msg, const void *peer)
+{
+ struct rte_mp_msg reply = {0};
+ const struct malloc_mp_req *req =
+ (const struct malloc_mp_req *)msg->param;
+ struct malloc_mp_req *resp =
+ (struct malloc_mp_req *)reply.param;
+ int ret;
+
+ if (req->t != REQ_TYPE_SYNC) {
+ RTE_LOG(ERR, EAL, "Unexpected request from primary\n");
+ return -1;
+ }
+
+ reply.num_fds = 0;
+ snprintf(reply.name, sizeof(reply.name), "%s", msg->name);
+ reply.len_param = sizeof(*resp);
+
+ ret = eal_memalloc_sync_with_primary();
+
+ resp->t = REQ_TYPE_SYNC;
+ resp->id = req->id;
+ resp->result = ret == 0 ? REQ_RESULT_SUCCESS : REQ_RESULT_FAIL;
+
+ rte_mp_reply(&reply, peer);
+
+ return 0;
+}
+
+static int
+handle_alloc_request(const struct malloc_mp_req *m,
+ struct mp_request *req)
+{
+ const struct malloc_req_alloc *ar = &m->alloc_req;
+ struct malloc_heap *heap;
+ struct malloc_elem *elem;
+ struct rte_memseg **ms;
+ size_t map_len;
+ int n_pages;
+
+ map_len = RTE_ALIGN_CEIL(ar->align + ar->elt_size +
+ MALLOC_ELEM_TRAILER_LEN, ar->page_sz);
+ n_pages = map_len / ar->page_sz;
+
+ heap = ar->heap;
+
+ /* we can't know in advance how many pages we'll need, so we malloc */
+ ms = malloc(sizeof(*ms) * n_pages);
+
+ if (ms == NULL) {
+ RTE_LOG(ERR, EAL, "Couldn't allocate memory for request state\n");
+ goto fail;
+ }
+
+ elem = alloc_pages_on_heap(heap, ar->page_sz, ar->elt_size, ar->socket,
+ ar->flags, ar->align, ar->bound, ar->contig, ms,
+ n_pages);
+
+ if (elem == NULL)
+ goto fail;
+
+ /* we have succeeded in allocating memory, but we still need to sync
+ * with other processes. however, since DPDK IPC is single-threaded, we
+ * send an asynchronous request and exit this callback.
+ */
+
+ req->alloc_state.ms = ms;
+ req->alloc_state.ms_len = n_pages;
+ req->alloc_state.map_addr = ms[0]->addr;
+ req->alloc_state.map_len = map_len;
+ req->alloc_state.elem = elem;
+ req->alloc_state.heap = heap;
+
+ return 0;
+fail:
+ free(ms);
+ return -1;
+}
+
+/* first stage of primary handling requests from secondary */
+static int
+handle_request(const struct rte_mp_msg *msg, const void *peer __rte_unused)
+{
+ const struct malloc_mp_req *m =
+ (const struct malloc_mp_req *)msg->param;
+ struct mp_request *entry;
+ int ret;
+
+ /* lock access to request */
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ /* make sure it's not a dupe */
+ entry = find_request_by_id(m->id);
+ if (entry != NULL) {
+ RTE_LOG(ERR, EAL, "Duplicate request id\n");
+ goto fail;
+ }
+
+ entry = malloc(sizeof(*entry));
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "Unable to allocate memory for request\n");
+ goto fail;
+ }
+
+ /* erase all data */
+ memset(entry, 0, sizeof(*entry));
+
+ if (m->t == REQ_TYPE_ALLOC) {
+ ret = handle_alloc_request(m, entry);
+ } else if (m->t == REQ_TYPE_FREE) {
+ ret = malloc_heap_free_pages(m->free_req.addr,
+ m->free_req.len);
+ } else {
+ RTE_LOG(ERR, EAL, "Unexpected request from secondary\n");
+ goto fail;
+ }
+
+ if (ret != 0) {
+ struct rte_mp_msg resp_msg;
+ struct malloc_mp_req *resp =
+ (struct malloc_mp_req *)resp_msg.param;
+
+ /* send failure message straight away */
+ resp_msg.num_fds = 0;
+ resp_msg.len_param = sizeof(*resp);
+ snprintf(resp_msg.name, sizeof(resp_msg.name), "%s",
+ MP_ACTION_RESPONSE);
+
+ resp->t = m->t;
+ resp->result = REQ_RESULT_FAIL;
+ resp->id = m->id;
+
+ if (rte_mp_sendmsg(&resp_msg)) {
+ RTE_LOG(ERR, EAL, "Couldn't send response\n");
+ goto fail;
+ }
+ /* we did not modify the request */
+ free(entry);
+ } else {
+ struct rte_mp_msg sr_msg = {0};
+ struct malloc_mp_req *sr =
+ (struct malloc_mp_req *)sr_msg.param;
+ struct timespec ts;
+
+ /* we can do something, so send sync request asynchronously */
+ sr_msg.num_fds = 0;
+ sr_msg.len_param = sizeof(*sr);
+ snprintf(sr_msg.name, sizeof(sr_msg.name), "%s",
+ MP_ACTION_SYNC);
+
+ ts.tv_nsec = 0;
+ ts.tv_sec = MP_TIMEOUT_S;
+
+ /* sync requests carry no data */
+ sr->t = REQ_TYPE_SYNC;
+ sr->id = m->id;
+
+ /* there may be stray timeout still waiting */
+ do {
+ ret = rte_mp_request_async(&sr_msg, &ts);
+ } while (ret != 0 && rte_errno == EEXIST);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Couldn't send sync request\n");
+ if (m->t == REQ_TYPE_ALLOC)
+ free(entry->alloc_state.ms);
+ goto fail;
+ }
+
+ /* mark request as in progress */
+ memcpy(&entry->user_req, m, sizeof(*m));
+ entry->state = REQ_STATE_ACTIVE;
+
+ TAILQ_INSERT_TAIL(&mp_request_list.list, entry, next);
+ }
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return 0;
+fail:
+ pthread_mutex_unlock(&mp_request_list.lock);
+ free(entry);
+ return -1;
+}
+
+/* callback for asynchronous sync requests for primary. this will either do a
+ * sendmsg with results, or trigger rollback request.
+ */
+static int
+handle_sync_response(const struct rte_mp_msg *request,
+ const struct rte_mp_reply *reply)
+{
+ enum malloc_req_result result;
+ struct mp_request *entry;
+ const struct malloc_mp_req *mpreq =
+ (const struct malloc_mp_req *)request->param;
+ int i;
+
+ /* lock the request */
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ entry = find_request_by_id(mpreq->id);
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "Wrong request ID\n");
+ goto fail;
+ }
+
+ result = REQ_RESULT_SUCCESS;
+
+ if (reply->nb_received != reply->nb_sent)
+ result = REQ_RESULT_FAIL;
+
+ for (i = 0; i < reply->nb_received; i++) {
+ struct malloc_mp_req *resp =
+ (struct malloc_mp_req *)reply->msgs[i].param;
+
+ if (resp->t != REQ_TYPE_SYNC) {
+ RTE_LOG(ERR, EAL, "Unexpected response to sync request\n");
+ result = REQ_RESULT_FAIL;
+ break;
+ }
+ if (resp->id != entry->user_req.id) {
+ RTE_LOG(ERR, EAL, "Response to wrong sync request\n");
+ result = REQ_RESULT_FAIL;
+ break;
+ }
+ if (resp->result == REQ_RESULT_FAIL) {
+ result = REQ_RESULT_FAIL;
+ break;
+ }
+ }
+
+ if (entry->user_req.t == REQ_TYPE_FREE) {
+ struct rte_mp_msg msg = {0};
+ struct malloc_mp_req *resp = (struct malloc_mp_req *)msg.param;
+
+ /* this is a free request, just sendmsg result */
+ resp->t = REQ_TYPE_FREE;
+ resp->result = result;
+ resp->id = entry->user_req.id;
+ msg.num_fds = 0;
+ msg.len_param = sizeof(*resp);
+ snprintf(msg.name, sizeof(msg.name), "%s", MP_ACTION_RESPONSE);
+
+ if (rte_mp_sendmsg(&msg))
+ RTE_LOG(ERR, EAL, "Could not send message to secondary process\n");
+
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(entry);
+ } else if (entry->user_req.t == REQ_TYPE_ALLOC &&
+ result == REQ_RESULT_SUCCESS) {
+ struct malloc_heap *heap = entry->alloc_state.heap;
+ struct rte_mp_msg msg = {0};
+ struct malloc_mp_req *resp =
+ (struct malloc_mp_req *)msg.param;
+
+ heap->total_size += entry->alloc_state.map_len;
+
+ /* result is success, so just notify secondary about this */
+ resp->t = REQ_TYPE_ALLOC;
+ resp->result = result;
+ resp->id = entry->user_req.id;
+ msg.num_fds = 0;
+ msg.len_param = sizeof(*resp);
+ snprintf(msg.name, sizeof(msg.name), "%s", MP_ACTION_RESPONSE);
+
+ if (rte_mp_sendmsg(&msg))
+ RTE_LOG(ERR, EAL, "Could not send message to secondary process\n");
+
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(entry->alloc_state.ms);
+ free(entry);
+ } else if (entry->user_req.t == REQ_TYPE_ALLOC &&
+ result == REQ_RESULT_FAIL) {
+ struct rte_mp_msg rb_msg = {0};
+ struct malloc_mp_req *rb =
+ (struct malloc_mp_req *)rb_msg.param;
+ struct timespec ts;
+ struct primary_alloc_req_state *state =
+ &entry->alloc_state;
+ int ret;
+
+ /* we've failed to sync, so do a rollback */
+ rollback_expand_heap(state->ms, state->ms_len, state->elem,
+ state->map_addr, state->map_len);
+
+ /* send rollback request */
+ rb_msg.num_fds = 0;
+ rb_msg.len_param = sizeof(*rb);
+ snprintf(rb_msg.name, sizeof(rb_msg.name), "%s",
+ MP_ACTION_ROLLBACK);
+
+ ts.tv_nsec = 0;
+ ts.tv_sec = MP_TIMEOUT_S;
+
+ /* sync requests carry no data */
+ rb->t = REQ_TYPE_SYNC;
+ rb->id = entry->user_req.id;
+
+ /* there may be stray timeout still waiting */
+ do {
+ ret = rte_mp_request_async(&rb_msg, &ts);
+ } while (ret != 0 && rte_errno == EEXIST);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Could not send rollback request to secondary process\n");
+
+ /* we couldn't send rollback request, but that's OK -
+ * secondary will time out, and memory has been removed
+ * from heap anyway.
+ */
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(state->ms);
+ free(entry);
+ goto fail;
+ }
+ } else {
+ RTE_LOG(ERR, EAL, " to sync request of unknown type\n");
+ goto fail;
+ }
+
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return 0;
+fail:
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return -1;
+}
+
+static int
+handle_rollback_response(const struct rte_mp_msg *request,
+ const struct rte_mp_reply *reply __rte_unused)
+{
+ struct rte_mp_msg msg = {0};
+ struct malloc_mp_req *resp = (struct malloc_mp_req *)msg.param;
+ const struct malloc_mp_req *mpreq =
+ (const struct malloc_mp_req *)request->param;
+ struct mp_request *entry;
+
+ /* lock the request */
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ entry = find_request_by_id(mpreq->id);
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "Wrong request ID\n");
+ goto fail;
+ }
+
+ if (entry->user_req.t != REQ_TYPE_ALLOC) {
+ RTE_LOG(ERR, EAL, "Unexpected active request\n");
+ goto fail;
+ }
+
+ /* we don't care if rollback succeeded, request still failed */
+ resp->t = REQ_TYPE_ALLOC;
+ resp->result = REQ_RESULT_FAIL;
+ resp->id = mpreq->id;
+ msg.num_fds = 0;
+ msg.len_param = sizeof(*resp);
+ snprintf(msg.name, sizeof(msg.name), "%s", MP_ACTION_RESPONSE);
+
+ if (rte_mp_sendmsg(&msg))
+ RTE_LOG(ERR, EAL, "Could not send message to secondary process\n");
+
+ /* clean up */
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(entry->alloc_state.ms);
+ free(entry);
+
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return 0;
+fail:
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return -1;
+}
+
+/* final stage of the request from secondary */
+static int
+handle_response(const struct rte_mp_msg *msg, const void *peer __rte_unused)
+{
+ const struct malloc_mp_req *m =
+ (const struct malloc_mp_req *)msg->param;
+ struct mp_request *entry;
+
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ entry = find_request_by_id(m->id);
+ if (entry != NULL) {
+ /* update request status */
+ entry->user_req.result = m->result;
+
+ entry->state = REQ_STATE_COMPLETE;
+
+ /* trigger thread wakeup */
+ pthread_cond_signal(&entry->cond);
+ }
+
+ pthread_mutex_unlock(&mp_request_list.lock);
+
+ return 0;
+}
+
+/* synchronously request memory map sync, this is only called whenever primary
+ * process initiates the allocation.
+ */
+int
+request_sync(void)
+{
+ struct rte_mp_msg msg = {0};
+ struct rte_mp_reply reply = {0};
+ struct malloc_mp_req *req = (struct malloc_mp_req *)msg.param;
+ struct timespec ts;
+ int i, ret;
+
+ /* no need to create tailq entries as this is entirely synchronous */
+
+ msg.num_fds = 0;
+ msg.len_param = sizeof(*req);
+ snprintf(msg.name, sizeof(msg.name), "%s", MP_ACTION_SYNC);
+
+ /* sync request carries no data */
+ req->t = REQ_TYPE_SYNC;
+ req->id = get_unique_id();
+
+ ts.tv_nsec = 0;
+ ts.tv_sec = MP_TIMEOUT_S;
+
+ /* there may be stray timeout still waiting */
+ do {
+ ret = rte_mp_request(&msg, &reply, &ts);
+ } while (ret != 0 && rte_errno == EEXIST);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Could not send sync request to secondary process\n");
+ ret = -1;
+ goto out;
+ }
+
+ if (reply.nb_received != reply.nb_sent) {
+ RTE_LOG(ERR, EAL, "Not all secondaries have responded\n");
+ ret = -1;
+ goto out;
+ }
+
+ for (i = 0; i < reply.nb_received; i++) {
+ struct malloc_mp_req *resp =
+ (struct malloc_mp_req *)reply.msgs[i].param;
+ if (resp->t != REQ_TYPE_SYNC) {
+ RTE_LOG(ERR, EAL, "Unexpected response from secondary\n");
+ ret = -1;
+ goto out;
+ }
+ if (resp->id != req->id) {
+ RTE_LOG(ERR, EAL, "Wrong request ID\n");
+ ret = -1;
+ goto out;
+ }
+ if (resp->result != REQ_RESULT_SUCCESS) {
+ RTE_LOG(ERR, EAL, "Secondary process failed to synchronize\n");
+ ret = -1;
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ free(reply.msgs);
+ return ret;
+}
+
+/* this is a synchronous wrapper around a bunch of asynchronous requests to
+ * primary process. this will initiate a request and wait until responses come.
+ */
+int
+request_to_primary(struct malloc_mp_req *user_req)
+{
+ struct rte_mp_msg msg = {0};
+ struct malloc_mp_req *msg_req = (struct malloc_mp_req *)msg.param;
+ struct mp_request *entry;
+ struct timespec ts = {0};
+ struct timeval now;
+ int ret;
+
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ entry = malloc(sizeof(*entry));
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memory for request\n");
+ goto fail;
+ }
+
+ memset(entry, 0, sizeof(*entry));
+
+ if (gettimeofday(&now, NULL) < 0) {
+ RTE_LOG(ERR, EAL, "Cannot get current time\n");
+ goto fail;
+ }
+
+ ts.tv_nsec = (now.tv_usec * 1000) % 1000000000;
+ ts.tv_sec = now.tv_sec + MP_TIMEOUT_S +
+ (now.tv_usec * 1000) / 1000000000;
+
+ /* initialize the request */
+ pthread_cond_init(&entry->cond, NULL);
+
+ msg.num_fds = 0;
+ msg.len_param = sizeof(*msg_req);
+ snprintf(msg.name, sizeof(msg.name), "%s", MP_ACTION_REQUEST);
+
+ /* (attempt to) get a unique id */
+ user_req->id = get_unique_id();
+
+ /* copy contents of user request into the message */
+ memcpy(msg_req, user_req, sizeof(*msg_req));
+
+ if (rte_mp_sendmsg(&msg)) {
+ RTE_LOG(ERR, EAL, "Cannot send message to primary\n");
+ goto fail;
+ }
+
+ /* copy contents of user request into active request */
+ memcpy(&entry->user_req, user_req, sizeof(*user_req));
+
+ /* mark request as in progress */
+ entry->state = REQ_STATE_ACTIVE;
+
+ TAILQ_INSERT_TAIL(&mp_request_list.list, entry, next);
+
+ /* finally, wait on timeout */
+ do {
+ ret = pthread_cond_timedwait(&entry->cond,
+ &mp_request_list.lock, &ts);
+ } while (ret != 0 && ret != ETIMEDOUT);
+
+ if (entry->state != REQ_STATE_COMPLETE) {
+ RTE_LOG(ERR, EAL, "Request timed out\n");
+ ret = -1;
+ } else {
+ ret = 0;
+ user_req->result = entry->user_req.result;
+ }
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(entry);
+
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return ret;
+fail:
+ pthread_mutex_unlock(&mp_request_list.lock);
+ free(entry);
+ return -1;
+}
+
+int
+register_mp_requests(void)
+{
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ if (rte_mp_action_register(MP_ACTION_REQUEST, handle_request)) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ MP_ACTION_REQUEST);
+ return -1;
+ }
+ if (rte_mp_async_reply_register(MP_ACTION_SYNC,
+ handle_sync_response)) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ MP_ACTION_SYNC);
+ return -1;
+ }
+ if (rte_mp_async_reply_register(MP_ACTION_ROLLBACK,
+ handle_rollback_response)) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ MP_ACTION_ROLLBACK);
+ return -1;
+ }
+ } else {
+ if (rte_mp_action_register(MP_ACTION_SYNC, handle_sync)) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ MP_ACTION_SYNC);
+ return -1;
+ }
+ if (rte_mp_action_register(MP_ACTION_ROLLBACK, handle_sync)) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ MP_ACTION_ROLLBACK);
+ return -1;
+ }
+ if (rte_mp_action_register(MP_ACTION_RESPONSE,
+ handle_response)) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ MP_ACTION_RESPONSE);
+ return -1;
+ }
+ }
+ return 0;
+}
diff --git a/lib/librte_eal/common/malloc_mp.h b/lib/librte_eal/common/malloc_mp.h
new file mode 100644
index 0000000..9c79d31
--- /dev/null
+++ b/lib/librte_eal/common/malloc_mp.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef MALLOC_MP_H
+#define MALLOC_MP_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <rte_common.h>
+#include <rte_random.h>
+#include <rte_spinlock.h>
+#include <rte_tailq.h>
+
+/* forward declarations */
+struct malloc_heap;
+struct rte_memseg;
+
+/* multiprocess synchronization structures for malloc */
+enum malloc_req_type {
+ REQ_TYPE_ALLOC, /**< ask primary to allocate */
+ REQ_TYPE_FREE, /**< ask primary to free */
+ REQ_TYPE_SYNC /**< ask secondary to synchronize its memory map */
+};
+
+enum malloc_req_result {
+ REQ_RESULT_SUCCESS,
+ REQ_RESULT_FAIL
+};
+
+struct malloc_req_alloc {
+ struct malloc_heap *heap;
+ uint64_t page_sz;
+ size_t elt_size;
+ int socket;
+ unsigned int flags;
+ size_t align;
+ size_t bound;
+ bool contig;
+};
+
+struct malloc_req_free {
+ RTE_STD_C11
+ union {
+ void *addr;
+ uint64_t addr_64;
+ };
+ uint64_t len;
+};
+
+struct malloc_mp_req {
+ enum malloc_req_type t;
+ RTE_STD_C11
+ union {
+ struct malloc_req_alloc alloc_req;
+ struct malloc_req_free free_req;
+ };
+ uint64_t id; /**< not to be populated by caller */
+ enum malloc_req_result result;
+};
+
+int
+register_mp_requests(void);
+
+int
+request_to_primary(struct malloc_mp_req *req);
+
+/* synchronous memory map sync request */
+int
+request_sync(void);
+
+/* functions from malloc_heap exposed here */
+int
+malloc_heap_free_pages(void *aligned_start, size_t aligned_len);
+
+struct malloc_elem *
+alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
+ int socket, unsigned int flags, size_t align, size_t bound,
+ bool contig, struct rte_memseg **ms, int n_pages);
+
+void
+rollback_expand_heap(struct rte_memseg **ms, int n_pages,
+ struct malloc_elem *elem, void *map_addr, size_t map_len);
+
+#endif /* MALLOC_MP_H */
diff --git a/lib/librte_eal/common/meson.build b/lib/librte_eal/common/meson.build
index a1ada24..8a3dcfe 100644
--- a/lib/librte_eal/common/meson.build
+++ b/lib/librte_eal/common/meson.build
@@ -27,6 +27,7 @@ common_sources = files(
'eal_common_timer.c',
'malloc_elem.c',
'malloc_heap.c',
+ 'malloc_mp.c',
'rte_keepalive.c',
'rte_malloc.c',
'rte_reciprocal.c',
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index 5380ba8..542bf7e 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -67,6 +67,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_fbarray.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_mp.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_keepalive.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_service.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_reciprocal.c
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 7a0d742..4bf8828 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -314,6 +314,8 @@ rte_config_init(void)
case RTE_PROC_INVALID:
rte_panic("Invalid process type\n");
}
+ /* disallow memory hotplug while init is active */
+ rte_rwlock_read_lock(&rte_config.mem_config->memory_hotplug_lock);
}

/* Unlocks hugepage directories that were locked by eal_hugepage_info_init */
@@ -676,6 +678,7 @@ rte_eal_mcfg_complete(void)
rte_config.mem_config->magic = RTE_MAGIC;

internal_config.init_complete = 1;
+ rte_rwlock_read_unlock(&rte_config.mem_config->memory_hotplug_lock);
}

/*
@@ -842,14 +845,14 @@ rte_eal_init(int argc, char **argv)
rte_eal_init_alert("Cannot init logging.");
rte_errno = ENOMEM;
rte_atomic32_clear(&run_once);
- return -1;
+ goto fail;
}

if (rte_mp_channel_init() < 0) {
rte_eal_init_alert("failed to init mp channel\n");
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
rte_errno = EFAULT;
- return -1;
+ goto fail;
}
}

@@ -858,7 +861,7 @@ rte_eal_init(int argc, char **argv)
rte_eal_init_alert("Cannot init VFIO\n");
rte_errno = EAGAIN;
rte_atomic32_clear(&run_once);
- return -1;
+ goto fail;
}
#endif
/* memzone_init maps rte_fbarrays, which has to be done before hugepage
@@ -868,13 +871,13 @@ rte_eal_init(int argc, char **argv)
if (rte_eal_memzone_init() < 0) {
rte_eal_init_alert("Cannot init memzone\n");
rte_errno = ENODEV;
- return -1;
+ goto fail;
}

if (rte_eal_memory_init() < 0) {
rte_eal_init_alert("Cannot init memory\n");
rte_errno = ENOMEM;
- return -1;
+ goto fail;
}

/* the directories are locked during eal_hugepage_info_init */
@@ -883,25 +886,25 @@ rte_eal_init(int argc, char **argv)
if (rte_eal_malloc_heap_init() < 0) {
rte_eal_init_alert("Cannot init malloc heap\n");
rte_errno = ENODEV;
- return -1;
+ goto fail;
}

if (rte_eal_tailqs_init() < 0) {
rte_eal_init_alert("Cannot init tail queues for objects\n");
rte_errno = EFAULT;
- return -1;
+ goto fail;
}

if (rte_eal_alarm_init() < 0) {
rte_eal_init_alert("Cannot init interrupt-handling thread\n");
/* rte_eal_alarm_init sets rte_errno on failure. */
- return -1;
+ goto fail;
}

if (rte_eal_timer_init() < 0) {
rte_eal_init_alert("Cannot init HPET or TSC timers\n");
rte_errno = ENOTSUP;
- return -1;
+ goto fail;
}

eal_check_mem_on_local_socket();
@@ -916,7 +919,7 @@ rte_eal_init(int argc, char **argv)

if (rte_eal_intr_init() < 0) {
rte_eal_init_alert("Cannot init interrupt-handling thread\n");
- return -1;
+ goto fail;
}

RTE_LCORE_FOREACH_SLAVE(i) {
@@ -925,18 +928,24 @@ rte_eal_init(int argc, char **argv)
* create communication pipes between master thread
* and children
*/
- if (pipe(lcore_config[i].pipe_master2slave) < 0)
- rte_panic("Cannot create pipe\n");
- if (pipe(lcore_config[i].pipe_slave2master) < 0)
- rte_panic("Cannot create pipe\n");
+ if (pipe(lcore_config[i].pipe_master2slave) < 0) {
+ rte_eal_init_alert("Cannot create pipe\n");
+ goto fail;
+ }
+ if (pipe(lcore_config[i].pipe_slave2master) < 0) {
+ rte_eal_init_alert("Cannot create pipe\n");
+ goto fail;
+ }

lcore_config[i].state = WAIT;

/* create a thread for each lcore */
ret = pthread_create(&lcore_config[i].thread_id, NULL,
eal_thread_loop, NULL);
- if (ret != 0)
- rte_panic("Cannot create thread\n");
+ if (ret != 0) {
+ rte_eal_init_alert("Cannot create thread\n");
+ goto fail;
+ }

/* Set thread_name for aid in debugging. */
snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
@@ -960,14 +969,14 @@ rte_eal_init(int argc, char **argv)
if (ret) {
rte_eal_init_alert("rte_service_init() failed\n");
rte_errno = ENOEXEC;
- return -1;
+ goto fail;
}

/* Probe all the buses and devices/drivers on them */
if (rte_bus_probe()) {
rte_eal_init_alert("Cannot probe devices\n");
rte_errno = ENOTSUP;
- return -1;
+ goto fail;
}

/* initialize default service/lcore mappings and start running. Ignore
@@ -976,12 +985,15 @@ rte_eal_init(int argc, char **argv)
ret = rte_service_start_with_defaults();
if (ret < 0 && ret != -ENOTSUP) {
rte_errno = ENOEXEC;
- return -1;
+ goto fail;
}

rte_eal_mcfg_complete();

return fctret;
+fail:
+ rte_rwlock_read_unlock(&rte_config.mem_config->memory_hotplug_lock);
+ return -1;
}

int __rte_experimental
--
2.7.4
Anatoly Burakov
2018-03-03 13:46:17 UTC
Permalink
Each process will have its own callbacks. Callbacks will indicate
whether it was an allocation or a deallocation that happened, and will
also provide the start VA address and length of the allocated block.

Since memory hotplug isn't supported on FreeBSD or in legacy mem
mode, it will not be possible to register callbacks in either case.
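For illustration, here is a minimal sketch of how an application could use
the new API once this patch is applied. The callback body, the "example_cb"
name and the setup/teardown wrappers are made up for the example; only the
register/unregister calls, the callback typedef and the event enum come from
this patchset.

#include <stdio.h>
#include <rte_errno.h>
#include <rte_memory.h>

/* example callback body - purely illustrative */
static void
mem_event_cb(enum rte_mem_event event_type, const void *addr, size_t len)
{
	if (event_type == RTE_MEM_EVENT_ALLOC)
		printf("hotplug: allocated %zu bytes at %p\n", len, addr);
	else
		printf("hotplug: about to free %zu bytes at %p\n", len, addr);
}

static int
setup_mem_callbacks(void)
{
	/* names must be unique; fails with rte_errno set in legacy mem mode */
	if (rte_mem_event_register_callback("example_cb", mem_event_cb) < 0) {
		printf("cannot register callback, rte_errno: %d\n", rte_errno);
		return -1;
	}
	return 0;
}

static void
teardown_mem_callbacks(void)
{
	rte_mem_event_unregister_callback("example_cb");
}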

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/eal_common_memalloc.c | 132 ++++++++++++++++++++++++++++
lib/librte_eal/common/eal_common_memory.c | 28 ++++++
lib/librte_eal/common/eal_memalloc.h | 10 +++
lib/librte_eal/common/include/rte_memory.h | 48 ++++++++++
lib/librte_eal/rte_eal_version.map | 2 +
5 files changed, 220 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_memalloc.c b/lib/librte_eal/common/eal_common_memalloc.c
index 62e8c16..4fb55f2 100644
--- a/lib/librte_eal/common/eal_common_memalloc.c
+++ b/lib/librte_eal/common/eal_common_memalloc.c
@@ -2,16 +2,46 @@
* Copyright(c) 2017-2018 Intel Corporation
*/

+#include <string.h>
+
+#include <rte_errno.h>
#include <rte_lcore.h>
#include <rte_fbarray.h>
#include <rte_memzone.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>
+#include <rte_rwlock.h>

#include "eal_private.h"
#include "eal_internal_cfg.h"
#include "eal_memalloc.h"

+struct mem_event_callback_entry {
+ TAILQ_ENTRY(mem_event_callback_entry) next;
+ char name[RTE_MEM_EVENT_CALLBACK_NAME_LEN];
+ rte_mem_event_callback_t clb;
+};
+
+/** Doubly-linked list of mem event callbacks. */
+TAILQ_HEAD(mem_event_callback_entry_list, mem_event_callback_entry);
+
+static struct mem_event_callback_entry_list callback_list =
+ TAILQ_HEAD_INITIALIZER(callback_list);
+
+static rte_rwlock_t rwlock = RTE_RWLOCK_INITIALIZER;
+
+static struct mem_event_callback_entry *
+find_callback(const char *name)
+{
+ struct mem_event_callback_entry *r;
+
+ TAILQ_FOREACH(r, &callback_list, next) {
+ if (!strcmp(r->name, name))
+ break;
+ }
+ return r;
+}
+
bool
eal_memalloc_is_contig(struct rte_memseg_list *msl, void *start,
size_t len)
@@ -47,3 +77,105 @@ eal_memalloc_is_contig(struct rte_memseg_list *msl, void *start,

return true;
}
+
+int
+eal_memalloc_callback_register(const char *name,
+ rte_mem_event_callback_t clb)
+{
+ struct mem_event_callback_entry *entry;
+ int ret, len;
+ if (name == NULL || clb == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ len = strnlen(name, RTE_MEM_EVENT_CALLBACK_NAME_LEN);
+ if (len == 0) {
+ rte_errno = EINVAL;
+ return -1;
+ } else if (len == RTE_MEM_EVENT_CALLBACK_NAME_LEN) {
+ rte_errno = ENAMETOOLONG;
+ return -1;
+ }
+ rte_rwlock_write_lock(&rwlock);
+
+ entry = find_callback(name);
+ if (entry != NULL) {
+ rte_errno = EEXIST;
+ ret = -1;
+ goto unlock;
+ }
+
+ entry = malloc(sizeof(*entry));
+ if (entry == NULL) {
+ rte_errno = ENOMEM;
+ ret = -1;
+ goto unlock;
+ }
+
+ /* callback successfully created and is valid, add it to the list */
+ entry->clb = clb;
+ snprintf(entry->name, RTE_MEM_EVENT_CALLBACK_NAME_LEN, "%s", name);
+ TAILQ_INSERT_TAIL(&callback_list, entry, next);
+
+ ret = 0;
+
+ RTE_LOG(DEBUG, EAL, "Mem event callback '%s' registered\n", name);
+
+unlock:
+ rte_rwlock_write_unlock(&rwlock);
+ return ret;
+}
+
+int
+eal_memalloc_callback_unregister(const char *name)
+{
+ struct mem_event_callback_entry *entry;
+ int ret, len;
+
+ if (name == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ len = strnlen(name, RTE_MEM_EVENT_CALLBACK_NAME_LEN);
+ if (len == 0) {
+ rte_errno = EINVAL;
+ return -1;
+ } else if (len == RTE_MEM_EVENT_CALLBACK_NAME_LEN) {
+ rte_errno = ENAMETOOLONG;
+ return -1;
+ }
+ rte_rwlock_write_lock(&rwlock);
+
+ entry = find_callback(name);
+ if (entry == NULL) {
+ rte_errno = ENOENT;
+ ret = -1;
+ goto unlock;
+ }
+ TAILQ_REMOVE(&callback_list, entry, next);
+ free(entry);
+
+ ret = 0;
+
+ RTE_LOG(DEBUG, EAL, "Mem event callback '%s' unregistered\n", name);
+
+unlock:
+ rte_rwlock_write_unlock(&rwlock);
+ return ret;
+}
+
+void
+eal_memalloc_notify(enum rte_mem_event event, const void *start, size_t len)
+{
+ struct mem_event_callback_entry *entry;
+
+ rte_rwlock_read_lock(&rwlock);
+
+ TAILQ_FOREACH(entry, &callback_list, next) {
+ RTE_LOG(DEBUG, EAL, "Calling mem event callback %s",
+ entry->name);
+ entry->clb(event, start, len);
+ }
+
+ rte_rwlock_read_unlock(&rwlock);
+}
diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index a571e24..dcba099 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -466,6 +466,34 @@ rte_eal_get_physmem_size(void)
return total_len;
}

+/*
+ * Defining here because declared in rte_memory.h, but the actual implementation
+ * is in eal_common_memalloc.c, like all other memalloc internals.
+ */
+int
+rte_mem_event_register_callback(const char *name, rte_mem_event_callback_t clb)
+{
+ /* FreeBSD boots with legacy mem enabled by default */
+ if (internal_config.legacy_mem) {
+ RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
+ return eal_memalloc_callback_register(name, clb);
+}
+
+int
+rte_mem_event_unregister_callback(const char *name)
+{
+ /* FreeBSD boots with legacy mem enabled by default */
+ if (internal_config.legacy_mem) {
+ RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
+ return eal_memalloc_callback_unregister(name);
+}
+
/* Dump the physical memory layout on console */
void
rte_dump_physmem_layout(FILE *f)
diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
index beac296..499cf58 100644
--- a/lib/librte_eal/common/eal_memalloc.h
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -28,4 +28,14 @@ eal_memalloc_is_contig(struct rte_memseg_list *msl, void *start,
int
eal_memalloc_sync_with_primary(void);

+int
+eal_memalloc_callback_register(const char *name,
+ rte_mem_event_callback_t clb);
+
+int
+eal_memalloc_callback_unregister(const char *name);
+
+void
+eal_memalloc_notify(enum rte_mem_event event, const void *start, size_t len);
+
+#endif /* EAL_MEMALLOC_H */
diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index 674d4cb..1c8ffa6 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -200,6 +200,54 @@ unsigned rte_memory_get_nrank(void);
*/
int rte_eal_using_phys_addrs(void);

+
+/**
+ * Enum indicating which kind of memory event has happened. Used by callbacks to
+ * distinguish between memory allocations and deallocations.
+ */
+enum rte_mem_event {
+ RTE_MEM_EVENT_ALLOC = 0, /**< Allocation event. */
+ RTE_MEM_EVENT_FREE, /**< Deallocation event. */
+};
+#define RTE_MEM_EVENT_CALLBACK_NAME_LEN 64
+/**< maximum length of callback name */
+
+/**
+ * Function typedef used to register callbacks for memory events.
+ */
+typedef void (*rte_mem_event_callback_t)(enum rte_mem_event event_type,
+ const void *addr, size_t len);
+
+/**
+ * Function used to register callbacks for memory events.
+ *
+ * @param name
+ * Name associated with specified callback to be added to the list.
+ *
+ * @param clb
+ * Callback function pointer.
+ *
+ * @return
+ * 0 on successful callback register
+ * -1 on unsuccessful callback register, with rte_errno value indicating
+ * reason for failure.
+ */
+int rte_mem_event_register_callback(const char *name,
+ rte_mem_event_callback_t clb);
+
+/**
+ * Function used to unregister callbacks for memory events.
+ *
+ * @param name
+ * Name associated with specified callback to be removed from the list.
+ *
+ * @return
+ * 0 on successful callback unregister
+ * -1 on unsuccessful callback unregister, with rte_errno value indicating
+ * reason for failure.
+ */
+int rte_mem_event_unregister_callback(const char *name);
+
#ifdef __cplusplus
}
#endif
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 4c2e959..b2a2d37 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -214,6 +214,8 @@ DPDK_18.05 {
global:

rte_num_sockets;
+ rte_mem_event_register_callback;
+ rte_mem_event_unregister_callback;
rte_mem_virt2memseg;
rte_mem_virt2memseg_list;
rte_malloc_dump_heaps;
--
2.7.4
Anatoly Burakov
2018-03-03 13:46:20 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
drivers/crypto/qat/qat_qp.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/crypto/qat/qat_qp.c b/drivers/crypto/qat/qat_qp.c
index 87b9ce0..3f8ed4d 100644
--- a/drivers/crypto/qat/qat_qp.c
+++ b/drivers/crypto/qat/qat_qp.c
@@ -95,8 +95,8 @@ queue_dma_zone_reserve(const char *queue_name, uint32_t queue_size,
default:
memzone_flags = RTE_MEMZONE_SIZE_HINT_ONLY;
}
- return rte_memzone_reserve_aligned(queue_name, queue_size, socket_id,
- memzone_flags, queue_size);
+ return rte_memzone_reserve_aligned_contig(queue_name, queue_size,
+ socket_id, memzone_flags, queue_size);
}

int qat_crypto_sym_qp_setup(struct rte_cryptodev *dev, uint16_t queue_pair_id,
--
2.7.4
Trahe, Fiona
2018-03-05 11:06:29 UTC
Permalink
-----Original Message-----
From: Burakov, Anatoly
Sent: Saturday, March 3, 2018 1:46 PM
Subject: [PATCH 32/41] crypto/qat: use contiguous allocation for DMA memory
Acked-by: Fiona Trahe <***@intel.com>
Anatoly Burakov
2018-03-03 13:46:22 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
drivers/net/bnx2x/bnx2x.c | 2 +-
drivers/net/bnx2x/bnx2x_rxtx.c | 3 ++-
2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bnx2x/bnx2x.c b/drivers/net/bnx2x/bnx2x.c
index fb02d0f..81f5dae 100644
--- a/drivers/net/bnx2x/bnx2x.c
+++ b/drivers/net/bnx2x/bnx2x.c
@@ -177,7 +177,7 @@ bnx2x_dma_alloc(struct bnx2x_softc *sc, size_t size, struct bnx2x_dma *dma,
rte_get_timer_cycles());

/* Caller must take care that strlen(mz_name) < RTE_MEMZONE_NAMESIZE */
- z = rte_memzone_reserve_aligned(mz_name, (uint64_t) (size),
+ z = rte_memzone_reserve_aligned_contig(mz_name, (uint64_t)size,
SOCKET_ID_ANY,
0, align);
if (z == NULL) {
diff --git a/drivers/net/bnx2x/bnx2x_rxtx.c b/drivers/net/bnx2x/bnx2x_rxtx.c
index a0d4ac9..325b94d 100644
--- a/drivers/net/bnx2x/bnx2x_rxtx.c
+++ b/drivers/net/bnx2x/bnx2x_rxtx.c
@@ -26,7 +26,8 @@ ring_dma_zone_reserve(struct rte_eth_dev *dev, const char *ring_name,
if (mz)
return mz;

- return rte_memzone_reserve_aligned(z_name, ring_size, socket_id, 0, BNX2X_PAGE_SIZE);
+ return rte_memzone_reserve_aligned_contig(z_name, ring_size, socket_id,
+ 0, BNX2X_PAGE_SIZE);
}

static void
--
2.7.4
Anatoly Burakov
2018-03-03 13:46:13 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/linuxapp/eal/eal_memalloc.c | 11 +++++++++++
1 file changed, 11 insertions(+)

diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index bbeeeba..c03e7bc 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -34,6 +34,7 @@
#include <rte_eal.h>
#include <rte_memory.h>
#include <rte_spinlock.h>
+#include <rte_vfio.h>

#include "eal_filesystem.h"
#include "eal_internal_cfg.h"
@@ -476,6 +477,10 @@ alloc_page(struct rte_memseg *ms, void *addr, uint64_t size, int socket_id,
ms->iova = iova;
ms->socket_id = socket_id;

+ /* map the segment so that VFIO has access to it */
+ if (rte_eal_iova_mode() == RTE_IOVA_VA &&
+ rte_vfio_dma_map(ms->addr_64, iova, size))
+ RTE_LOG(DEBUG, EAL, "Cannot register segment with VFIO\n");
return 0;

mapped:
@@ -507,6 +512,12 @@ free_page(struct rte_memseg *ms, struct hugepage_info *hi,
char path[PATH_MAX];
int fd, ret;

+ /* unmap the segment from VFIO */
+ if (rte_eal_iova_mode() == RTE_IOVA_VA &&
+ rte_vfio_dma_unmap(ms->addr_64, ms->iova, ms->len)) {
+ RTE_LOG(DEBUG, EAL, "Cannot unregister segment with VFIO\n");
+ }
+
if (mmap(ms->addr, ms->hugepage_sz, PROT_READ,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
MAP_FAILED) {
--
2.7.4
Anatoly Burakov
2018-03-03 13:46:19 UTC
Permalink
This fixes the following drivers in one go:

grep -Rl rte_eth_dma_zone_reserve drivers/

drivers/net/avf/avf_rxtx.c
drivers/net/thunderx/nicvf_ethdev.c
drivers/net/e1000/igb_rxtx.c
drivers/net/e1000/em_rxtx.c
drivers/net/fm10k/fm10k_ethdev.c
drivers/net/vmxnet3/vmxnet3_rxtx.c
drivers/net/liquidio/lio_rxtx.c
drivers/net/i40e/i40e_rxtx.c
drivers/net/sfc/sfc.c
drivers/net/ixgbe/ixgbe_rxtx.c
drivers/net/nfp/nfp_net.c

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_ether/rte_ethdev.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 0590f0c..7935230 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -3401,7 +3401,8 @@ rte_eth_dma_zone_reserve(const struct rte_eth_dev *dev, const char *ring_name,
if (mz)
return mz;

- return rte_memzone_reserve_aligned(z_name, size, socket_id, 0, align);
+ return rte_memzone_reserve_aligned_contig(z_name, size, socket_id, 0,
+ align);
}

int
--
2.7.4
Andrew Rybchenko
2018-03-03 14:05:14 UTC
Permalink
Does it mean that these drivers are broken in the middle of the patch set
and fixed now?
If so, it would be good to avoid it. It breaks bisect.
Post by Anatoly Burakov
grep -Rl rte_eth_dma_zone_reserve drivers/
drivers/net/avf/avf_rxtx.c
drivers/net/thunderx/nicvf_ethdev.c
drivers/net/e1000/igb_rxtx.c
drivers/net/e1000/em_rxtx.c
drivers/net/fm10k/fm10k_ethdev.c
drivers/net/vmxnet3/vmxnet3_rxtx.c
drivers/net/liquidio/lio_rxtx.c
drivers/net/i40e/i40e_rxtx.c
drivers/net/sfc/sfc.c
drivers/net/ixgbe/ixgbe_rxtx.c
drivers/net/nfp/nfp_net.c
---
lib/librte_ether/rte_ethdev.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 0590f0c..7935230 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -3401,7 +3401,8 @@ rte_eth_dma_zone_reserve(const struct rte_eth_dev *dev, const char *ring_name,
if (mz)
return mz;
- return rte_memzone_reserve_aligned(z_name, size, socket_id, 0, align);
+ return rte_memzone_reserve_aligned_contig(z_name, size, socket_id, 0,
+ align);
}
int
Burakov, Anatoly
2018-03-05 09:08:43 UTC
Permalink
Post by Andrew Rybchenko
Does it mean that these drivers are broken in the middle of the patch set
and fixed now?
If so, it would be good to avoid it. It breaks bisect.
Depends on the definition of "broken". Legacy memory mode will still
work for all drivers throughout the patchset. As for the new memory mode,
yes, it will be "broken in the middle of the patchset", but due to the
fact that there's an enormous amount of code to review between fbarray
changes, malloc changes, contiguous allocation changes and adding new
rte_memzone APIs, I favored ease of code review over bisect.

I can of course reorder and roll up several different patchsets and all
driver updates into one giant patch, but do you really want to be the
one reviewing such a patch?
--
Thanks,
Anatoly
Andrew Rybchenko
2018-03-05 09:15:53 UTC
Permalink
Post by Burakov, Anatoly
Post by Andrew Rybchenko
Does it mean that these drivers are broken in the middle of the patch set
and fixed now?
If so, it would be good to avoid it. It breaks bisect.
Depends on the definition of "broken". Legacy memory mode will still
work for all drivers throughout the patchset. As for the new memory mode,
yes, it will be "broken in the middle of the patchset", but due to the
fact that there's an enormous amount of code to review between fbarray
changes, malloc changes, contiguous allocation changes and adding new
rte_memzone APIs, I favored ease of code review over bisect.
I can of course reorder and roll up several different patchsets and all
driver updates into one giant patch, but do you really want to be the
one reviewing such a patch?
Is it possible to:
1. Introduce the _contig function
2. Switch users of the contiguous allocation to it, as you do now
3. Make the old function allocate possibly non-contiguous memory
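For illustration, a minimal sketch of what step 3 could look like; the
internal helper and all names and signatures below are hypothetical and not
defined by this patchset - only the split between a contiguous and a possibly
non-contiguous reservation path is the point being made:

#include <stdbool.h>
#include <stddef.h>

/* hypothetical common reservation path taking an explicit "contig" flag */
static void *
reserve_aligned_internal(const char *name, size_t len, int socket_id,
		unsigned int flags, unsigned int align, bool contig)
{
	(void)name; (void)len; (void)socket_id;
	(void)flags; (void)align; (void)contig;
	return NULL; /* real reservation logic elided in this sketch */
}

/* steps 1-2: explicit API for users that require IOVA-contiguous memory */
void *
reserve_aligned_contig(const char *name, size_t len, int socket_id,
		unsigned int flags, unsigned int align)
{
	return reserve_aligned_internal(name, len, socket_id, flags, align,
			true);
}

/* step 3: the original API keeps its signature, but callers may no longer
 * assume the memory behind it is IOVA-contiguous.
 */
void *
reserve_aligned(const char *name, size_t len, int socket_id,
		unsigned int flags, unsigned int align)
{
	return reserve_aligned_internal(name, len, socket_id, flags, align,
			false);
}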
Burakov, Anatoly
2018-03-05 10:00:41 UTC
Permalink
Post by Andrew Rybchenko
Post by Burakov, Anatoly
Post by Andrew Rybchenko
Does it mean that these drivers are broken in the middle of the patch set
and fixed now?
If so, it would be good to avoid it. It breaks bisect.
Depends on the definition of "broken". Legacy memory mode will still
work for all drivers throughout the patchset. As for the new memory mode,
yes, it will be "broken in the middle of the patchset", but due to the
fact that there's an enormous amount of code to review between fbarray
changes, malloc changes, contiguous allocation changes and adding new
rte_memzone APIs, I favored ease of code review over bisect.
I can of course reorder and roll up several different patchsets and all
driver updates into one giant patch, but do you really want to be the
one reviewing such a patch?
1. Introduce the _contig function
2. Switch users of the contiguous allocation to it, as you do now
3. Make the old function allocate possibly non-contiguous memory
Good point. I'll see if I can shuffle patches around for v2. Thanks!
--
Thanks,
Anatoly
Anatoly Burakov
2018-03-03 13:46:23 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
It is not 100% clear if this memzone is used for DMA,
corrections welcome.

drivers/net/cxgbe/sge.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/cxgbe/sge.c b/drivers/net/cxgbe/sge.c
index 3d5aa59..e31474c 100644
--- a/drivers/net/cxgbe/sge.c
+++ b/drivers/net/cxgbe/sge.c
@@ -1299,7 +1299,8 @@ static void *alloc_ring(size_t nelem, size_t elem_size,
* handle the maximum ring size is allocated in order to allow for
* resizing in later calls to the queue setup function.
*/
- tz = rte_memzone_reserve_aligned(z_name, len, socket_id, 0, 4096);
+ tz = rte_memzone_reserve_aligned_contig(z_name, len, socket_id, 0,
+ 4096);
if (!tz)
return NULL;
--
2.7.4
Anatoly Burakov
2018-03-03 13:46:26 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
It is not 100% clear that all users of this function
need to allocate DMA memory. Corrections welcome.

drivers/net/i40e/i40e_ethdev.c | 2 +-
drivers/net/i40e/i40e_rxtx.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 508b417..0fffe2c 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -4010,7 +4010,7 @@ i40e_allocate_dma_mem_d(__attribute__((unused)) struct i40e_hw *hw,
return I40E_ERR_PARAM;

snprintf(z_name, sizeof(z_name), "i40e_dma_%"PRIu64, rte_rand());
- mz = rte_memzone_reserve_bounded(z_name, size, SOCKET_ID_ANY, 0,
+ mz = rte_memzone_reserve_bounded_contig(z_name, size, SOCKET_ID_ANY, 0,
alignment, RTE_PGSIZE_2M);
if (!mz)
return I40E_ERR_NO_MEMORY;
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 1217e5a..6b2b40e 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -2189,7 +2189,7 @@ i40e_memzone_reserve(const char *name, uint32_t len, int socket_id)
if (mz)
return mz;

- mz = rte_memzone_reserve_aligned(name, len,
+ mz = rte_memzone_reserve_aligned_contig(name, len,
socket_id, 0, I40E_RING_BASE_ALIGN);
return mz;
}
--
2.7.4
Anatoly Burakov
2018-03-03 13:45:57 UTC
Permalink
rte_fbarray is a simple indexed array stored in shared memory
via mapping files into memory. Rationale for its existence is the
following: since we are going to map memory page-by-page, there
could be quite a lot of memory segments to keep track of (for
smaller page sizes, page count can easily reach thousands). We
can't really make page lists truly dynamic and infinitely expandable,
because that involves reallocating memory (which is a big no-no in
multiprocess). What we can do instead is have a maximum capacity as
something really, really large, and decide at allocation time how
big the array is going to be. We map the entire file into memory,
which makes it possible to use fbarray as shared memory, provided
the structure itself is allocated in shared memory. Per-fbarray
locking is also used to avoid index data races (but not contents
data races - that is up to user application to synchronize).

In addition, understanding that we will frequently need to scan
this array for free space, and that iterating over the array linearly can
become slow, rte_fbarray provides facilities to index the array's
usage. The following use cases are covered:
- find next free/used slot (useful either for adding new elements
to fbarray, or walking the list)
- find starting index for next N free/used slots (useful for when
we want to allocate chunk of VA-contiguous memory composed of
several pages)
- find how many contiguous free/used slots there are, starting
from specified index (useful for when we want to figure out
how many pages we have until next hole in allocated memory, to
speed up some bulk operations where we would otherwise have to
walk the array and add pages one by one)

This is accomplished by storing a usage mask in-memory, right
after the data section of the array, and using some bit-level
magic to figure out the info we need.
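For illustration, a minimal single-process usage sketch of this API (names
and sizes are arbitrary, error handling is trimmed; in a multiprocess setup
the rte_fbarray structure itself would live in shared memory):

#include <stdint.h>
#include <stdio.h>
#include <rte_fbarray.h>

static int
fbarray_example(void)
{
	struct rte_fbarray arr;
	uint64_t *slot;
	int idx;

	/* array of up to 1024 uint64_t entries, backed by a mapped file */
	if (rte_fbarray_init(&arr, "example", 1024, sizeof(uint64_t)) < 0)
		return -1;

	/* find a free slot, fill it, then mark it as used */
	idx = rte_fbarray_find_next_free(&arr, 0);
	slot = rte_fbarray_get(&arr, idx);
	*slot = UINT64_C(0xdeadbeef);
	rte_fbarray_set_used(&arr, idx);

	/* later, walk occupied entries starting from index 0 */
	idx = rte_fbarray_find_next_used(&arr, 0);
	printf("first used entry is at index %d\n", idx);

	/* remove the backing file once no process needs the array */
	return rte_fbarray_destroy(&arr);
}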

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
The initial version of this had resizing capability, however it was
removed due to the fact that, in a multiprocess scenario, each
fbarray would have its own view of mapped memory, which might not
correspond with others due to some other process performing a
resize that the current process didn't know about.

It was therefore decided that, to avoid the cost of synchronization on
each and every operation (to make sure the array wasn't resized),
the resizing feature should be dropped.

lib/librte_eal/bsdapp/eal/Makefile | 1 +
lib/librte_eal/common/Makefile | 2 +-
lib/librte_eal/common/eal_common_fbarray.c | 859 ++++++++++++++++++++++++++++
lib/librte_eal/common/eal_filesystem.h | 13 +
lib/librte_eal/common/include/rte_fbarray.h | 352 ++++++++++++
lib/librte_eal/common/meson.build | 2 +
lib/librte_eal/linuxapp/eal/Makefile | 1 +
lib/librte_eal/rte_eal_version.map | 17 +
8 files changed, 1246 insertions(+), 1 deletion(-)
create mode 100644 lib/librte_eal/common/eal_common_fbarray.c
create mode 100644 lib/librte_eal/common/include/rte_fbarray.h

diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile
index ed1d17b..1b43d77 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -53,6 +53,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_dev.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_options.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_thread.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_proc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_fbarray.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_malloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_elem.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_heap.c
diff --git a/lib/librte_eal/common/Makefile b/lib/librte_eal/common/Makefile
index ea824a3..48f870f 100644
--- a/lib/librte_eal/common/Makefile
+++ b/lib/librte_eal/common/Makefile
@@ -16,7 +16,7 @@ INC += rte_pci_dev_feature_defs.h rte_pci_dev_features.h
INC += rte_malloc.h rte_keepalive.h rte_time.h
INC += rte_service.h rte_service_component.h
INC += rte_bitmap.h rte_vfio.h rte_hypervisor.h rte_test.h
-INC += rte_reciprocal.h
+INC += rte_reciprocal.h rte_fbarray.h

GENERIC_INC := rte_atomic.h rte_byteorder.h rte_cycles.h rte_prefetch.h
GENERIC_INC += rte_spinlock.h rte_memcpy.h rte_cpuflags.h rte_rwlock.h
diff --git a/lib/librte_eal/common/eal_common_fbarray.c b/lib/librte_eal/common/eal_common_fbarray.c
new file mode 100644
index 0000000..76d86c3
--- /dev/null
+++ b/lib/librte_eal/common/eal_common_fbarray.c
@@ -0,0 +1,859 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <stdint.h>
+#include <errno.h>
+#include <sys/file.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_errno.h>
+#include <rte_spinlock.h>
+#include <rte_tailq.h>
+
+#include "eal_filesystem.h"
+#include "eal_private.h"
+
+#include "rte_fbarray.h"
+
+#define MASK_SHIFT 6ULL
+#define MASK_ALIGN (1 << MASK_SHIFT)
+#define MASK_LEN_TO_IDX(x) ((x) >> MASK_SHIFT)
+#define MASK_LEN_TO_MOD(x) ((x) - RTE_ALIGN_FLOOR(x, MASK_ALIGN))
+#define MASK_GET_IDX(idx, mod) (((idx) << MASK_SHIFT) + (mod))
+
+/*
+ * This is a mask that is always stored at the end of array, to provide fast
+ * way of finding free/used spots without looping through each element.
+ */
+
+struct used_mask {
+ int n_masks;
+ uint64_t data[];
+};
+
+static size_t
+calc_mask_size(int len)
+{
+ /* mask must be multiple of MASK_ALIGN, even though length of array
+ * itself may not be aligned on that boundary.
+ */
+ len = RTE_ALIGN_CEIL(len, MASK_ALIGN);
+ return sizeof(struct used_mask) +
+ sizeof(uint64_t) * MASK_LEN_TO_IDX(len);
+}
+
+static size_t
+calc_data_size(size_t page_sz, int elt_sz, int len)
+{
+ size_t data_sz = elt_sz * len;
+ size_t msk_sz = calc_mask_size(len);
+ return RTE_ALIGN_CEIL(data_sz + msk_sz, page_sz);
+}
+
+static struct used_mask *
+get_used_mask(void *data, int elt_sz, int len)
+{
+ return (struct used_mask *) RTE_PTR_ADD(data, elt_sz * len);
+}
+
+static int
+resize_and_map(int fd, void *addr, size_t len)
+{
+ void *map_addr;
+
+ if (ftruncate(fd, len)) {
+ RTE_LOG(ERR, EAL, "Cannot truncate fbarray file: %s\n",
+ strerror(errno));
+ /* pass errno up the chain */
+ rte_errno = errno;
+ return -1;
+ }
+
+ map_addr = mmap(addr, len, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0);
+ if (map_addr != addr) {
+ RTE_LOG(ERR, EAL, "mmap() failed: %s\n", strerror(errno));
+ /* pass errno up the chain */
+ rte_errno = errno;
+ return -1;
+ }
+ return 0;
+}
+
+static int
+find_next_n(const struct rte_fbarray *arr, int start, int n, bool used)
+{
+ const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz,
+ arr->len);
+ int msk_idx, lookahead_idx, first, first_mod;
+ int last, last_mod;
+ uint64_t last_msk, ignore_msk;
+
+ /*
+ * mask only has granularity of MASK_ALIGN, but start may not be aligned
+ * on that boundary, so construct a special mask to exclude anything we
+ * don't want to see to avoid confusing ctz.
+ */
+ first = MASK_LEN_TO_IDX(start);
+ first_mod = MASK_LEN_TO_MOD(start);
+ ignore_msk = ~((1ULL << first_mod) - 1);
+
+ /* array length may not be aligned, so calculate ignore mask for last
+ * mask index.
+ */
+ last = MASK_LEN_TO_IDX(arr->len);
+ last_mod = MASK_LEN_TO_MOD(arr->len);
+ last_msk = ~(-(1ULL) << last_mod);
+
+ for (msk_idx = first; msk_idx < msk->n_masks; msk_idx++) {
+ uint64_t cur_msk, lookahead_msk;
+ int run_start, clz, left;
+ bool found = false;
+ /*
+ * The process of getting n consecutive bits for arbitrary n is
+ * a bit involved, but here it is in a nutshell:
+ *
+ * 1. let n be the number of consecutive bits we're looking for
+ * 2. check if n can fit in one mask, and if so, do n-1
+ * rshift-ands to see if there is an appropriate run inside
+ * our current mask
+ * 2a. if we found a run, bail out early
+ * 2b. if we didn't find a run, proceed
+ * 3. invert the mask and count leading zeroes (that is, count
+ * how many consecutive set bits we had starting from the
+ * end of current mask) as k
+ * 3a. if k is 0, continue to next mask
+ * 3b. if k is not 0, we have a potential run
+ * 4. to satisfy our requirements, next mask must have n-k
+ * consecutive set bits right at the start, so we will do
+ * (n-k-1) rshift-ands and check if first bit is set.
+ *
+ * Step 4 will need to be repeated if (n-k) > MASK_ALIGN until
+ * we either run out of masks, lose the run, or find what we
+ * were looking for.
+ */
+ cur_msk = msk->data[msk_idx];
+ left = n;
+
+ /* if we're looking for free spaces, invert the mask */
+ if (!used)
+ cur_msk = ~cur_msk;
+
+ /* combine current ignore mask with last index ignore mask */
+ if (msk_idx == last)
+ ignore_msk |= last_msk;
+
+ /* if we have an ignore mask, ignore once */
+ if (ignore_msk) {
+ cur_msk &= ignore_msk;
+ ignore_msk = 0;
+ }
+
+ /* if n can fit in within a single mask, do a search */
+ if (n <= MASK_ALIGN) {
+ uint64_t tmp_msk = cur_msk;
+ int s_idx;
+ for (s_idx = 0; s_idx < n - 1; s_idx++)
+ tmp_msk &= tmp_msk >> 1ULL;
+ /* we found what we were looking for */
+ if (tmp_msk != 0) {
+ run_start = __builtin_ctzll(tmp_msk);
+ return MASK_GET_IDX(msk_idx, run_start);
+ }
+ }
+
+ /*
+ * we didn't find our run within the mask, or n > MASK_ALIGN,
+ * so we're going for plan B.
+ */
+
+ /* count leading zeroes on inverted mask */
+ clz = __builtin_clzll(~cur_msk);
+
+ /* if there aren't any runs at the end either, just continue */
+ if (clz == 0)
+ continue;
+
+ /* we have a partial run at the end, so try looking ahead */
+ run_start = MASK_ALIGN - clz;
+ left -= clz;
+
+ for (lookahead_idx = msk_idx + 1; lookahead_idx < msk->n_masks;
+ lookahead_idx++) {
+ int s_idx, need;
+ lookahead_msk = msk->data[lookahead_idx];
+
+ /* if we're looking for free space, invert the mask */
+ if (!used)
+ lookahead_msk = ~lookahead_msk;
+
+ /* figure out how many consecutive bits we need here */
+ need = RTE_MIN(left, MASK_ALIGN);
+
+ for (s_idx = 0; s_idx < need - 1; s_idx++)
+ lookahead_msk &= lookahead_msk >> 1ULL;
+
+ /* if first bit is not set, we've lost the run */
+ if ((lookahead_msk & 1) == 0) {
+ /*
+ * we've scanned this far, so we know there are
+ * no runs in the space we've lookahead-scanned
+ * as well, so skip that on next iteration.
+ */
+ ignore_msk = ~((1ULL << need) - 1);
+ msk_idx = lookahead_idx;
+ break;
+ }
+
+ left -= need;
+
+ /* check if we've found what we were looking for */
+ if (left == 0) {
+ found = true;
+ break;
+ }
+ }
+
+ /* we didn't find anything, so continue */
+ if (!found)
+ continue;
+
+ return MASK_GET_IDX(msk_idx, run_start);
+ }
+ /* we didn't find anything */
+ rte_errno = used ? ENOENT : ENOSPC;
+ return -1;
+}
+
+static int
+find_next(const struct rte_fbarray *arr, int start, bool used)
+{
+ const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz,
+ arr->len);
+ int idx, first, first_mod;
+ int last, last_mod;
+ uint64_t last_msk, ignore_msk;
+
+ /*
+ * mask only has granularity of MASK_ALIGN, but start may not be aligned
+ * on that boundary, so construct a special mask to exclude anything we
+ * don't want to see to avoid confusing ctz.
+ */
+ first = MASK_LEN_TO_IDX(start);
+ first_mod = MASK_LEN_TO_MOD(start);
+ ignore_msk = ~((1ULL << first_mod) - 1ULL);
+
+ /* array length may not be aligned, so calculate ignore mask for last
+ * mask index.
+ */
+ last = MASK_LEN_TO_IDX(arr->len);
+ last_mod = MASK_LEN_TO_MOD(arr->len);
+ last_msk = ~(-(1ULL) << last_mod);
+
+ for (idx = first; idx < msk->n_masks; idx++) {
+ uint64_t cur = msk->data[idx];
+ int found;
+
+ /* if we're looking for free entries, invert mask */
+ if (!used)
+ cur = ~cur;
+
+ if (idx == last)
+ cur &= last_msk;
+
+ /* ignore everything before start on first iteration */
+ if (idx == first)
+ cur &= ignore_msk;
+
+ /* check if we have any entries */
+ if (cur == 0)
+ continue;
+
+ /*
+ * find first set bit - that will correspond to whatever it is
+ * that we're looking for.
+ */
+ found = __builtin_ctzll(cur);
+ return MASK_GET_IDX(idx, found);
+ }
+ /* we didn't find anything */
+ rte_errno = used ? ENOENT : ENOSPC;
+ return -1;
+}
+
+static int
+find_contig(const struct rte_fbarray *arr, int start, bool used)
+{
+ const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz,
+ arr->len);
+ int idx, first, first_mod;
+ int last, last_mod, need_len, result = 0;
+ uint64_t last_msk;
+
+ /* array length may not be aligned, so calculate ignore mask for last
+ * mask index.
+ */
+ last = MASK_LEN_TO_IDX(arr->len);
+ last_mod = MASK_LEN_TO_MOD(arr->len);
+ last_msk = ~(-(1ULL) << last_mod);
+
+ first = MASK_LEN_TO_IDX(start);
+ first_mod = MASK_LEN_TO_MOD(start);
+ for (idx = first; idx < msk->n_masks; idx++, result += need_len) {
+ uint64_t cur = msk->data[idx];
+ int run_len;
+
+ need_len = MASK_ALIGN;
+
+ /* if we're looking for free entries, invert mask */
+ if (!used)
+ cur = ~cur;
+
+ /* if this is last mask, ignore everything after last bit */
+ if (idx == last)
+ cur &= last_msk;
+
+ /* ignore everything before start on first iteration */
+ if (idx == first) {
+ cur >>= first_mod;
+ /* at the start, we don't need the full mask len */
+ need_len -= first_mod;
+ }
+
+ /* we will be looking for zeroes, so invert the mask */
+ cur = ~cur;
+
+ /* if mask is zero, we have a complete run */
+ if (cur == 0)
+ continue;
+
+ /*
+ * see if current run ends before mask end.
+ */
+ run_len = __builtin_ctzll(cur);
+
+ /* add however many zeroes we've had in the last run and quit */
+ if (run_len < need_len) {
+ result += run_len;
+ break;
+ }
+ }
+ return result;
+}
+
+static int
+set_used(struct rte_fbarray *arr, int idx, bool used)
+{
+ struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, arr->len);
+ uint64_t msk_bit = 1ULL << MASK_LEN_TO_MOD(idx);
+ int msk_idx = MASK_LEN_TO_IDX(idx);
+ bool already_used;
+ int ret = -1;
+
+ if (arr == NULL || idx < 0 || idx >= arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ ret = 0;
+
+ /* prevent array from changing under us */
+ rte_rwlock_write_lock(&arr->rwlock);
+
+ already_used = (msk->data[msk_idx] & msk_bit) != 0;
+
+ /* nothing to be done */
+ if (used == already_used)
+ goto out;
+
+ if (used) {
+ msk->data[msk_idx] |= msk_bit;
+ arr->count++;
+ } else {
+ msk->data[msk_idx] &= ~msk_bit;
+ arr->count--;
+ }
+out:
+ rte_rwlock_write_unlock(&arr->rwlock);
+
+ return ret;
+}
+
+static int
+fully_validate(const char *name, unsigned int elt_sz, unsigned int len)
+{
+ if (name == NULL || elt_sz == 0 || len == 0) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ if (strnlen(name, RTE_FBARRAY_NAME_LEN) == RTE_FBARRAY_NAME_LEN) {
+ rte_errno = ENAMETOOLONG;
+ return -1;
+ }
+ return 0;
+}
+
+int
+rte_fbarray_init(struct rte_fbarray *arr, const char *name, int len, int elt_sz)
+{
+ size_t mmap_len, page_sz;
+ char path[PATH_MAX];
+ struct used_mask *msk;
+ void *data = NULL;
+ int fd = -1;
+
+ if (arr == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ if (fully_validate(name, elt_sz, len))
+ return -1;
+
+ page_sz = sysconf(_SC_PAGESIZE);
+
+ /* calculate our memory limits */
+ mmap_len = calc_data_size(page_sz, elt_sz, len);
+
+ data = eal_get_virtual_area(NULL, &mmap_len, page_sz, 0, 0);
+ if (data == NULL)
+ goto fail;
+
+ eal_get_fbarray_path(path, sizeof(path), name);
+
+ /*
+ * Each fbarray is unique to process namespace, i.e. the filename
+ * depends on process prefix. Try to take out a lock and see if we
+ * succeed. If we don't, someone else is using it already.
+ */
+ fd = open(path, O_CREAT | O_RDWR, 0600);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): couldn't open %s: %s\n", __func__,
+ path, strerror(errno));
+ rte_errno = errno;
+ goto fail;
+ } else if (flock(fd, LOCK_EX | LOCK_NB)) {
+ RTE_LOG(DEBUG, EAL, "%s(): couldn't lock %s: %s\n", __func__,
+ path, strerror(errno));
+ rte_errno = EBUSY;
+ goto fail;
+ }
+
+ /* take out a non-exclusive lock, so that other processes could still
+ * attach to it, but no other process could reinitialize it.
+ */
+ if (flock(fd, LOCK_SH | LOCK_NB)) {
+ rte_errno = errno;
+ goto fail;
+ }
+
+ if (resize_and_map(fd, data, mmap_len))
+ goto fail;
+
+ /* we've mmap'ed the file, we can now close the fd */
+ close(fd);
+
+ /* initialize the data */
+ memset(data, 0, mmap_len);
+
+ /* populate data structure */
+ snprintf(arr->name, sizeof(arr->name), "%s", name);
+ arr->data = data;
+ arr->len = len;
+ arr->elt_sz = elt_sz;
+ arr->count = 0;
+
+ msk = get_used_mask(data, elt_sz, len);
+ msk->n_masks = MASK_LEN_TO_IDX(len);
+
+ rte_rwlock_init(&arr->rwlock);
+
+ return 0;
+fail:
+ if (data)
+ munmap(data, mmap_len);
+ if (fd >= 0)
+ close(fd);
+ return -1;
+}
+
+int
+rte_fbarray_attach(struct rte_fbarray *arr)
+{
+ uint64_t mmap_len, page_sz;
+ char path[PATH_MAX];
+ void *data = NULL;
+ int fd = -1;
+
+ if (arr == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /*
+ * we don't need to synchronize attach as two values we need (element
+ * size and array length) are constant for the duration of life of
+ * the array, so the parts we care about will not race.
+ */
+
+ if (fully_validate(arr->name, arr->elt_sz, arr->len))
+ return -1;
+
+ page_sz = sysconf(_SC_PAGESIZE);
+
+ mmap_len = calc_data_size(page_sz, arr->elt_sz, arr->len);
+
+ data = eal_get_virtual_area(arr->data, &mmap_len, page_sz, 0, 0);
+ if (data == NULL)
+ goto fail;
+
+ eal_get_fbarray_path(path, sizeof(path), arr->name);
+
+ fd = open(path, O_RDWR);
+ if (fd < 0) {
+ rte_errno = errno;
+ goto fail;
+ }
+
+ /* lock the file, to let others know we're using it */
+ if (flock(fd, LOCK_SH | LOCK_NB)) {
+ rte_errno = errno;
+ goto fail;
+ }
+
+ if (resize_and_map(fd, data, mmap_len))
+ goto fail;
+
+ close(fd);
+
+ /* we're done */
+
+ return 0;
+fail:
+ if (data)
+ munmap(data, mmap_len);
+ if (fd >= 0)
+ close(fd);
+ return -1;
+}
+
+int
+rte_fbarray_detach(struct rte_fbarray *arr)
+{
+ if (arr == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /*
+ * we don't need to synchronize detach as two values we need (element
+ * size and total capacity) are constant for the duration of life of
+ * the array, so the parts we care about will not race. if the user is
+ * detaching while doing something else in the same process, we can't
+ * really do anything about it, things will blow up either way.
+ */
+
+ size_t page_sz = sysconf(_SC_PAGESIZE);
+
+ /* this may already be unmapped (e.g. repeated call from previously
+ * failed destroy(), but this is on user, we can't (easily) know if this
+ * is still mapped.
+ */
+ munmap(arr->data, calc_data_size(page_sz, arr->elt_sz, arr->len));
+
+ return 0;
+}
+
+int
+rte_fbarray_destroy(struct rte_fbarray *arr)
+{
+ int fd, ret;
+ char path[PATH_MAX];
+
+ ret = rte_fbarray_detach(arr);
+ if (ret)
+ return ret;
+
+ /* try deleting the file */
+ eal_get_fbarray_path(path, sizeof(path), arr->name);
+
+ fd = open(path, O_RDONLY);
+ if (flock(fd, LOCK_EX | LOCK_NB)) {
+ RTE_LOG(DEBUG, EAL, "Cannot destroy fbarray - another process is using it\n");
+ rte_errno = EBUSY;
+ ret = -1;
+ } else {
+ ret = 0;
+ unlink(path);
+ memset(arr, 0, sizeof(*arr));
+ }
+ close(fd);
+
+ return ret;
+}
+
+void *
+rte_fbarray_get(const struct rte_fbarray *arr, int idx)
+{
+ void *ret = NULL;
+ if (arr == NULL || idx < 0) {
+ rte_errno = EINVAL;
+ return NULL;
+ }
+
+ if (idx >= arr->len) {
+ rte_errno = EINVAL;
+ return NULL;
+ }
+
+ ret = RTE_PTR_ADD(arr->data, idx * arr->elt_sz);
+
+ return ret;
+}
+
+int
+rte_fbarray_set_used(struct rte_fbarray *arr, int idx)
+{
+ return set_used(arr, idx, true);
+}
+
+int
+rte_fbarray_set_free(struct rte_fbarray *arr, int idx)
+{
+ return set_used(arr, idx, false);
+}
+
+int
+rte_fbarray_is_used(struct rte_fbarray *arr, int idx)
+{
+ struct used_mask *msk;
+ int msk_idx;
+ uint64_t msk_bit;
+ int ret = -1;
+
+ if (arr == NULL || idx < 0 || idx >= arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ msk = get_used_mask(arr->data, arr->elt_sz, arr->len);
+ msk_idx = MASK_LEN_TO_IDX(idx);
+ msk_bit = 1ULL << MASK_LEN_TO_MOD(idx);
+
+ ret = (msk->data[msk_idx] & msk_bit) != 0;
+
+ rte_rwlock_read_unlock(&arr->rwlock);
+
+ return ret;
+}
+
+int
+rte_fbarray_find_next_free(struct rte_fbarray *arr, int start)
+{
+ int ret = -1;
+
+ if (arr == NULL || start < 0 || start >= arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ if (arr->len == arr->count) {
+ rte_errno = ENOSPC;
+ goto out;
+ }
+
+ ret = find_next(arr, start, false);
+out:
+ rte_rwlock_read_unlock(&arr->rwlock);
+ return ret;
+}
+
+int
+rte_fbarray_find_next_used(struct rte_fbarray *arr, int start)
+{
+ int ret = -1;
+
+ if (arr == NULL || start < 0 || start >= arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ if (arr->count == 0) {
+ rte_errno = ENOENT;
+ goto out;
+ }
+
+ ret = find_next(arr, start, true);
+out:
+ rte_rwlock_read_unlock(&arr->rwlock);
+ return ret;
+}
+
+int
+rte_fbarray_find_next_n_free(struct rte_fbarray *arr, int start, int n)
+{
+ int ret = -1;
+
+ if (arr == NULL || start < 0 || start >= arr->len ||
+ n < 0 || n > arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ if (arr->len == arr->count || arr->len - arr->count < n) {
+ rte_errno = ENOSPC;
+ goto out;
+ }
+
+ ret = find_next_n(arr, start, n, false);
+out:
+ rte_rwlock_read_unlock(&arr->rwlock);
+ return ret;
+}
+
+int
+rte_fbarray_find_next_n_used(struct rte_fbarray *arr, int start, int n)
+{
+ int ret = -1;
+
+ if (arr == NULL || start < 0 || start >= arr->len ||
+ n < 0 || n > arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ if (arr->count < n) {
+ rte_errno = ENOENT;
+ goto out;
+ }
+
+ ret = find_next_n(arr, start, n, true);
+out:
+ rte_rwlock_read_unlock(&arr->rwlock);
+ return ret;
+}
+
+int
+rte_fbarray_find_contig_free(struct rte_fbarray *arr, int start)
+{
+ int ret = -1;
+
+ if (arr == NULL || start < 0 || start >= arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ if (arr->len == arr->count) {
+ rte_errno = ENOSPC;
+ goto out;
+ }
+
+ if (arr->count == 0) {
+ ret = arr->len - start;
+ goto out;
+ }
+
+ ret = find_contig(arr, start, false);
+out:
+ rte_rwlock_read_unlock(&arr->rwlock);
+ return ret;
+}
+
+int
+rte_fbarray_find_contig_used(struct rte_fbarray *arr, int start)
+{
+ int ret = -1;
+
+ if (arr == NULL || start < 0 || start >= arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ ret = find_contig(arr, start, true);
+
+ rte_rwlock_read_unlock(&arr->rwlock);
+ return ret;
+}
+
+int
+rte_fbarray_find_idx(const struct rte_fbarray *arr, const void *elt)
+{
+ void *end;
+ int ret = -1;
+
+ /*
+ * no need to synchronize as it doesn't matter if underlying data
+ * changes - we're doing pointer arithmetic here.
+ */
+
+ if (arr == NULL || elt == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ end = RTE_PTR_ADD(arr->data, arr->elt_sz * arr->len);
+ if (elt < arr->data || elt >= end) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ ret = RTE_PTR_DIFF(elt, arr->data) / arr->elt_sz;
+
+ return ret;
+}
+
+void
+rte_fbarray_dump_metadata(struct rte_fbarray *arr, FILE *f)
+{
+ struct used_mask *msk;
+ int i;
+
+ if (arr == NULL || f == NULL) {
+ rte_errno = EINVAL;
+ return;
+ }
+
+ if (fully_validate(arr->name, arr->elt_sz, arr->len)) {
+ fprintf(f, "Invalid file-backed array\n");
+ goto out;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ fprintf(f, "File-backed array: %s\n", arr->name);
+ fprintf(f, "size: %i occupied: %i elt_sz: %i\n",
+ arr->len, arr->count, arr->elt_sz);
+
+ msk = get_used_mask(arr->data, arr->elt_sz, arr->len);
+
+ for (i = 0; i < msk->n_masks; i++)
+ fprintf(f, "msk idx %i: 0x%016" PRIx64 "\n", i, msk->data[i]);
+out:
+ rte_rwlock_read_unlock(&arr->rwlock);
+}
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index 4708dd5..1c6048b 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -13,6 +13,7 @@

/** Path of rte config file. */
#define RUNTIME_CONFIG_FMT "%s/.%s_config"
+#define FBARRAY_FMT "%s/%s_%s"

#include <stdint.h>
#include <limits.h>
@@ -55,6 +56,18 @@ eal_mp_socket_path(void)
return buffer;
}

+static inline const char *
+eal_get_fbarray_path(char *buffer, size_t buflen, const char *name) {
+ const char *directory = "/tmp";
+ const char *home_dir = getenv("HOME");
+
+ if (getuid() != 0 && home_dir != NULL)
+ directory = home_dir;
+ snprintf(buffer, buflen - 1, FBARRAY_FMT, directory,
+ internal_config.hugefile_prefix, name);
+ return buffer;
+}
+
/** Path of hugepage info file. */
#define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info"

diff --git a/lib/librte_eal/common/include/rte_fbarray.h b/lib/librte_eal/common/include/rte_fbarray.h
new file mode 100644
index 0000000..4e1d207
--- /dev/null
+++ b/lib/librte_eal/common/include/rte_fbarray.h
@@ -0,0 +1,352 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#ifndef RTE_FBARRAY_H
+#define RTE_FBARRAY_H
+
+/**
+ * @file
+ *
+ * File-backed shared indexed array for DPDK.
+ *
+ * Basic workflow is expected to be the following:
+ * 1) Allocate array either using ``rte_fbarray_init()`` or
+ * ``rte_fbarray_attach()`` (depending on whether it's shared between
+ * multiple DPDK processes)
+ * 2) find free spots using ``rte_fbarray_find_next_free()``
+ * 3) get pointer to data in the free spot using ``rte_fbarray_get()``, and
+ * copy data into the pointer (element size is fixed)
+ * 4) mark entry as used using ``rte_fbarray_set_used()``
+ *
+ * Calls to ``rte_fbarray_init()`` and ``rte_fbarray_destroy()`` will have
+ * consequences for all processes, while calls to ``rte_fbarray_attach()`` and
+ * ``rte_fbarray_detach()`` will only have consequences within a single process.
+ * Therefore, it is safe to call ``rte_fbarray_attach()`` or
+ * ``rte_fbarray_detach()`` while another process is using ``rte_fbarray``,
+ * provided no other thread within the same process will try to use
+ * ``rte_fbarray`` before attaching or after detaching. It is not safe to call
+ * ``rte_fbarray_init()`` or ``rte_fbarray_destroy()`` while another thread or
+ * another process is using ``rte_fbarray``.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#include <rte_rwlock.h>
+
+#define RTE_FBARRAY_NAME_LEN 64
+
+struct rte_fbarray {
+ char name[RTE_FBARRAY_NAME_LEN]; /**< name associated with an array */
+ int count; /**< number of entries stored */
+ int len; /**< current length of the array */
+ int elt_sz; /**< size of each element */
+ void *data; /**< data pointer */
+ rte_rwlock_t rwlock; /**< multiprocess lock */
+};
+
+/**
+ * Set up ``rte_fbarray`` structure and allocate underlying resources.
+ *
+ * Call this function to correctly set up ``rte_fbarray`` and allocate
+ * underlying files that will be backing the data in the current process. Note
+ * that in order to use and share ``rte_fbarray`` between multiple processes,
+ * data pointed to by ``arr`` pointer must itself be allocated in shared memory.
+ *
+ * @param arr
+ * Valid pointer to allocated ``rte_fbarray`` structure.
+ *
+ * @param name
+ * Unique name to be assigned to this array.
+ *
+ * @param len
+ * Number of elements initially available in the array.
+ *
+ * @param elt_sz
+ * Size of each element.
+ *
+ * @return
+ * - 0 on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_init(struct rte_fbarray *arr, const char *name, int len,
+ int elt_sz);
+
+
+/**
+ * Attach to a file backing an already allocated and correctly set up
+ * ``rte_fbarray`` structure.
+ *
+ * Call this function to attach to file that will be backing the data in the
+ * current process. The structure must have been previously correctly set up
+ * with a call to ``rte_fbarray_init()``. Calls to ``rte_fbarray_attach()`` are
+ * usually meant to be performed in a multiprocessing scenario, with data
+ * pointed to by ``arr`` pointer allocated in shared memory.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up rte_fbarray structure.
+ *
+ * @return
+ * - 0 on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_attach(struct rte_fbarray *arr);
+
+
+/**
+ * Deallocate resources for an already allocated and correctly set up
+ * ``rte_fbarray`` structure, and remove the underlying file.
+ *
+ * Call this function to deallocate all resources associated with an
+ * ``rte_fbarray`` structure within the current process. This will also
+ * zero-fill data pointed to by ``arr`` pointer and remove the underlying file
+ * backing the data, so it is expected that by the time this function is called,
+ * all other processes have detached from this ``rte_fbarray``.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @return
+ * - 0 on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_destroy(struct rte_fbarray *arr);
+
+
+/**
+ * Deallocate resources for an already allocated and correctly set up
+ * ``rte_fbarray`` structure.
+ *
+ * Call this function to deallocate all resources associated with an
+ * ``rte_fbarray`` structure within current process.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @return
+ * - 0 on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_detach(struct rte_fbarray *arr);
+
+
+/**
+ * Get pointer to element residing at specified index.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param idx
+ * Index of an element to get a pointer to.
+ *
+ * @return
+ * - non-NULL pointer on success.
+ * - NULL on failure, with ``rte_errno`` indicating reason for failure.
+ */
+void *
+rte_fbarray_get(const struct rte_fbarray *arr, int idx);
+
+
+/**
+ * Find index of a specified element within the array.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param elt
+ * Pointer to element to find index to.
+ *
+ * @return
+ * - non-negative integer on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_find_idx(const struct rte_fbarray *arr, const void *elt);
+
+
+/**
+ * Mark specified element as used.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param idx
+ * Element index to mark as used.
+ *
+ * @return
+ * - 0 on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_set_used(struct rte_fbarray *arr, int idx);
+
+
+/**
+ * Mark specified element as free.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param idx
+ * Element index to mark as free.
+ *
+ * @return
+ * - 0 on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_set_free(struct rte_fbarray *arr, int idx);
+
+
+/**
+ * Check whether element at specified index is marked as used.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param idx
+ * Element index to check as used.
+ *
+ * @return
+ * - 1 if element is used.
+ * - 0 if element is unused.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_is_used(struct rte_fbarray *arr, int idx);
+
+
+/**
+ * Find index of next free element, starting at specified index.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param start
+ * Element index to start search from.
+ *
+ * @return
+ * - non-negative integer on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_find_next_free(struct rte_fbarray *arr, int start);
+
+
+/**
+ * Find index of next used element, starting at specified index.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param start
+ * Element index to start search from.
+ *
+ * @return
+ * - non-negative integer on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_find_next_used(struct rte_fbarray *arr, int start);
+
+
+/**
+ * Find index of next chunk of ``n`` free elements, starting at specified index.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param start
+ * Element index to start search from.
+ *
+ * @param n
+ * Number of free elements to look for.
+ *
+ * @return
+ * - non-negative integer on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_find_next_n_free(struct rte_fbarray *arr, int start, int n);
+
+
+/**
+ * Find index of next chunk of ``n`` used elements, starting at specified index.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param start
+ * Element index to start search from.
+ *
+ * @param n
+ * Number of used elements to look for.
+ *
+ * @return
+ * - non-negative integer on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_find_next_n_used(struct rte_fbarray *arr, int start, int n);
+
+
+/**
+ * Find how many more free entries there are, starting at specified index.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param start
+ * Element index to start search from.
+ *
+ * @return
+ * - non-negative integer on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_find_contig_free(struct rte_fbarray *arr, int start);
+
+
+/**
+ * Find how many more used entries there are, starting at specified index.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param start
+ * Element index to start search from.
+ *
+ * @return
+ * - non-negative integer on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_find_contig_used(struct rte_fbarray *arr, int start);
+
+
+/**
+ * Dump ``rte_fbarray`` metadata.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param f
+ * File object to dump information into.
+ */
+void
+rte_fbarray_dump_metadata(struct rte_fbarray *arr, FILE *f);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // RTE_FBARRAY_H
diff --git a/lib/librte_eal/common/meson.build b/lib/librte_eal/common/meson.build
index 82b8910..7d02191 100644
--- a/lib/librte_eal/common/meson.build
+++ b/lib/librte_eal/common/meson.build
@@ -11,6 +11,7 @@ common_sources = files(
'eal_common_devargs.c',
'eal_common_dev.c',
'eal_common_errno.c',
+ 'eal_common_fbarray.c',
'eal_common_hexdump.c',
'eal_common_launch.c',
'eal_common_lcore.c',
@@ -51,6 +52,7 @@ common_headers = files(
'include/rte_eal_memconfig.h',
'include/rte_eal_interrupts.h',
'include/rte_errno.h',
+ 'include/rte_fbarray.h',
'include/rte_hexdump.h',
'include/rte_interrupts.h',
'include/rte_keepalive.h',
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index b9c7727..c407a43 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -61,6 +61,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_dev.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_options.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_thread.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_proc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_fbarray.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 18b8bf5..a938a2f 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -216,6 +216,23 @@ DPDK_18.05 {

rte_num_sockets;
rte_malloc_dump_heaps;
+ rte_fbarray_init;
+ rte_fbarray_destroy;
+ rte_fbarray_attach;
+ rte_fbarray_detach;
+ rte_fbarray_resize;
+ rte_fbarray_get;
+ rte_fbarray_find_idx;
+ rte_fbarray_set_free;
+ rte_fbarray_set_used;
+ rte_fbarray_is_used;
+ rte_fbarray_find_next_free;
+ rte_fbarray_find_next_used;
+ rte_fbarray_find_next_n_free;
+ rte_fbarray_find_next_n_used;
+ rte_fbarray_find_contig_free;
+ rte_fbarray_find_contig_used;
+ rte_fbarray_dump_metadata;

} DPDK_18.02;
--
2.7.4
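
For context, the rte_fbarray API declared above is typically used along
these lines. This is an illustrative sketch only: the array name, length
and element type are made up, and error handling is trimmed.

static int fbarray_example(void)
{
	/* hypothetical element type, not part of this patch */
	struct my_elem { int val; };
	struct rte_fbarray arr;
	int idx;

	/* array of 64 elements, each sizeof(struct my_elem) bytes */
	if (rte_fbarray_init(&arr, "example", 64, sizeof(struct my_elem)))
		return -1;

	/* find a free slot, fill it in, mark it as used */
	idx = rte_fbarray_find_next_free(&arr, 0);
	if (idx >= 0) {
		struct my_elem *e = rte_fbarray_get(&arr, idx);
		e->val = 42;
		rte_fbarray_set_used(&arr, idx);
	}

	/* dump bookkeeping information for debugging */
	rte_fbarray_dump_metadata(&arr, stdout);
	return 0;
}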
Anatoly Burakov
2018-03-03 13:46:27 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
Doing "grep -R rte_memzone_reserve drivers/net/qede" returns the following:

drivers/net/qede/qede_fdir.c: mz = rte_memzone_reserve_aligned(mz_name, QEDE_MAX_FDIR_PKT_LEN,
drivers/net/qede/base/bcm_osal.c: mz = rte_memzone_reserve_aligned_contig(mz_name, size,
drivers/net/qede/base/bcm_osal.c: mz = rte_memzone_reserve_aligned_contig(mz_name, size, socket_id, 0,

I took a brief look at the memzone in qede_fdir and it didn't look like
it was used for DMA, so I left it alone. Corrections welcome.

drivers/net/qede/base/bcm_osal.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/qede/base/bcm_osal.c b/drivers/net/qede/base/bcm_osal.c
index fe42f32..707d553 100644
--- a/drivers/net/qede/base/bcm_osal.c
+++ b/drivers/net/qede/base/bcm_osal.c
@@ -135,7 +135,7 @@ void *osal_dma_alloc_coherent(struct ecore_dev *p_dev,
if (core_id == (unsigned int)LCORE_ID_ANY)
core_id = 0;
socket_id = rte_lcore_to_socket_id(core_id);
- mz = rte_memzone_reserve_aligned(mz_name, size,
+ mz = rte_memzone_reserve_aligned_contig(mz_name, size,
socket_id, 0, RTE_CACHE_LINE_SIZE);
if (!mz) {
DP_ERR(p_dev, "Unable to allocate DMA memory "
@@ -174,7 +174,8 @@ void *osal_dma_alloc_coherent_aligned(struct ecore_dev *p_dev,
if (core_id == (unsigned int)LCORE_ID_ANY)
core_id = 0;
socket_id = rte_lcore_to_socket_id(core_id);
- mz = rte_memzone_reserve_aligned(mz_name, size, socket_id, 0, align);
+ mz = rte_memzone_reserve_aligned_contig(mz_name, size, socket_id, 0,
+ align);
if (!mz) {
DP_ERR(p_dev, "Unable to allocate DMA memory "
"of size %zu bytes - %s\n",
--
2.7.4
Anatoly Burakov
2018-03-03 13:45:58 UTC
Permalink
For now, this option does nothing, but it will be useful in
dynamic memory allocation down the line. Currently, DPDK stores
all pages as separate files in hugetlbfs. This option will allow
storing all pages in one file (one file per socket, per page size).
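
As an illustration (the file names below are the usual rtemap_*-style
hugepage files and are shown purely as an example of the difference):

default, one file per page:   rtemap_0, rtemap_1, rtemap_2, ...
--single-file-segments:       one rtemap-style file per socket, per page size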

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/eal_common_options.c | 4 ++++
lib/librte_eal/common/eal_internal_cfg.h | 4 ++++
lib/librte_eal/common/eal_options.h | 2 ++
lib/librte_eal/linuxapp/eal/eal.c | 1 +
4 files changed, 11 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 0be80cb..dbc3fb5 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -73,6 +73,7 @@ eal_long_options[] = {
{OPT_VDEV, 1, NULL, OPT_VDEV_NUM },
{OPT_VFIO_INTR, 1, NULL, OPT_VFIO_INTR_NUM },
{OPT_VMWARE_TSC_MAP, 0, NULL, OPT_VMWARE_TSC_MAP_NUM },
+ {OPT_SINGLE_FILE_SEGMENTS, 0, NULL, OPT_SINGLE_FILE_SEGMENTS_NUM},
{0, 0, NULL, 0 }
};

@@ -1161,6 +1162,9 @@ eal_parse_common_option(int opt, const char *optarg,

core_parsed = LCORE_OPT_MAP;
break;
+ case OPT_SINGLE_FILE_SEGMENTS_NUM:
+ conf->single_file_segments = 1;
+ break;

/* don't know what to do, leave this to caller */
default:
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 4e2c2e6..3e31ac6 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -47,6 +47,10 @@ struct internal_config {
volatile unsigned force_sockets;
volatile uint64_t socket_mem[RTE_MAX_NUMA_NODES]; /**< amount of memory per socket */
uintptr_t base_virtaddr; /**< base address to try and reserve memory from */
+ volatile unsigned single_file_segments;
+ /**< true if storing all pages within single files (per-page-size,
+ * per-node).
+ */
volatile int syslog_facility; /**< facility passed to openlog() */
/** default interrupt mode for VFIO */
volatile enum rte_intr_mode vfio_intr_mode;
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index e86c711..a4b80d5 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -55,6 +55,8 @@ enum {
OPT_VFIO_INTR_NUM,
#define OPT_VMWARE_TSC_MAP "vmware-tsc-map"
OPT_VMWARE_TSC_MAP_NUM,
+#define OPT_SINGLE_FILE_SEGMENTS "single-file-segments"
+ OPT_SINGLE_FILE_SEGMENTS_NUM,
OPT_LONG_MAX_NUM
};

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 2ecd07b..c84e6bf 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -348,6 +348,7 @@ eal_usage(const char *prgname)
" --"OPT_BASE_VIRTADDR" Base virtual address\n"
" --"OPT_CREATE_UIO_DEV" Create /dev/uioX (usually done by hotplug)\n"
" --"OPT_VFIO_INTR" Interrupt mode for VFIO (legacy|msi|msix)\n"
+ " --"OPT_SINGLE_FILE_SEGMENTS" Put all hugepage memory in single files\n"
"\n");
/* Allow the application to print its usage message too if hook is set */
if ( rte_application_usage_hook ) {
--
2.7.4
Anatoly Burakov
2018-03-03 13:46:02 UTC
Permalink
Nothing uses this code yet. The bulk of it is copied from old
memory allocation code (linuxapp eal_memory.c). We provide an
EAL-internal API to allocate either one page or multiple pages,
guaranteeing that we'll get contiguous VA for all of the pages
that we requested.
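
As a rough usage sketch of the new internal API (error handling trimmed;
the 2M page size and socket 0 are just example parameters):

/* allocate four VA-contiguous 2M pages on socket 0, requiring the
 * exact amount to succeed ("exact" == true)
 */
struct rte_memseg *ms[4];
int n_alloc = eal_memalloc_alloc_page_bulk(ms, 4, RTE_PGSIZE_2M, 0, true);
if (n_alloc < 0)
	return -1;

/* single-page variant */
struct rte_memseg *one = eal_memalloc_alloc_page(RTE_PGSIZE_2M, 0);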

For single-file segments, we will use fallocate() to grow and
shrink memory segments, however fallocate() is not supported
on all kernel versions, so we will fall back to using
ftruncate() to grow the file, and disable shrinking as there's
little we can do there. This will enable vhost use cases where
having single file segments is of great value even without
support for hot-unplugging memory.
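
In other words, the grow path amounts to roughly the following
(simplified sketch of the logic in resize_hugefile() below, not the
literal code; fd, offset and page_sz stand for the segment's backing
file, its offset within that file and the page size):

if (fallocate(fd, 0, offset, page_sz) < 0) {
	if (errno == ENOTSUP) {
		/* no fallocate() on this kernel: grow with ftruncate()
		 * instead, and remember that we won't be able to shrink
		 */
		fallocate_supported = 0;
		if (ftruncate(fd, offset + page_sz) < 0)
			return -1;
	} else {
		return -1;
	}
}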

Not supported on FreeBSD.

Locking is done via fcntl() because that way, when it comes to
taking out write locks or unlocking on deallocation, we don't
have to keep the original fd's around. Plus, using fcntl() gives us
the ability to lock parts of a file, which is useful for single-file
segments.
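
As a rough illustration (this mirrors the lock() helper added further
down rather than introducing any new API; fd, offset and page_sz stand
for the backing file, the page's offset within it and the page size):

/* take a shared (read) lock on just the page-sized range of the file
 * backing this segment; F_WRLCK and F_UNLCK are used the same way on
 * deallocation
 */
struct flock lck = {
	.l_type = F_RDLCK,
	.l_whence = SEEK_SET,
	.l_start = offset,	/* offset of this page within the file */
	.l_len = page_sz,
};
if (fcntl(fd, F_SETLK, &lck) == -1) {
	if (errno == EAGAIN || errno == EACCES)
		; /* range is locked by another process - not an error */
	else
		return -1; /* unexpected failure */
}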

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/bsdapp/eal/Makefile | 1 +
lib/librte_eal/bsdapp/eal/eal_memalloc.c | 26 ++
lib/librte_eal/bsdapp/eal/meson.build | 1 +
lib/librte_eal/common/eal_memalloc.h | 19 +
lib/librte_eal/linuxapp/eal/Makefile | 2 +
lib/librte_eal/linuxapp/eal/eal_memalloc.c | 609 +++++++++++++++++++++++++++++
lib/librte_eal/linuxapp/eal/meson.build | 1 +
7 files changed, 659 insertions(+)
create mode 100644 lib/librte_eal/bsdapp/eal/eal_memalloc.c
create mode 100644 lib/librte_eal/common/eal_memalloc.h
create mode 100644 lib/librte_eal/linuxapp/eal/eal_memalloc.c

diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile
index 1b43d77..19f9322 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -29,6 +29,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_memory.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_hugepage_info.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_thread.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_debug.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_memalloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_lcore.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_timer.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_interrupts.c
diff --git a/lib/librte_eal/bsdapp/eal/eal_memalloc.c b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
new file mode 100644
index 0000000..be8340b
--- /dev/null
+++ b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#include <inttypes.h>
+
+#include <rte_log.h>
+#include <rte_memory.h>
+
+#include "eal_memalloc.h"
+
+int
+eal_memalloc_alloc_page_bulk(struct rte_memseg **ms __rte_unused,
+ int __rte_unused n, uint64_t __rte_unused size,
+ int __rte_unused socket, bool __rte_unused exact)
+{
+ RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
+ return -1;
+}
+
+struct rte_memseg *
+eal_memalloc_alloc_page(uint64_t __rte_unused size, int __rte_unused socket)
+{
+ RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
+ return NULL;
+}
diff --git a/lib/librte_eal/bsdapp/eal/meson.build b/lib/librte_eal/bsdapp/eal/meson.build
index e83fc91..4b40223 100644
--- a/lib/librte_eal/bsdapp/eal/meson.build
+++ b/lib/librte_eal/bsdapp/eal/meson.build
@@ -8,6 +8,7 @@ env_sources = files('eal_alarm.c',
'eal_hugepage_info.c',
'eal_interrupts.c',
'eal_lcore.c',
+ 'eal_memalloc.c',
'eal_thread.c',
'eal_timer.c',
'eal.c',
diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
new file mode 100644
index 0000000..c1076cf
--- /dev/null
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#ifndef EAL_MEMALLOC_H
+#define EAL_MEMALLOC_H
+
+#include <stdbool.h>
+
+#include <rte_memory.h>
+
+struct rte_memseg *
+eal_memalloc_alloc_page(uint64_t size, int socket);
+
+int
+eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n, uint64_t size,
+ int socket, bool exact);
+
+#endif // EAL_MEMALLOC_H
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index c407a43..af6b9be 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -36,6 +36,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_thread.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_log.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio_mp_sync.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memalloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_debug.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_lcore.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_timer.c
@@ -82,6 +83,7 @@ CFLAGS_eal_interrupts.o := -D_GNU_SOURCE
CFLAGS_eal_vfio_mp_sync.o := -D_GNU_SOURCE
CFLAGS_eal_timer.o := -D_GNU_SOURCE
CFLAGS_eal_lcore.o := -D_GNU_SOURCE
+CFLAGS_eal_memalloc.o := -D_GNU_SOURCE
CFLAGS_eal_thread.o := -D_GNU_SOURCE
CFLAGS_eal_log.o := -D_GNU_SOURCE
CFLAGS_eal_common_log.o := -D_GNU_SOURCE
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
new file mode 100644
index 0000000..1ba1201
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -0,0 +1,609 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#define _FILE_OFFSET_BITS 64
+#include <errno.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/queue.h>
+#include <sys/file.h>
+#include <unistd.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+#include <numa.h>
+#include <numaif.h>
+#endif
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_eal_memconfig.h>
+#include <rte_eal.h>
+#include <rte_memory.h>
+#include <rte_spinlock.h>
+
+#include "eal_filesystem.h"
+#include "eal_internal_cfg.h"
+#include "eal_memalloc.h"
+
+/*
+ * not all kernel version support fallocate on hugetlbfs, so fall back to
+ * ftruncate and disallow deallocation if fallocate is not supported.
+ */
+static int fallocate_supported = -1; /* unknown */
+
+/*
+ * If each page is in a separate file, we can close fd's since we need each fd
+ * only once. However, in single file segments mode, we can get away with using
+ * a single fd for entire segments, but we need to store them somewhere. Each
+ * fd is different within each process, so we'll store them in a local tailq.
+ */
+struct msl_entry {
+ TAILQ_ENTRY(msl_entry) next;
+ unsigned int msl_idx;
+ int fd;
+};
+
+/** Double linked list of memseg list fd's. */
+TAILQ_HEAD(msl_entry_list, msl_entry);
+
+static struct msl_entry_list msl_entry_list =
+ TAILQ_HEAD_INITIALIZER(msl_entry_list);
+static rte_spinlock_t tailq_lock = RTE_SPINLOCK_INITIALIZER;
+
+static sigjmp_buf huge_jmpenv;
+
+static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
+{
+ siglongjmp(huge_jmpenv, 1);
+}
+
+/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
+ * non-static local variable in the stack frame calling sigsetjmp might be
+ * clobbered by a call to longjmp.
+ */
+static int __rte_unused huge_wrap_sigsetjmp(void)
+{
+ return sigsetjmp(huge_jmpenv, 1);
+}
+
+static struct sigaction huge_action_old;
+static int huge_need_recover;
+
+static void __rte_unused
+huge_register_sigbus(void)
+{
+ sigset_t mask;
+ struct sigaction action;
+
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGBUS);
+ action.sa_flags = 0;
+ action.sa_mask = mask;
+ action.sa_handler = huge_sigbus_handler;
+
+ huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
+}
+
+static void __rte_unused
+huge_recover_sigbus(void)
+{
+ if (huge_need_recover) {
+ sigaction(SIGBUS, &huge_action_old, NULL);
+ huge_need_recover = 0;
+ }
+}
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+static bool
+prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id)
+{
+ bool have_numa = true;
+
+ /* Check if kernel supports NUMA. */
+ if (numa_available() != 0) {
+ RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
+ have_numa = false;
+ }
+
+ if (have_numa) {
+ RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
+ if (get_mempolicy(oldpolicy, oldmask->maskp,
+ oldmask->size + 1, 0, 0) < 0) {
+ RTE_LOG(ERR, EAL,
+ "Failed to get current mempolicy: %s. "
+ "Assuming MPOL_DEFAULT.\n", strerror(errno));
+ oldpolicy = MPOL_DEFAULT;
+ }
+ RTE_LOG(DEBUG, EAL,
+ "Setting policy MPOL_PREFERRED for socket %d\n",
+ socket_id);
+ numa_set_preferred(socket_id);
+ }
+ return have_numa;
+}
+
+static void
+resotre_numa(int *oldpolicy, struct bitmask *oldmask)
+{
+ RTE_LOG(DEBUG, EAL,
+ "Restoring previous memory policy: %d\n", *oldpolicy);
+ if (oldpolicy == MPOL_DEFAULT) {
+ numa_set_localalloc();
+ } else if (set_mempolicy(*oldpolicy, oldmask->maskp,
+ oldmask->size + 1) < 0) {
+ RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
+ strerror(errno));
+ numa_set_localalloc();
+ }
+ numa_free_cpumask(oldmask);
+}
+#endif
+
+static struct msl_entry *
+get_msl_entry_by_idx(unsigned int list_idx)
+{
+ struct msl_entry *te;
+
+ rte_spinlock_lock(&tailq_lock);
+
+ TAILQ_FOREACH(te, &msl_entry_list, next) {
+ if (te->msl_idx == list_idx)
+ break;
+ }
+ if (te == NULL) {
+ /* doesn't exist, so create it and set fd to -1 */
+
+ te = malloc(sizeof(*te));
+ if (te == NULL) {
+ RTE_LOG(ERR, EAL, "%s(): cannot allocate tailq entry for memseg list\n",
+ __func__);
+ goto unlock;
+ }
+ te->msl_idx = list_idx;
+ te->fd = -1;
+ TAILQ_INSERT_TAIL(&msl_entry_list, te, next);
+ }
+unlock:
+ rte_spinlock_unlock(&tailq_lock);
+ return te;
+}
+
+/*
+ * uses fstat to report the size of a file on disk
+ */
+static off_t
+getFileSize(int fd)
+{
+ struct stat st;
+ if (fstat(fd, &st) < 0)
+ return 0;
+ return st.st_size;
+}
+
+/*
+ * uses fstat to check if file size on disk is zero (regular fstat won't show
+ * true file size due to how fallocate works)
+ */
+static bool
+is_zero_length(int fd)
+{
+ struct stat st;
+ if (fstat(fd, &st) < 0)
+ return false;
+ return st.st_blocks == 0;
+}
+
+static int
+get_page_fd(char *path, int buflen, struct hugepage_info *hi,
+ unsigned int list_idx, unsigned int seg_idx)
+{
+ int fd;
+
+ if (internal_config.single_file_segments) {
+ /*
+ * try to find a tailq entry, for this memseg list, or create
+ * one if it doesn't exist.
+ */
+ struct msl_entry *te = get_msl_entry_by_idx(list_idx);
+ if (te == NULL) {
+ RTE_LOG(ERR, EAL, "%s(): cannot allocate tailq entry for memseg list\n",
+ __func__);
+ return -1;
+ } else if (te->fd < 0) {
+ /* create a hugepage file */
+ eal_get_hugefile_path(path, buflen, hi->hugedir,
+ list_idx);
+ fd = open(path, O_CREAT | O_RDWR, 0600);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ te->fd = fd;
+ } else {
+ fd = te->fd;
+ }
+ } else {
+ /* one file per page, just create it */
+ eal_get_hugefile_path(path, buflen, hi->hugedir,
+ list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
+ fd = open(path, O_CREAT | O_RDWR, 0600);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
+ strerror(errno));
+ return -1;
+ }
+ }
+ return fd;
+}
+
+/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
+static int lock(int fd, uint64_t offset, uint64_t len, int type)
+{
+ struct flock lck = {0};
+ int ret;
+
+ lck.l_type = type;
+ lck.l_whence = SEEK_SET;
+ lck.l_start = offset;
+ lck.l_len = len;
+
+ ret = fcntl(fd, F_SETLK, &lck);
+
+ if (ret && (errno == EAGAIN || errno == EACCES)) {
+ /* locked by another process, not an error */
+ return 0;
+ } else if (ret) {
+ RTE_LOG(ERR, EAL, "%s(): error calling fcntl(): %s\n",
+ __func__, strerror(errno));
+ /* we've encountered an unexpected error */
+ return -1;
+ }
+ return 1;
+}
+
+static int
+resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz,
+ bool grow)
+{
+ bool again = false;
+ do {
+ if (fallocate_supported == 0) {
+ /* we cannot deallocate memory if fallocate() is not
+ * supported, but locks are still needed to prevent
+ * primary process' initialization from clearing out
+ * huge pages used by this process.
+ */
+
+ if (!grow) {
+ RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n",
+ __func__);
+ return -1;
+ }
+ uint64_t new_size = fa_offset + page_sz;
+ uint64_t cur_size = getFileSize(fd);
+
+ /* fallocate isn't supported, fall back to ftruncate */
+ if (new_size > cur_size &&
+ ftruncate(fd, new_size) < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ /* not being able to take out a read lock is an error */
+ if (lock(fd, fa_offset, page_sz, F_RDLCK) != 1)
+ return -1;
+ } else {
+ int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_KEEP_SIZE;
+ int ret;
+
+ /* if fallocate() is supported, we need to take out a
+ * read lock on allocate (to prevent other processes
+ * from deallocating this page), and take out a write
+ * lock on deallocate (to ensure nobody else is using
+ * this page).
+ *
+ * we can't use flock() for this, as we actually need to
+ * lock part of the file, not the entire file.
+ */
+
+ if (!grow) {
+ ret = lock(fd, fa_offset, page_sz, F_WRLCK);
+
+ if (ret < 0)
+ return -1;
+ else if (ret == 0)
+ /* failed to lock, not an error */
+ return 0;
+ }
+ if (fallocate(fd, flags, fa_offset, page_sz) < 0) {
+ if (fallocate_supported == -1 &&
+ errno == ENOTSUP) {
+ RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n",
+ __func__);
+ again = true;
+ fallocate_supported = 0;
+ } else {
+ RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
+ __func__,
+ strerror(errno));
+ return -1;
+ }
+ } else {
+ fallocate_supported = 1;
+
+ if (grow) {
+ /* if can't read lock, it's an error */
+ if (lock(fd, fa_offset, page_sz,
+ F_RDLCK) != 1)
+ return -1;
+ } else {
+ /* if can't unlock, it's an error */
+ if (lock(fd, fa_offset, page_sz,
+ F_UNLCK) != 1)
+ return -1;
+ }
+ }
+ }
+ } while (again);
+ return 0;
+}
+
+static int
+alloc_page(struct rte_memseg *ms, void *addr, uint64_t size, int socket_id,
+ struct hugepage_info *hi, unsigned int list_idx,
+ unsigned int seg_idx)
+{
+ int cur_socket_id = 0;
+ uint64_t map_offset;
+ char path[PATH_MAX];
+ int ret = 0;
+ int fd;
+
+ fd = get_page_fd(path, sizeof(path), hi, list_idx, seg_idx);
+ if (fd < 0)
+ return -1;
+
+
+ if (internal_config.single_file_segments) {
+ map_offset = seg_idx * size;
+ ret = resize_hugefile(fd, map_offset, size, true);
+ if (ret < 1)
+ goto resized;
+ } else {
+ map_offset = 0;
+ if (ftruncate(fd, size) < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+ __func__, strerror(errno));
+ goto resized;
+ }
+ /* we've allocated a page - take out a read lock. we're using
+ * fcntl() locks rather than flock() here because doing that
+ * gives us one huge advantage - fcntl() locks are per-process,
+ * not per-file descriptor, which means that we don't have to
+ * keep the original fd's around to keep a lock on the file.
+ *
+ * this is useful, because when it comes to unmapping pages, we
+ * will have to take out a write lock (to figure out if another
+ * process still has this page mapped), and to do itwith flock()
+ * we'll have to use original fd, as lock is associated with
+ * that particular fd. with fcntl(), this is not necessary - we
+ * can open a new fd and use fcntl() on that.
+ */
+ ret = lock(fd, map_offset, size, F_RDLCK);
+
+ /* this should not fail */
+ if (ret != 1) {
+ RTE_LOG(ERR, EAL, "%s(): error locking file: %s\n",
+ __func__,
+ strerror(errno));
+ goto resized;
+ }
+ }
+
+ /*
+ * map the segment, and populate page tables, the kernel fills this
+ * segment with zeros if it's a new page.
+ */
+ void *va = mmap(addr, size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, map_offset);
+ /* for non-single file segments, we can close fd here */
+ if (!internal_config.single_file_segments)
+ close(fd);
+
+ if (va == MAP_FAILED) {
+ RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
+ strerror(errno));
+ goto resized;
+ }
+ if (va != addr) {
+ RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__);
+ goto mapped;
+ }
+
+ rte_iova_t iova = rte_mem_virt2iova(addr);
+ if (iova == RTE_BAD_PHYS_ADDR) {
+ RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
+ __func__);
+ goto mapped;
+ }
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ move_pages(getpid(), 1, &addr, NULL, &cur_socket_id, 0);
+
+ if (cur_socket_id != socket_id) {
+ RTE_LOG(DEBUG, EAL,
+ "%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
+ __func__, socket_id, cur_socket_id);
+ goto mapped;
+ }
+#endif
+
+ /* In linux, hugetlb limitations, like cgroup, are
+ * enforced at fault time instead of mmap(), even
+ * with the option of MAP_POPULATE. Kernel will send
+ * a SIGBUS signal. To avoid to be killed, save stack
+ * environment here, if SIGBUS happens, we can jump
+ * back here.
+ */
+ if (huge_wrap_sigsetjmp()) {
+ RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n",
+ (unsigned int)(size / 0x100000));
+ goto mapped;
+ }
+ *(int *)addr = *(int *) addr;
+
+ ms->addr = addr;
+ ms->hugepage_sz = size;
+ ms->len = size;
+ ms->nchannel = rte_memory_get_nchannel();
+ ms->nrank = rte_memory_get_nrank();
+ ms->iova = iova;
+ ms->socket_id = socket_id;
+
+ return 0;
+
+mapped:
+ munmap(addr, size);
+resized:
+ if (internal_config.single_file_segments) {
+ resize_hugefile(fd, map_offset, size, false);
+ if (is_zero_length(fd)) {
+ struct msl_entry *te = get_msl_entry_by_idx(list_idx);
+ if (te != NULL && te->fd >= 0) {
+ close(te->fd);
+ te->fd = -1;
+ }
+ /* ignore errors, can't make it any worse */
+ unlink(path);
+ }
+ } else {
+ close(fd);
+ unlink(path);
+ }
+ return -1;
+}
+
+int
+eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
+ uint64_t size, int socket, bool exact)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl = NULL;
+ void *addr;
+ unsigned int msl_idx;
+ int cur_idx, end_idx, i, ret = -1;
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ bool have_numa;
+ int oldpolicy;
+ struct bitmask *oldmask = numa_allocate_nodemask();
+#endif
+ struct hugepage_info *hi = NULL;
+
+ /* dynamic allocation not supported in legacy mode */
+ if (internal_config.legacy_mem)
+ goto restore_numa;
+
+ for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) {
+ if (size ==
+ internal_config.hugepage_info[i].hugepage_sz) {
+ hi = &internal_config.hugepage_info[i];
+ break;
+ }
+ }
+ if (!hi) {
+ RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n",
+ __func__);
+ goto restore_numa;
+ }
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ have_numa = prepare_numa(&oldpolicy, oldmask, socket);
+#endif
+
+ /* there may be several memsegs for this page size and socket id, so try
+ * allocating on all of them.
+ */
+
+ /* find our memseg list */
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ struct rte_memseg_list *cur_msl = &mcfg->memsegs[msl_idx];
+
+ if (cur_msl->hugepage_sz != size)
+ continue;
+ if (cur_msl->socket_id != socket)
+ continue;
+ msl = cur_msl;
+
+ /* try finding space in memseg list */
+ cur_idx = rte_fbarray_find_next_n_free(&msl->memseg_arr, 0, n);
+
+ if (cur_idx < 0)
+ continue;
+
+ end_idx = cur_idx + n;
+
+ for (i = 0; cur_idx < end_idx; cur_idx++, i++) {
+ struct rte_memseg *cur;
+
+ cur = rte_fbarray_get(&msl->memseg_arr, cur_idx);
+ addr = RTE_PTR_ADD(msl->base_va,
+ cur_idx * msl->hugepage_sz);
+
+ if (alloc_page(cur, addr, size, socket, hi, msl_idx,
+ cur_idx)) {
+ RTE_LOG(DEBUG, EAL, "attempted to allocate %i pages, but only %i were allocated\n",
+ n, i);
+
+ /* if exact number wasn't requested, stop */
+ if (!exact)
+ ret = i;
+ goto restore_numa;
+ }
+ if (ms)
+ ms[i] = cur;
+
+ rte_fbarray_set_used(&msl->memseg_arr, cur_idx);
+ }
+ ret = n;
+
+ break;
+ }
+ /* we didn't break */
+ if (!msl) {
+ RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
+ __func__);
+ }
+
+restore_numa:
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ if (have_numa)
+ resotre_numa(&oldpolicy, oldmask);
+#endif
+ return ret;
+}
+
+struct rte_memseg *
+eal_memalloc_alloc_page(uint64_t size, int socket)
+{
+ struct rte_memseg *ms;
+ if (eal_memalloc_alloc_page_bulk(&ms, 1, size, socket, true) < 0)
+ return NULL;
+ /* return pointer to newly allocated memseg */
+ return ms;
+}
diff --git a/lib/librte_eal/linuxapp/eal/meson.build b/lib/librte_eal/linuxapp/eal/meson.build
index 03974ff..5254c6c 100644
--- a/lib/librte_eal/linuxapp/eal/meson.build
+++ b/lib/librte_eal/linuxapp/eal/meson.build
@@ -10,6 +10,7 @@ env_sources = files('eal_alarm.c',
'eal_debug.c',
'eal_hugepage_info.c',
'eal_interrupts.c',
+ 'eal_memalloc.c',
'eal_lcore.c',
'eal_log.c',
'eal_thread.c',
--
2.7.4
Olivier Matz
2018-03-19 17:42:02 UTC
Permalink
Post by Anatoly Burakov
Nothing uses this code yet. The bulk of it is copied from old
memory allocation code (linuxapp eal_memory.c). We provide an
EAL-internal API to allocate either one page or multiple pages,
guaranteeing that we'll get contiguous VA for all of the pages
that we requested.
For single-file segments, we will use fallocate() to grow and
shrink memory segments, however fallocate() is not supported
on all kernel versions, so we will fall back to using
ftruncate() to grow the file, and disable shrinking as there's
little we can do there. This will enable vhost use cases where
having single file segments is of great value even without
support for hot-unplugging memory.
Not supported on FreeBSD.
Locking is done via fcntl() because that way, when it comes to
taking out write locks or unlocking on deallocation, we don't
have to keep the original fd's around. Plus, using fcntl() gives us
the ability to lock parts of a file, which is useful for single-file
segments.
Few minor typos:

[...]
Post by Anatoly Burakov
+static void
+resotre_numa(int *oldpolicy, struct bitmask *oldmask)
restore

[...]
Post by Anatoly Burakov
+static off_t
+getFileSize(int fd)
should it be get_file_size()?

[...]
Post by Anatoly Burakov
+static int
+alloc_page(struct rte_memseg *ms, void *addr, uint64_t size, int socket_id,
+ struct hugepage_info *hi, unsigned int list_idx,
+ unsigned int seg_idx)
+{
+ int cur_socket_id = 0;
+ uint64_t map_offset;
+ char path[PATH_MAX];
+ int ret = 0;
+ int fd;
+
+ fd = get_page_fd(path, sizeof(path), hi, list_idx, seg_idx);
+ if (fd < 0)
+ return -1;
+
+
+ if (internal_config.single_file_segments) {
+ map_offset = seg_idx * size;
+ ret = resize_hugefile(fd, map_offset, size, true);
+ if (ret < 1)
+ goto resized;
+ } else {
+ map_offset = 0;
+ if (ftruncate(fd, size) < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+ __func__, strerror(errno));
+ goto resized;
+ }
+ /* we've allocated a page - take out a read lock. we're using
+ * fcntl() locks rather than flock() here because doing that
+ * gives us one huge advantage - fcntl() locks are per-process,
+ * not per-file descriptor, which means that we don't have to
+ * keep the original fd's around to keep a lock on the file.
+ *
+ * this is useful, because when it comes to unmapping pages, we
+ * will have to take out a write lock (to figure out if another
+ * process still has this page mapped), and to do itwith flock()
typo: itwith
Anatoly Burakov
2018-03-03 13:46:24 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
drivers/net/ena/base/ena_plat_dpdk.h | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ena/base/ena_plat_dpdk.h b/drivers/net/ena/base/ena_plat_dpdk.h
index 8cba319..c1ebf00 100644
--- a/drivers/net/ena/base/ena_plat_dpdk.h
+++ b/drivers/net/ena/base/ena_plat_dpdk.h
@@ -188,7 +188,8 @@ typedef uint64_t dma_addr_t;
ENA_TOUCH(dmadev); ENA_TOUCH(handle); \
snprintf(z_name, sizeof(z_name), \
"ena_alloc_%d", ena_alloc_cnt++); \
- mz = rte_memzone_reserve(z_name, size, SOCKET_ID_ANY, 0); \
+ mz = rte_memzone_reserve_contig(z_name, \
+ size, SOCKET_ID_ANY, 0); \
memset(mz->addr, 0, size); \
virt = mz->addr; \
phys = mz->iova; \
@@ -206,7 +207,7 @@ typedef uint64_t dma_addr_t;
ENA_TOUCH(dmadev); ENA_TOUCH(dev_node); \
snprintf(z_name, sizeof(z_name), \
"ena_alloc_%d", ena_alloc_cnt++); \
- mz = rte_memzone_reserve(z_name, size, node, 0); \
+ mz = rte_memzone_reserve_contig(z_name, size, node, 0); \
memset(mz->addr, 0, size); \
virt = mz->addr; \
phys = mz->iova; \
@@ -219,7 +220,7 @@ typedef uint64_t dma_addr_t;
ENA_TOUCH(dmadev); ENA_TOUCH(dev_node); \
snprintf(z_name, sizeof(z_name), \
"ena_alloc_%d", ena_alloc_cnt++); \
- mz = rte_memzone_reserve(z_name, size, node, 0); \
+ mz = rte_memzone_reserve_contig(z_name, size, node, 0); \
memset(mz->addr, 0, size); \
virt = mz->addr; \
} while (0)
--
2.7.4
Anatoly Burakov
2018-03-03 13:46:06 UTC
Permalink
The test expected memory to already be allocated on all sockets, and
thus failed because calling rte_malloc could trigger a memory hotplug
event and allocate memory where there was none before.

Fix it to instead report availability of memory on specific sockets by
attempting to allocate a page and seeing if that succeeds. Technically,
this can still fail if memory is unavailable at the time of the check
but becomes available by the time the test runs, but that is a corner
case not worth considering.
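
In short, the probe boils down to the following (sketch only;
eal_memalloc_free_page() is the internal counterpart used to give the
probe page back, and page_sz/socket are the candidate page size and
socket id):

struct rte_memseg *ms = eal_memalloc_alloc_page(page_sz, socket);
if (ms == NULL)
	return 0;	/* nothing could be allocated on this socket */
eal_memalloc_free_page(ms);
return 1;		/* memory is available here */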

Signed-off-by: Anatoly Burakov <***@intel.com>
---
test/test/test_malloc.c | 52 +++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 44 insertions(+), 8 deletions(-)

diff --git a/test/test/test_malloc.c b/test/test/test_malloc.c
index 8484fb6..2aaf1b8 100644
--- a/test/test/test_malloc.c
+++ b/test/test/test_malloc.c
@@ -22,6 +22,8 @@
#include <rte_random.h>
#include <rte_string_fns.h>

+#include "../../lib/librte_eal/common/eal_memalloc.h"
+
#include "test.h"

#define N 10000
@@ -708,22 +710,56 @@ test_malloc_bad_params(void)

/* Check if memory is avilable on a specific socket */
static int
-is_mem_on_socket(int32_t socket)
+is_mem_on_socket(unsigned int socket)
{
+ struct rte_malloc_socket_stats stats;
const struct rte_mem_config *mcfg =
rte_eal_get_configuration()->mem_config;
- unsigned i;
+ uint64_t prev_pgsz;
+ unsigned int i;
+
+ /* we cannot know if there's memory on a specific socket, since it might
+ * be available, but not yet allocated. so, in addition to checking
+ * already mapped memory, we will attempt to allocate a page from that
+ * socket and see if it works.
+ */
+ if (socket >= rte_num_sockets())
+ return 0;

+ rte_malloc_get_socket_stats(socket, &stats);
+
+ /* if heap has memory allocated, stop */
+ if (stats.heap_totalsz_bytes > 0)
+ return 1;
+
+ /* to allocate a page, we will have to know its size, so go through all
+ * supported page sizes and try with each one.
+ */
+ prev_pgsz = 0;
for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
- const struct rte_memseg_list *msl =
- &mcfg->memsegs[i];
- const struct rte_fbarray *arr = &msl->memseg_arr;
+ const struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ uint64_t page_sz;

- if (msl->socket_id != socket)
+ /* skip unused memseg lists */
+ if (msl->memseg_arr.len == 0)
continue;
+ page_sz = msl->hugepage_sz;

- if (arr->count)
- return 1;
+ /* skip page sizes we've tried already */
+ if (prev_pgsz == page_sz)
+ continue;
+
+ prev_pgsz = page_sz;
+
+ struct rte_memseg *ms = eal_memalloc_alloc_page(page_sz,
+ socket);
+
+ if (ms == NULL)
+ continue;
+
+ eal_memalloc_free_page(ms);
+
+ return 1;
}
return 0;
}
--
2.7.4
Olivier Matz
2018-03-19 17:49:27 UTC
Permalink
Post by Anatoly Burakov
The test expected memory to already be allocated on all sockets, and
thus failed because calling rte_malloc could trigger a memory hotplug
event and allocate memory where there was none before.
Fix it to instead report availability of memory on specific sockets by
attempting to allocate a page and seeing if that succeeds. Technically,
this can still fail if memory is unavailable at the time of the check
but becomes available by the time the test runs, but that is a corner
case not worth considering.
---
test/test/test_malloc.c | 52 +++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 44 insertions(+), 8 deletions(-)
diff --git a/test/test/test_malloc.c b/test/test/test_malloc.c
index 8484fb6..2aaf1b8 100644
--- a/test/test/test_malloc.c
+++ b/test/test/test_malloc.c
@@ -22,6 +22,8 @@
#include <rte_random.h>
#include <rte_string_fns.h>
+#include "../../lib/librte_eal/common/eal_memalloc.h"
+
I guess there is no way to test without importing a private EAL
function, correct? If yes, maybe it deserves a quick explanation.
Anatoly Burakov
2018-03-03 13:46:04 UTC
Permalink
Add a new (non-legacy) memory init path for EAL. It uses the
new memory hotplug facilities, although it's only being run
at startup.

If no -m or --socket-mem switches are specified, the new init will
not allocate anything, whereas if those switches are passed, the
appropriate number of pages is requested, just like for legacy init.
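
For example, passing something like "--socket-mem=1024,1024" (purely an
illustrative value) pre-allocates 1 GB worth of hugepages on each of the
first two sockets during init, whereas omitting both switches means
nothing is pre-allocated (and, as noted below, will not work until
dynamic allocation support lands in later patches).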

Since rte_malloc support for dynamic allocation comes in later
patches, running DPDK without --socket-mem or -m switches will
fail in this patch.

Also, allocated pages will be physically discontiguous (or rather,
they're not guaranteed to be physically contiguous - they may still
be, by accident) unless IOVA_AS_VA mode is used.

Since the memory hotplug subsystem relies on partial file locking,
replace flock() locks with fcntl() locks.

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
This commit shows "the world as it could have been". All of the other
monstrous amount of code in eal_memory.c is there because of the
legacy init option. Do we *really* want to keep it around, and make
DPDK init and the memory system suffer from a split personality?

lib/librte_eal/linuxapp/eal/eal_hugepage_info.c | 25 ++++++++-
lib/librte_eal/linuxapp/eal/eal_memory.c | 74 +++++++++++++++++++++++--
2 files changed, 92 insertions(+), 7 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 706b6d5..7e2475f 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -15,6 +15,7 @@
#include <unistd.h>
#include <errno.h>
#include <sys/queue.h>
+#include <sys/stat.h>

#include <rte_memory.h>
#include <rte_eal.h>
@@ -200,6 +201,18 @@ get_hugepage_dir(uint64_t hugepage_sz)
}

/*
+ * uses fstat to report the size of a file on disk
+ */
+static off_t
+getFileSize(int fd)
+{
+ struct stat st;
+ if (fstat(fd, &st) < 0)
+ return 0;
+ return st.st_size;
+}
+
+/*
* Clear the hugepage directory of whatever hugepage files
* there are. Checks if the file is locked (i.e.
* if it's in use by another DPDK process).
@@ -229,6 +242,8 @@ clear_hugedir(const char * hugedir)
}

while(dirent != NULL){
+ struct flock lck = {0};
+
/* skip files that don't match the hugepage pattern */
if (fnmatch(filter, dirent->d_name, 0) > 0) {
dirent = readdir(dir);
@@ -245,11 +260,17 @@ clear_hugedir(const char * hugedir)
}

/* non-blocking lock */
- lck_result = flock(fd, LOCK_EX | LOCK_NB);
+ lck.l_type = F_RDLCK;
+ lck.l_whence = SEEK_SET;
+ lck.l_start = 0;
+ lck.l_len = getFileSize(fd);
+
+ lck_result = fcntl(fd, F_SETLK, &lck);

/* if lock succeeds, unlock and remove the file */
if (lck_result != -1) {
- flock(fd, LOCK_UN);
+ lck.l_type = F_UNLCK;
+ fcntl(fd, F_SETLK, &lck);
unlinkat(dir_fd, dirent->d_name, 0);
}
close (fd);
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 9512da9..e0b4988 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -40,6 +40,7 @@
#include <rte_string_fns.h>

#include "eal_private.h"
+#include "eal_memalloc.h"
#include "eal_internal_cfg.h"
#include "eal_filesystem.h"
#include "eal_hugepages.h"
@@ -260,6 +261,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
void *virtaddr;
void *vma_addr = NULL;
size_t vma_len = 0;
+ struct flock lck = {0};
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
int node_id = -1;
int essential_prev = 0;
@@ -434,8 +436,12 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
}


- /* set shared flock on the file. */
- if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
+ /* set shared lock on the file. */
+ lck.l_type = F_RDLCK;
+ lck.l_whence = SEEK_SET;
+ lck.l_start = 0;
+ lck.l_len = hugepage_sz;
+ if (fcntl(fd, F_SETLK, &lck) == -1) {
RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
__func__, strerror(errno));
close(fd);
@@ -1300,6 +1306,62 @@ eal_legacy_hugepage_init(void)
return -1;
}

+static int
+eal_hugepage_init(void)
+{
+ struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
+ uint64_t memory[RTE_MAX_NUMA_NODES];
+ int hp_sz_idx, socket_id;
+
+ test_phys_addrs_available();
+
+ memset(used_hp, 0, sizeof(used_hp));
+
+ for (hp_sz_idx = 0;
+ hp_sz_idx < (int) internal_config.num_hugepage_sizes;
+ hp_sz_idx++) {
+ /* also initialize used_hp hugepage sizes in used_hp */
+ struct hugepage_info *hpi;
+ hpi = &internal_config.hugepage_info[hp_sz_idx];
+ used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz;
+ }
+
+ /* make a copy of socket_mem, needed for balanced allocation. */
+ for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++)
+ memory[hp_sz_idx] = internal_config.socket_mem[hp_sz_idx];
+
+ /* calculate final number of pages */
+ if (calc_num_pages_per_socket(memory,
+ internal_config.hugepage_info, used_hp,
+ internal_config.num_hugepage_sizes) < 0)
+ return -1;
+
+ for (hp_sz_idx = 0;
+ hp_sz_idx < (int) internal_config.num_hugepage_sizes;
+ hp_sz_idx++) {
+ for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES;
+ socket_id++) {
+ struct hugepage_info *hpi = &used_hp[hp_sz_idx];
+ unsigned int num_pages = hpi->num_pages[socket_id];
+ int num_pages_alloc;
+
+ if (num_pages == 0)
+ continue;
+
+ RTE_LOG(DEBUG, EAL, "Allocating %u pages of size %luM on socket %i\n",
+ num_pages, hpi->hugepage_sz >> 20, socket_id);
+
+ num_pages_alloc = eal_memalloc_alloc_page_bulk(NULL,
+ num_pages,
+ hpi->hugepage_sz, socket_id,
+ true);
+ if (num_pages_alloc < 0)
+ return -1;
+ }
+ }
+ return 0;
+}
+
/*
* uses fstat to report the size of a file on disk
*/
@@ -1510,9 +1572,9 @@ eal_legacy_hugepage_attach(void)
int
rte_eal_hugepage_init(void)
{
- if (internal_config.legacy_mem)
- return eal_legacy_hugepage_init();
- return -1;
+ return internal_config.legacy_mem ?
+ eal_legacy_hugepage_init() :
+ eal_hugepage_init();
}

int
@@ -1520,6 +1582,8 @@ rte_eal_hugepage_attach(void)
{
if (internal_config.legacy_mem)
return eal_legacy_hugepage_attach();
+ else
+ RTE_LOG(ERR, EAL, "Secondary processes aren't supported yet\n");
return -1;
}
--
2.7.4
Anatoly Burakov
2018-03-03 13:46:11 UTC
Permalink
If a user has specified that the zone should have contiguous memory,
use the new _contig allocation APIs instead of the normal ones.
Otherwise, account for the fact that, unless we're in IOVA_AS_VA
mode, we cannot guarantee that the pages will be physically
contiguous, so we calculate the memzone size and alignment as if
we were getting the smallest page size available.
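
The page size decision then amounts to roughly this (condensed from the
logic added to rte_mempool_populate_default() below; no_contig and
force_contig come from the mempool flags):

if (no_contig || force_contig || rte_eal_iova_mode() == RTE_IOVA_VA) {
	pg_sz = 0;			/* page boundaries don't matter */
	align = RTE_CACHE_LINE_SIZE;
} else if (rte_eal_has_hugepages()) {
	pg_sz = get_min_page_size();	/* be pessimistic: smallest page */
	align = pg_sz;
} else {
	pg_sz = getpagesize();		/* no hugepages at all */
	align = pg_sz;
}
pg_shift = pg_sz == 0 ? 0 : rte_bsf32(pg_sz);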

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_mempool/rte_mempool.c | 87 +++++++++++++++++++++++++++++++++++-----
1 file changed, 78 insertions(+), 9 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 54f7f4b..5c4d3fd 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -98,6 +98,27 @@ static unsigned optimize_object_size(unsigned obj_size)
return new_obj_size * RTE_MEMPOOL_ALIGN;
}

+static size_t
+get_min_page_size(void)
+{
+ const struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ int i;
+ size_t min_pagesz = SIZE_MAX;
+
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ const struct rte_memseg_list *msl = &mcfg->memsegs[i];
+
+ if (msl->base_va == NULL)
+ continue;
+
+ if (msl->hugepage_sz < min_pagesz)
+ min_pagesz = msl->hugepage_sz;
+ }
+
+ return min_pagesz == SIZE_MAX ? (size_t) getpagesize() : min_pagesz;
+}
+
static void
mempool_add_elem(struct rte_mempool *mp, void *obj, rte_iova_t iova)
{
@@ -549,6 +570,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
unsigned mz_id, n;
unsigned int mp_flags;
int ret;
+ bool force_contig, no_contig;

/* mempool must not be populated */
if (mp->nb_mem_chunks != 0)
@@ -563,10 +585,46 @@ rte_mempool_populate_default(struct rte_mempool *mp)
/* update mempool capabilities */
mp->flags |= mp_flags;

- if (rte_eal_has_hugepages()) {
- pg_shift = 0; /* not needed, zone is physically contiguous */
+ no_contig = mp->flags & MEMPOOL_F_NO_PHYS_CONTIG;
+ force_contig = mp->flags & MEMPOOL_F_CAPA_PHYS_CONTIG;
+
+ /*
+ * there are several considerations for page size and page shift here.
+ *
+ * if we don't need our mempools to have physically contiguous objects,
+ * then just set page shift and page size to 0, because the user has
+ * indicated that there's no need to care about anything.
+ *
+ * if we do need contiguous objects, there is also an option to reserve
+ * the entire mempool memory as one contiguous block of memory, in
+ * which case the page shift and alignment wouldn't matter as well.
+ *
+ * if we require contiguous objects, but not necessarily the entire
+ * mempool reserved space to be contiguous, then there are two options.
+ *
+ * if our IO addresses are virtual, not actual physical (IOVA as VA
+ * case), then no page shift needed - our memory allocation will give us
+ * contiguous physical memory as far as the hardware is concerned, so
+ * act as if we're getting contiguous memory.
+ *
+ * if our IO addresses are physical, we may get memory from bigger
+ * pages, or we might get memory from smaller pages, and how much of it
+ * we require depends on whether we want bigger or smaller pages.
+ * However, requesting each and every memory size is too much work, so
+ * what we'll do instead is walk through the page sizes available, pick
+ * the smallest one and set up page shift to match that one. We will be
+ * wasting some space this way, but it's much nicer than looping around
+ * trying to reserve each and every page size.
+ */
+
+ if (no_contig || force_contig || rte_eal_iova_mode() == RTE_IOVA_VA) {
pg_sz = 0;
+ pg_shift = 0;
align = RTE_CACHE_LINE_SIZE;
+ } else if (rte_eal_has_hugepages()) {
+ pg_sz = get_min_page_size();
+ pg_shift = rte_bsf32(pg_sz);
+ align = pg_sz;
} else {
pg_sz = getpagesize();
pg_shift = rte_bsf32(pg_sz);
@@ -585,23 +643,34 @@ rte_mempool_populate_default(struct rte_mempool *mp)
goto fail;
}

- mz = rte_memzone_reserve_aligned(mz_name, size,
- mp->socket_id, mz_flags, align);
- /* not enough memory, retry with the biggest zone we have */
- if (mz == NULL)
- mz = rte_memzone_reserve_aligned(mz_name, 0,
+ if (force_contig) {
+ /*
+ * if contiguous memory for entire mempool memory was
+ * requested, don't try reserving again if we fail.
+ */
+ mz = rte_memzone_reserve_aligned_contig(mz_name, size,
+ mp->socket_id, mz_flags, align);
+ } else {
+ mz = rte_memzone_reserve_aligned(mz_name, size,
mp->socket_id, mz_flags, align);
+ /* not enough memory, retry with the biggest zone we
+ * have
+ */
+ if (mz == NULL)
+ mz = rte_memzone_reserve_aligned(mz_name, 0,
+ mp->socket_id, mz_flags, align);
+ }
if (mz == NULL) {
ret = -rte_errno;
goto fail;
}

- if (mp->flags & MEMPOOL_F_NO_PHYS_CONTIG)
+ if (no_contig)
iova = RTE_BAD_IOVA;
else
iova = mz->iova;

- if (rte_eal_has_hugepages())
+ if (rte_eal_has_hugepages() && force_contig)
ret = rte_mempool_populate_iova(mp, mz->addr,
iova, mz->len,
rte_mempool_memchunk_mz_free,
--
2.7.4
Anatoly Burakov
2018-03-03 13:46:10 UTC
Permalink
It's there, so we might as well use it. Some operations will be
sped up as a result.

Since we have to allocate an fbarray for memzones, we have to do it
before we initialize the memory subsystem, because memory init in
secondary processes will (later) allocate more fbarrays than the
primary process did, which would make it impossible to attach to the
memzone fbarray if we did it after the fact.
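
The lookup/iteration pattern this enables looks roughly like the
following (condensed from memzone_lookup_thread_unsafe() in the diff
below; mcfg and name are the usual mem config pointer and zone name):

struct rte_fbarray *arr = &mcfg->memzones;
int i = 0;

/* walk only the slots that are actually in use */
while ((i = rte_fbarray_find_next_used(arr, i)) >= 0) {
	struct rte_memzone *mz = rte_fbarray_get(arr, i++);
	if (strncmp(name, mz->name, RTE_MEMZONE_NAMESIZE) == 0)
		return mz;
}
return NULL;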

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
The code for the ENA driver makes little sense to me, but I've
attempted to keep the same semantics as the old code.

drivers/net/ena/ena_ethdev.c | 10 +-
lib/librte_eal/bsdapp/eal/eal.c | 6 +
lib/librte_eal/common/eal_common_memzone.c | 180 +++++++++++++++-------
lib/librte_eal/common/include/rte_eal_memconfig.h | 4 +-
lib/librte_eal/common/malloc_heap.c | 4 +
lib/librte_eal/linuxapp/eal/eal.c | 13 +-
test/test/test_memzone.c | 9 +-
7 files changed, 157 insertions(+), 69 deletions(-)

diff --git a/drivers/net/ena/ena_ethdev.c b/drivers/net/ena/ena_ethdev.c
index 34b2a8d..f7bfc7a 100644
--- a/drivers/net/ena/ena_ethdev.c
+++ b/drivers/net/ena/ena_ethdev.c
@@ -264,11 +264,15 @@ static const struct eth_dev_ops ena_dev_ops = {
static inline int ena_cpu_to_node(int cpu)
{
struct rte_config *config = rte_eal_get_configuration();
+ struct rte_fbarray *arr = &config->mem_config->memzones;
+ const struct rte_memzone *mz;

- if (likely(cpu < RTE_MAX_MEMZONE))
- return config->mem_config->memzone[cpu].socket_id;
+ if (unlikely(cpu >= RTE_MAX_MEMZONE))
+ return NUMA_NO_NODE;

- return NUMA_NO_NODE;
+ mz = rte_fbarray_get(arr, cpu);
+
+ return mz->socket_id;
}

static inline void ena_rx_mbuf_prepare(struct rte_mbuf *mbuf,
diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 45e5670..3b06e21 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -608,6 +608,12 @@ rte_eal_init(int argc, char **argv)
return -1;
}

+ if (rte_eal_malloc_heap_init() < 0) {
+ rte_eal_init_alert("Cannot init malloc heap\n");
+ rte_errno = ENODEV;
+ return -1;
+ }
+
if (rte_eal_tailqs_init() < 0) {
rte_eal_init_alert("Cannot init tail queues for objects\n");
rte_errno = EFAULT;
diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index 8c9aa28..a7cfdaf 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -28,42 +28,29 @@
static inline const struct rte_memzone *
memzone_lookup_thread_unsafe(const char *name)
{
- const struct rte_mem_config *mcfg;
+ struct rte_mem_config *mcfg;
+ struct rte_fbarray *arr;
const struct rte_memzone *mz;
- unsigned i = 0;
+ int i = 0;

/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
+ arr = &mcfg->memzones;

/*
* the algorithm is not optimal (linear), but there are few
* zones and this function should be called at init only
*/
- for (i = 0; i < RTE_MAX_MEMZONE; i++) {
- mz = &mcfg->memzone[i];
- if (mz->addr != NULL && !strncmp(name, mz->name, RTE_MEMZONE_NAMESIZE))
- return &mcfg->memzone[i];
+ while ((i = rte_fbarray_find_next_used(arr, i)) >= 0) {
+ mz = rte_fbarray_get(arr, i++);
+ if (mz->addr != NULL &&
+ !strncmp(name, mz->name, RTE_MEMZONE_NAMESIZE))
+ return mz;
}

return NULL;
}

-static inline struct rte_memzone *
-get_next_free_memzone(void)
-{
- struct rte_mem_config *mcfg;
- unsigned i = 0;
-
- /* get pointer to global configuration */
- mcfg = rte_eal_get_configuration()->mem_config;
-
- for (i = 0; i < RTE_MAX_MEMZONE; i++) {
- if (mcfg->memzone[i].addr == NULL)
- return &mcfg->memzone[i];
- }
-
- return NULL;
-}

/* This function will return the greatest free block if a heap has been
* specified. If no heap has been specified, it will return the heap and
@@ -103,13 +90,16 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
{
struct rte_memzone *mz;
struct rte_mem_config *mcfg;
+ struct rte_fbarray *arr;
size_t requested_len;
+ int idx;

/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
+ arr = &mcfg->memzones;

/* no more room in config */
- if (mcfg->memzone_cnt >= RTE_MAX_MEMZONE) {
+ if (arr->count >= arr->len) {
RTE_LOG(ERR, EAL, "%s(): No more room in config\n", __func__);
rte_errno = ENOSPC;
return NULL;
@@ -199,7 +189,14 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
struct malloc_elem *elem = malloc_elem_from_data(mz_addr);

/* fill the zone in config */
- mz = get_next_free_memzone();
+ idx = rte_fbarray_find_next_free(arr, 0);
+
+ if (idx < 0) {
+ mz = NULL;
+ } else {
+ rte_fbarray_set_used(arr, idx);
+ mz = rte_fbarray_get(arr, idx);
+ }

if (mz == NULL) {
RTE_LOG(ERR, EAL, "%s(): Cannot find free memzone but there is room "
@@ -209,7 +206,6 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
return NULL;
}

- mcfg->memzone_cnt++;
snprintf(mz->name, sizeof(mz->name), "%s", name);
mz->iova = rte_malloc_virt2iova(mz_addr);
mz->addr = mz_addr;
@@ -322,6 +318,8 @@ int
rte_memzone_free(const struct rte_memzone *mz)
{
struct rte_mem_config *mcfg;
+ struct rte_fbarray *arr;
+ struct rte_memzone *found_mz;
int ret = 0;
void *addr;
unsigned idx;
@@ -330,21 +328,26 @@ rte_memzone_free(const struct rte_memzone *mz)
return -EINVAL;

mcfg = rte_eal_get_configuration()->mem_config;
+ arr = &mcfg->memzones;

rte_rwlock_write_lock(&mcfg->mlock);

- idx = ((uintptr_t)mz - (uintptr_t)mcfg->memzone);
- idx = idx / sizeof(struct rte_memzone);
+ idx = rte_fbarray_find_idx(arr, mz);
+ found_mz = rte_fbarray_get(arr, idx);

- addr = mcfg->memzone[idx].addr;
- if (addr == NULL)
+ if (found_mz == NULL) {
ret = -EINVAL;
- else if (mcfg->memzone_cnt == 0) {
- rte_panic("%s(): memzone address not NULL but memzone_cnt is 0!\n",
- __func__);
} else {
- memset(&mcfg->memzone[idx], 0, sizeof(mcfg->memzone[idx]));
- mcfg->memzone_cnt--;
+ addr = found_mz->addr;
+ if (addr == NULL)
+ ret = -EINVAL;
+ else if (arr->count == 0) {
+ rte_panic("%s(): memzone address not NULL but memzone_cnt is 0!\n",
+ __func__);
+ } else {
+ memset(found_mz, 0, sizeof(*found_mz));
+ rte_fbarray_set_free(arr, idx);
+ }
}

rte_rwlock_write_unlock(&mcfg->mlock);
@@ -378,25 +381,79 @@ rte_memzone_lookup(const char *name)
void
rte_memzone_dump(FILE *f)
{
+ struct rte_fbarray *arr;
struct rte_mem_config *mcfg;
- unsigned i = 0;
+ int i = 0;

/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
+ arr = &mcfg->memzones;

rte_rwlock_read_lock(&mcfg->mlock);
/* dump all zones */
- for (i=0; i<RTE_MAX_MEMZONE; i++) {
- if (mcfg->memzone[i].addr == NULL)
- break;
- fprintf(f, "Zone %u: name:<%s>, IO:0x%"PRIx64", len:0x%zx"
+ while ((i = rte_fbarray_find_next_used(arr, i)) >= 0) {
+ void *cur_addr, *mz_end;
+ struct rte_memzone *mz;
+ struct rte_memseg_list *msl = NULL;
+ struct rte_memseg *ms;
+ int ms_idx;
+
+ mz = rte_fbarray_get(arr, i);
+
+ /*
+ * memzones can span multiple physical pages, so dump addresses
+ * of all physical pages this memzone spans.
+ */
+
+ fprintf(f, "Zone %u: name:<%s>, len:0x%zx"
", virt:%p, socket_id:%"PRId32", flags:%"PRIx32"\n", i,
- mcfg->memzone[i].name,
- mcfg->memzone[i].iova,
- mcfg->memzone[i].len,
- mcfg->memzone[i].addr,
- mcfg->memzone[i].socket_id,
- mcfg->memzone[i].flags);
+ mz->name,
+ mz->len,
+ mz->addr,
+ mz->socket_id,
+ mz->flags);
+
+ msl = rte_mem_virt2memseg_list(mz->addr);
+ if (!msl) {
+ RTE_LOG(DEBUG, EAL, "Skipping bad memzone\n");
+ continue;
+ }
+
+ cur_addr = RTE_PTR_ALIGN_FLOOR(mz->addr, mz->hugepage_sz);
+ mz_end = RTE_PTR_ADD(cur_addr, mz->len);
+
+ fprintf(f, "physical segments used:\n");
+ if (msl->base_va == NULL) {
+ /* if memseg list base VA, we're in legacy mem mode,
+ * which means we have only one memseg.
+ */
+ ms = rte_mem_virt2memseg(mz->addr, msl);
+
+ fprintf(f, " addr: %p iova: 0x%" PRIx64 " "
+ "len: 0x%" PRIx64 " "
+ "pagesz: 0x%" PRIx64 "\n",
+ cur_addr, ms->iova, ms->len, ms->hugepage_sz);
+ } else {
+ ms_idx = RTE_PTR_DIFF(mz->addr, msl->base_va) /
+ msl->hugepage_sz;
+ ms = rte_fbarray_get(&msl->memseg_arr, ms_idx);
+
+ do {
+ fprintf(f, " addr: %p iova: 0x%" PRIx64 " "
+ "len: 0x%" PRIx64 " "
+ "pagesz: 0x%" PRIx64 "\n",
+ cur_addr, ms->iova, ms->len,
+ ms->hugepage_sz);
+
+ /* advance VA to next page */
+ cur_addr = RTE_PTR_ADD(cur_addr,
+ ms->hugepage_sz);
+
+ /* memzones occupy contiguous segments */
+ ++ms;
+ } while (cur_addr < mz_end);
+ }
+ i++;
}
rte_rwlock_read_unlock(&mcfg->mlock);
}
@@ -412,19 +469,23 @@ rte_eal_memzone_init(void)
/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;

- /* secondary processes don't need to initialise anything */
- if (rte_eal_process_type() == RTE_PROC_SECONDARY)
- return 0;
-
rte_rwlock_write_lock(&mcfg->mlock);

- /* delete all zones */
- mcfg->memzone_cnt = 0;
- memset(mcfg->memzone, 0, sizeof(mcfg->memzone));
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
+ rte_fbarray_init(&mcfg->memzones, "memzone",
+ RTE_MAX_MEMZONE, sizeof(struct rte_memzone))) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memzone list\n");
+ return -1;
+ } else if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
+ rte_fbarray_attach(&mcfg->memzones)) {
+ RTE_LOG(ERR, EAL, "Cannot attach to memzone list\n");
+ rte_rwlock_write_unlock(&mcfg->mlock);
+ return -1;
+ }

rte_rwlock_write_unlock(&mcfg->mlock);

- return rte_eal_malloc_heap_init();
+ return 0;
}

/* Walk all reserved memory zones */
@@ -432,14 +493,19 @@ void rte_memzone_walk(void (*func)(const struct rte_memzone *, void *),
void *arg)
{
struct rte_mem_config *mcfg;
- unsigned i;
+ struct rte_fbarray *arr;
+ int i;

mcfg = rte_eal_get_configuration()->mem_config;
+ arr = &mcfg->memzones;
+
+ i = 0;

rte_rwlock_read_lock(&mcfg->mlock);
- for (i=0; i<RTE_MAX_MEMZONE; i++) {
- if (mcfg->memzone[i].addr != NULL)
- (*func)(&mcfg->memzone[i], arg);
+ while ((i = rte_fbarray_find_next_used(arr, i)) >= 0) {
+ struct rte_memzone *mz = rte_fbarray_get(arr, i);
+ (*func)(mz, arg);
+ i++;
}
rte_rwlock_read_unlock(&mcfg->mlock);
}
diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
index 31fc8e7..b6bdb21 100644
--- a/lib/librte_eal/common/include/rte_eal_memconfig.h
+++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
@@ -58,10 +58,8 @@ struct rte_mem_config {
rte_rwlock_t qlock; /**< used for tailq operation for thread safe. */
rte_rwlock_t mplock; /**< only used by mempool LIB for thread-safe. */

- uint32_t memzone_cnt; /**< Number of allocated memzones */
-
/* memory segments and zones */
- struct rte_memzone memzone[RTE_MAX_MEMZONE]; /**< Memzone descriptors. */
+ struct rte_fbarray memzones; /**< Memzone descriptors. */

struct rte_memseg_list memsegs[RTE_MAX_MEMSEG_LISTS];
/**< list of dynamic arrays holding memsegs */
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 984e027..7a3d0f3 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -579,6 +579,10 @@ rte_eal_malloc_heap_init(void)
if (mcfg == NULL)
return -1;

+ /* secondary processes don't need to initialize heap */
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY)
+ return 0;
+
for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
int start;
struct rte_fbarray *arr;
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 7851a7d..d336c96 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -857,6 +857,15 @@ rte_eal_init(int argc, char **argv)
return -1;
}
#endif
+ /* memzone_init maps rte_fbarrays, which has to be done before hugepage
+ * init/attach, because attach creates extra fbarrays in secondary
+ * process, resulting in inability to map memzone fbarray.
+ */
+ if (rte_eal_memzone_init() < 0) {
+ rte_eal_init_alert("Cannot init memzone\n");
+ rte_errno = ENODEV;
+ return -1;
+ }

if (rte_eal_memory_init() < 0) {
rte_eal_init_alert("Cannot init memory\n");
@@ -867,8 +876,8 @@ rte_eal_init(int argc, char **argv)
/* the directories are locked during eal_hugepage_info_init */
eal_hugedirs_unlock();

- if (rte_eal_memzone_init() < 0) {
- rte_eal_init_alert("Cannot init memzone\n");
+ if (rte_eal_malloc_heap_init() < 0) {
+ rte_eal_init_alert("Cannot init malloc heap\n");
rte_errno = ENODEV;
return -1;
}
diff --git a/test/test/test_memzone.c b/test/test/test_memzone.c
index 47f4de8..4b49d61 100644
--- a/test/test/test_memzone.c
+++ b/test/test/test_memzone.c
@@ -893,7 +893,7 @@ test_memzone_basic(void)
const struct rte_memzone *mz;
int memzone_cnt_after, memzone_cnt_expected;
int memzone_cnt_before =
- rte_eal_get_configuration()->mem_config->memzone_cnt;
+ rte_eal_get_configuration()->mem_config->memzones.count;

memzone1 = rte_memzone_reserve(TEST_MEMZONE_NAME("testzone1"), 100,
SOCKET_ID_ANY, 0);
@@ -917,7 +917,7 @@ test_memzone_basic(void)
(memzone3 != NULL) + (memzone4 != NULL);

memzone_cnt_after =
- rte_eal_get_configuration()->mem_config->memzone_cnt;
+ rte_eal_get_configuration()->mem_config->memzones.count;

if (memzone_cnt_after != memzone_cnt_expected)
return -1;
@@ -996,7 +996,7 @@ test_memzone_basic(void)
}

memzone_cnt_after =
- rte_eal_get_configuration()->mem_config->memzone_cnt;
+ rte_eal_get_configuration()->mem_config->memzones.count;
if (memzone_cnt_after != memzone_cnt_before)
return -1;

@@ -1017,7 +1017,8 @@ static int
test_memzone(void)
{
/* take note of how many memzones were allocated before running */
- int memzone_cnt = rte_eal_get_configuration()->mem_config->memzone_cnt;
+ int memzone_cnt =
+ rte_eal_get_configuration()->mem_config->memzones.count;

printf("test basic memzone API\n");
if (test_memzone_basic() < 0)
--
2.7.4
Anatoly Burakov
2018-03-03 13:46:28 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
Not sure if virtio needs to allocate DMA-capable memory,
being a software driver and all. Corrections welcome.

drivers/net/virtio/virtio_ethdev.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
index 884f74a..35812e4 100644
--- a/drivers/net/virtio/virtio_ethdev.c
+++ b/drivers/net/virtio/virtio_ethdev.c
@@ -391,7 +391,7 @@ virtio_init_queue(struct rte_eth_dev *dev, uint16_t vtpci_queue_idx)
PMD_INIT_LOG(DEBUG, "vring_size: %d, rounded_vring_size: %d",
size, vq->vq_ring_size);

- mz = rte_memzone_reserve_aligned(vq_name, vq->vq_ring_size,
+ mz = rte_memzone_reserve_aligned_contig(vq_name, vq->vq_ring_size,
SOCKET_ID_ANY,
0, VIRTIO_PCI_VRING_ALIGN);
if (mz == NULL) {
@@ -417,9 +417,9 @@ virtio_init_queue(struct rte_eth_dev *dev, uint16_t vtpci_queue_idx)
if (sz_hdr_mz) {
snprintf(vq_hdr_name, sizeof(vq_hdr_name), "port%d_vq%d_hdr",
dev->data->port_id, vtpci_queue_idx);
- hdr_mz = rte_memzone_reserve_aligned(vq_hdr_name, sz_hdr_mz,
- SOCKET_ID_ANY, 0,
- RTE_CACHE_LINE_SIZE);
+ hdr_mz = rte_memzone_reserve_aligned_contig(vq_hdr_name,
+ sz_hdr_mz, SOCKET_ID_ANY, 0,
+ RTE_CACHE_LINE_SIZE);
if (hdr_mz == NULL) {
if (rte_errno == EEXIST)
hdr_mz = rte_memzone_lookup(vq_hdr_name);
--
2.7.4
Venkatesh Srinivas
2018-03-03 16:52:09 UTC
Permalink
On Sat, Mar 3, 2018 at 7:46 AM, Anatoly Burakov
Post by Anatoly Burakov
---
Not sure if virtio needs to allocate DMA-capable memory,
being a software driver and all. Corrections welcome.
Reviewed-by: Venkatesh Srinivas <***@google.com>

1. The first change is correct - virtio rings need to be contiguous in
guest physical address space.

2. The second change - virtio_tx_region contains both a virtio_net_hdr
and an indirect table. virtio devices require virtio_net_hdr to be
contiguous (in pre-1.0 devices w/o F_ANY_LAYOUT), but do not require the
indirect table to be contiguous with virtio_net_hdr. You may still want
this change to avoid splitting up the structure, though (see the sketch
below).
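
For reference, a simplified sketch of the structure being discussed (the
field and macro names are recalled from the virtio PMD's virtqueue.h and
should be treated as approximate):

struct virtio_tx_region {
	/* header; must be contiguous for pre-1.0 devices w/o F_ANY_LAYOUT */
	struct virtio_net_hdr_mrg_rxbuf tx_hdr;
	/* indirect descriptor table; need not be contiguous with tx_hdr,
	 * but keeping both in one allocation avoids splitting the structure
	 */
	struct vring_desc tx_indir[VIRTIO_MAX_TX_INDIRECT]
			__attribute__((__aligned__(16)));
};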

HTH,
-- vs;
Anatoly Burakov
2018-03-03 13:46:05 UTC
Permalink
This set of changes enables rte_malloc to allocate and free memory
as needed. The way it works is: first, malloc checks whether there is
enough memory already allocated to satisfy the user's request. If there
isn't, we try to allocate more memory. The reverse happens with
free - we free an element, check its size (including free-element
merging due to adjacency) and see if it's bigger than the hugepage
size and whether its start and end span a hugepage or more. Then we
remove the area from the malloc heap (adjusting element lengths where
appropriate) and deallocate the page.

For legacy mode, runtime alloc/free of pages is disabled.

It is worth noting that memseg lists are sorted by page size,
and that we try our best to satisfy the user's request. That is, if
the user requests an element from 2MB page memory, we will check
whether we can satisfy that request from existing memory; if not, we
try to allocate more 2MB pages. If that fails and the user also
specified a "size is hint" flag, we then check other page sizes and try
to allocate from there. If that fails too, then, depending on flags,
we may try allocating from other sockets. In other words, we try
our best to give the user what they asked for, but going to other
sockets is a last resort - first we try to allocate more memory on
the same socket.
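
As an illustration of the behaviour described above, a minimal caller
sketch (the flags and functions are the existing public memzone/malloc
API; the size, alignment and socket values are arbitrary examples and
error handling is omitted):

#include <rte_memzone.h>
#include <rte_malloc.h>

/* Ask for 4MB backed by 2MB pages on socket 0, but allow falling back to
 * other page sizes if no more 2MB pages can be allocated. With this
 * patchset, the heap is expanded on demand when the request cannot be
 * satisfied from already-allocated memory.
 */
static const struct rte_memzone *
example_reserve(void)
{
	return rte_memzone_reserve("example_mz", 4 * 1024 * 1024, 0,
			RTE_MEMZONE_2MB | RTE_MEMZONE_SIZE_HINT_ONLY);
}

/* Plain rte_malloc_socket() calls now also trigger heap expansion when
 * the existing heap cannot satisfy the request.
 */
static void *
example_malloc(void)
{
	return rte_malloc_socket("example", 4096, 64, 0);
}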

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/eal_common_memzone.c | 23 +-
lib/librte_eal/common/malloc_elem.c | 85 ++++++++
lib/librte_eal/common/malloc_elem.h | 3 +
lib/librte_eal/common/malloc_heap.c | 332 ++++++++++++++++++++++++++++-
lib/librte_eal/common/malloc_heap.h | 4 +-
lib/librte_eal/common/rte_malloc.c | 31 +--
6 files changed, 416 insertions(+), 62 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index ed36174..718dee8 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -103,7 +103,6 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
struct rte_memzone *mz;
struct rte_mem_config *mcfg;
size_t requested_len;
- int socket, i;

/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
@@ -181,27 +180,9 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
}
}

- if (socket_id == SOCKET_ID_ANY)
- socket = malloc_get_numa_socket();
- else
- socket = socket_id;
-
/* allocate memory on heap */
- void *mz_addr = malloc_heap_alloc(&mcfg->malloc_heaps[socket], NULL,
- requested_len, flags, align, bound);
-
- if ((mz_addr == NULL) && (socket_id == SOCKET_ID_ANY)) {
- /* try other heaps */
- for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
- if (socket == i)
- continue;
-
- mz_addr = malloc_heap_alloc(&mcfg->malloc_heaps[i],
- NULL, requested_len, flags, align, bound);
- if (mz_addr != NULL)
- break;
- }
- }
+ void *mz_addr = malloc_heap_alloc(NULL, requested_len, socket_id, flags,
+ align, bound);

if (mz_addr == NULL) {
rte_errno = ENOMEM;
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 701bffd..eabad66 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -400,6 +400,91 @@ malloc_elem_free(struct malloc_elem *elem)
return elem;
}

+/* assume all checks were already done */
+void
+malloc_elem_hide_region(struct malloc_elem *elem, void *start, size_t len)
+{
+ size_t len_before, len_after;
+ struct malloc_elem *prev, *next;
+ void *end, *elem_end;
+
+ end = RTE_PTR_ADD(start, len);
+ elem_end = RTE_PTR_ADD(elem, elem->size);
+ len_before = RTE_PTR_DIFF(start, elem);
+ len_after = RTE_PTR_DIFF(elem_end, end);
+
+ prev = elem->prev;
+ next = elem->next;
+
+ if (len_after >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
+ /* split after */
+ struct malloc_elem *split_after = end;
+
+ split_elem(elem, split_after);
+
+ next = split_after;
+
+ malloc_elem_free_list_insert(split_after);
+ } else if (len_after >= MALLOC_ELEM_HEADER_LEN) {
+ struct malloc_elem *pad_elem = end;
+
+ /* shrink current element */
+ elem->size -= len_after;
+ memset(pad_elem, 0, sizeof(*pad_elem));
+
+ /* copy next element's data to our pad */
+ memcpy(pad_elem, next, sizeof(*pad_elem));
+
+ /* pad next element */
+ next->state = ELEM_PAD;
+ next->pad = len_after;
+
+ /* next element is busy, would've been merged otherwise */
+ pad_elem->pad = len_after;
+ pad_elem->size += len_after;
+
+ /* adjust pointers to point to our new pad */
+ pad_elem->next->prev = pad_elem;
+ elem->next = pad_elem;
+ } else if (len_after > 0) {
+ rte_panic("Unaligned element, heap is probably corrupt\n");
+ }
+
+ if (len_before >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
+ /* split before */
+ struct malloc_elem *split_before = start;
+
+ split_elem(elem, split_before);
+
+ prev = elem;
+ elem = split_before;
+
+ malloc_elem_free_list_insert(prev);
+ } else if (len_before > 0) {
+ /*
+ * unlike with elements after current, here we don't need to
+ * pad elements, but rather just increase the size of previous
+ * element, copy the old header and set up the trailer.
+ */
+ void *trailer = RTE_PTR_ADD(prev,
+ prev->size - MALLOC_ELEM_TRAILER_LEN);
+ struct malloc_elem *new_elem = start;
+
+ memcpy(new_elem, elem, sizeof(*elem));
+ new_elem->size -= len_before;
+
+ prev->size += len_before;
+ set_trailer(prev);
+
+ elem = new_elem;
+
+ /* erase old trailer */
+ memset(trailer, 0, MALLOC_ELEM_TRAILER_LEN);
+ }
+
+ remove_elem(elem);
+}
+
/*
* attempt to resize a malloc_elem by expanding into any free space
* immediately after it in memory.
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 388c16f..6d979d2 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -152,6 +152,9 @@ int
malloc_elem_resize(struct malloc_elem *elem, size_t size);

void
+malloc_elem_hide_region(struct malloc_elem *elem, void *start, size_t len);
+
+void
malloc_elem_free_list_remove(struct malloc_elem *elem);

/*
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 058ad75..87dc9ad 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -20,8 +20,10 @@
#include <rte_spinlock.h>
#include <rte_memcpy.h>
#include <rte_atomic.h>
+#include <rte_fbarray.h>

#include "eal_internal_cfg.h"
+#include "eal_memalloc.h"
#include "malloc_elem.h"
#include "malloc_heap.h"

@@ -123,48 +125,356 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
* scan fails. Once the new memseg is added, it re-scans and should return
* the new element after releasing the lock.
*/
-void *
-malloc_heap_alloc(struct malloc_heap *heap,
- const char *type __attribute__((unused)), size_t size, unsigned flags,
- size_t align, size_t bound)
+static void *
+heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,
+ unsigned int flags, size_t align, size_t bound)
{
struct malloc_elem *elem;

size = RTE_CACHE_LINE_ROUNDUP(size);
align = RTE_CACHE_LINE_ROUNDUP(align);

- rte_spinlock_lock(&heap->lock);
-
elem = find_suitable_element(heap, size, flags, align, bound);
if (elem != NULL) {
elem = malloc_elem_alloc(elem, size, align, bound);
+
/* increase heap's count of allocated elements */
heap->alloc_count++;
}
- rte_spinlock_unlock(&heap->lock);

return elem == NULL ? NULL : (void *)(&elem[1]);
}

+static int
+try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
+ int socket, unsigned int flags, size_t align, size_t bound)
+{
+ struct rte_memseg_list *msl;
+ struct rte_memseg **ms;
+ struct malloc_elem *elem;
+ size_t map_len;
+ int i, n_pages, allocd_pages;
+ void *ret, *map_addr;
+
+ align = RTE_MAX(align, MALLOC_ELEM_HEADER_LEN);
+ map_len = RTE_ALIGN_CEIL(align + elt_size + MALLOC_ELEM_TRAILER_LEN,
+ pg_sz);
+
+ n_pages = map_len / pg_sz;
+
+ /* we can't know in advance how many pages we'll need, so malloc */
+ ms = malloc(sizeof(*ms) * n_pages);
+
+ allocd_pages = eal_memalloc_alloc_page_bulk(ms, n_pages, pg_sz, socket,
+ true);
+
+ /* make sure we've allocated our pages... */
+ if (allocd_pages != n_pages)
+ goto free_ms;
+
+ map_addr = ms[0]->addr;
+ msl = rte_mem_virt2memseg_list(map_addr);
+
+ /* add newly minted memsegs to malloc heap */
+ elem = malloc_heap_add_memory(heap, msl, map_addr, map_len);
+
+ /* try once more, as now we have allocated new memory */
+ ret = find_suitable_element(heap, elt_size, flags, align, bound);
+
+ if (ret == NULL)
+ goto free_elem;
+
+ RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
+ socket, map_len >> 20ULL);
+
+ free(ms);
+
+ return 0;
+
+free_elem:
+ malloc_elem_free_list_remove(elem);
+ malloc_elem_hide_region(elem, map_addr, map_len);
+ heap->total_size -= map_len;
+
+ for (i = 0; i < n_pages; i++)
+ eal_memalloc_free_page(ms[i]);
+free_ms:
+ free(ms);
+
+ return -1;
+}
+
+static int
+compare_pagesz(const void *a, const void *b)
+{
+ const struct rte_memseg_list * const*mpa = a;
+ const struct rte_memseg_list * const*mpb = b;
+ const struct rte_memseg_list *msla = *mpa;
+ const struct rte_memseg_list *mslb = *mpb;
+ uint64_t pg_sz_a = msla->hugepage_sz;
+ uint64_t pg_sz_b = mslb->hugepage_sz;
+
+ if (pg_sz_a < pg_sz_b)
+ return -1;
+ if (pg_sz_a > pg_sz_b)
+ return 1;
+ return 0;
+}
+
+static int
+alloc_mem_on_socket(size_t size, int socket, unsigned int flags, size_t align,
+ size_t bound)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
+ struct rte_memseg_list *requested_msls[RTE_MAX_MEMSEG_LISTS];
+ struct rte_memseg_list *other_msls[RTE_MAX_MEMSEG_LISTS];
+ uint64_t requested_pg_sz[RTE_MAX_MEMSEG_LISTS];
+ uint64_t other_pg_sz[RTE_MAX_MEMSEG_LISTS];
+ uint64_t prev_pg_sz;
+ int i, n_other_msls, n_other_pg_sz, n_requested_msls, n_requested_pg_sz;
+ bool size_hint = (flags & RTE_MEMZONE_SIZE_HINT_ONLY) > 0;
+ unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY;
+ void *ret;
+
+ memset(requested_msls, 0, sizeof(requested_msls));
+ memset(other_msls, 0, sizeof(other_msls));
+ memset(requested_pg_sz, 0, sizeof(requested_pg_sz));
+ memset(other_pg_sz, 0, sizeof(other_pg_sz));
+
+ /*
+ * go through memseg list and take note of all the page sizes available,
+ * and if any of them were specifically requested by the user.
+ */
+ n_requested_msls = 0;
+ n_other_msls = 0;
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+
+ if (msl->socket_id != socket)
+ continue;
+
+ if (msl->base_va == NULL)
+ continue;
+
+ /* if pages of specific size were requested */
+ if (size_flags != 0 && check_hugepage_sz(size_flags,
+ msl->hugepage_sz))
+ requested_msls[n_requested_msls++] = msl;
+ else if (size_flags == 0 || size_hint)
+ other_msls[n_other_msls++] = msl;
+ }
+
+ /* sort the lists, smallest first */
+ qsort(requested_msls, n_requested_msls, sizeof(requested_msls[0]),
+ compare_pagesz);
+ qsort(other_msls, n_other_msls, sizeof(other_msls[0]),
+ compare_pagesz);
+
+ /* now, extract page sizes we are supposed to try */
+ prev_pg_sz = 0;
+ n_requested_pg_sz = 0;
+ for (i = 0; i < n_requested_msls; i++) {
+ uint64_t pg_sz = requested_msls[i]->hugepage_sz;
+
+ if (prev_pg_sz != pg_sz) {
+ requested_pg_sz[n_requested_pg_sz++] = pg_sz;
+ prev_pg_sz = pg_sz;
+ }
+ }
+ prev_pg_sz = 0;
+ n_other_pg_sz = 0;
+ for (i = 0; i < n_other_msls; i++) {
+ uint64_t pg_sz = other_msls[i]->hugepage_sz;
+
+ if (prev_pg_sz != pg_sz) {
+ other_pg_sz[n_other_pg_sz++] = pg_sz;
+ prev_pg_sz = pg_sz;
+ }
+ }
+
+ /* finally, try allocating memory of specified page sizes, starting from
+ * the smallest sizes
+ */
+ for (i = 0; i < n_requested_pg_sz; i++) {
+ uint64_t pg_sz = requested_pg_sz[i];
+
+ /*
+ * do not pass the size hint here, as user expects other page
+ * sizes first, before resorting to best effort allocation.
+ */
+ if (!try_expand_heap(heap, pg_sz, size, socket, size_flags,
+ align, bound))
+ return 0;
+ }
+ if (n_other_pg_sz == 0)
+ return -1;
+
+ /* now, check if we can reserve anything with size hint */
+ ret = find_suitable_element(heap, size, flags, align, bound);
+ if (ret != NULL)
+ return 0;
+
+ /*
+ * we still couldn't reserve memory, so try expanding heap with other
+ * page sizes, if there are any
+ */
+ for (i = 0; i < n_other_pg_sz; i++) {
+ uint64_t pg_sz = other_pg_sz[i];
+
+ if (!try_expand_heap(heap, pg_sz, size, socket, flags,
+ align, bound))
+ return 0;
+ }
+ return -1;
+}
+
+/* this will try lower page sizes first */
+static void *
+heap_alloc_on_socket(const char *type, size_t size, int socket,
+ unsigned int flags, size_t align, size_t bound)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
+ unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY;
+ void *ret;
+
+ rte_spinlock_lock(&(heap->lock));
+
+ align = align == 0 ? 1 : align;
+
+ /* for legacy mode, try once and with all flags */
+ if (internal_config.legacy_mem) {
+ ret = heap_alloc(heap, type, size, flags, align, bound);
+ goto alloc_unlock;
+ }
+
+ /*
+ * we do not pass the size hint here, because even if allocation fails,
+ * we may still be able to allocate memory from appropriate page sizes,
+ * we just need to request more memory first.
+ */
+ ret = heap_alloc(heap, type, size, size_flags, align, bound);
+ if (ret != NULL)
+ goto alloc_unlock;
+
+ if (!alloc_mem_on_socket(size, socket, flags, align, bound)) {
+ ret = heap_alloc(heap, type, size, flags, align, bound);
+
+ /* this should have succeeded */
+ if (ret == NULL)
+ rte_panic("Error allocating from heap\n");
+ }
+alloc_unlock:
+ rte_spinlock_unlock(&(heap->lock));
+ return ret;
+}
+
+void *
+malloc_heap_alloc(const char *type, size_t size, int socket_arg,
+ unsigned int flags, size_t align, size_t bound)
+{
+ int socket, i;
+ void *ret;
+
+ /* return NULL if size is 0 or alignment is not power-of-2 */
+ if (size == 0 || (align && !rte_is_power_of_2(align)))
+ return NULL;
+
+ if (!rte_eal_has_hugepages())
+ socket_arg = SOCKET_ID_ANY;
+
+ if (socket_arg == SOCKET_ID_ANY)
+ socket = malloc_get_numa_socket();
+ else
+ socket = socket_arg;
+
+ /* Check socket parameter */
+ if (socket >= RTE_MAX_NUMA_NODES)
+ return NULL;
+
+ ret = heap_alloc_on_socket(type, size, socket, flags, align, bound);
+ if (ret != NULL || socket_arg != SOCKET_ID_ANY)
+ return ret;
+
+ /* try other heaps */
+ for (i = 0; i < (int) rte_num_sockets(); i++) {
+ if (i == socket)
+ continue;
+ ret = heap_alloc_on_socket(type, size, i, flags,
+ align, bound);
+ if (ret != NULL)
+ return ret;
+ }
+ return NULL;
+}
+
int
malloc_heap_free(struct malloc_elem *elem)
{
struct malloc_heap *heap;
- struct malloc_elem *ret;
+ void *start, *aligned_start, *end, *aligned_end;
+ size_t len, aligned_len;
+ struct rte_memseg_list *msl;
+ int n_pages, page_idx, max_page_idx, ret;

if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
return -1;

/* elem may be merged with previous element, so keep heap address */
heap = elem->heap;
+ msl = elem->msl;

rte_spinlock_lock(&(heap->lock));

- ret = malloc_elem_free(elem);
+ elem = malloc_elem_free(elem);

- rte_spinlock_unlock(&(heap->lock));
+ /* anything after this is a bonus */
+ ret = 0;
+
+ /* ...of which we can't avail if we are in legacy mode */
+ if (internal_config.legacy_mem)
+ goto free_unlock;
+
+ /* check if we can free any memory back to the system */
+ if (elem->size < msl->hugepage_sz)
+ goto free_unlock;

- return ret != NULL ? 0 : -1;
+ /* probably, but let's make sure, as we may not be using up full page */
+ start = elem;
+ len = elem->size;
+ aligned_start = RTE_PTR_ALIGN_CEIL(start, msl->hugepage_sz);
+ end = RTE_PTR_ADD(elem, len);
+ aligned_end = RTE_PTR_ALIGN_FLOOR(end, msl->hugepage_sz);
+
+ aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start);
+
+ /* can't free anything */
+ if (aligned_len < msl->hugepage_sz)
+ goto free_unlock;
+
+ malloc_elem_free_list_remove(elem);
+
+ malloc_elem_hide_region(elem, (void *) aligned_start, aligned_len);
+
+ /* we don't really care if we fail to deallocate memory */
+ n_pages = aligned_len / msl->hugepage_sz;
+ page_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) / msl->hugepage_sz;
+ max_page_idx = page_idx + n_pages;
+
+ for (; page_idx < max_page_idx; page_idx++) {
+ struct rte_memseg *ms;
+
+ ms = rte_fbarray_get(&msl->memseg_arr, page_idx);
+ eal_memalloc_free_page(ms);
+ heap->total_size -= msl->hugepage_sz;
+ }
+
+ RTE_LOG(DEBUG, EAL, "Heap on socket %d was shrunk by %zdMB\n",
+ msl->socket_id, aligned_len >> 20ULL);
+free_unlock:
+ rte_spinlock_unlock(&(heap->lock));
+ return ret;
}

int
diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h
index bb28422..292d578 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -24,8 +24,8 @@ malloc_get_numa_socket(void)
}

void *
-malloc_heap_alloc(struct malloc_heap *heap, const char *type, size_t size,
- unsigned flags, size_t align, size_t bound);
+malloc_heap_alloc(const char *type, size_t size, int socket, unsigned int flags,
+ size_t align, size_t bound);

int
malloc_heap_free(struct malloc_elem *elem);
diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index bd7e757..b0fe11c 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -39,10 +39,6 @@ void rte_free(void *addr)
void *
rte_malloc_socket(const char *type, size_t size, unsigned align, int socket_arg)
{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- int socket, i;
- void *ret;
-
/* return NULL if size is 0 or alignment is not power-of-2 */
if (size == 0 || (align && !rte_is_power_of_2(align)))
return NULL;
@@ -50,33 +46,12 @@ rte_malloc_socket(const char *type, size_t size, unsigned align, int socket_arg)
if (!rte_eal_has_hugepages())
socket_arg = SOCKET_ID_ANY;

- if (socket_arg == SOCKET_ID_ANY)
- socket = malloc_get_numa_socket();
- else
- socket = socket_arg;
-
/* Check socket parameter */
- if (socket >= RTE_MAX_NUMA_NODES)
+ if (socket_arg >= RTE_MAX_NUMA_NODES)
return NULL;

- ret = malloc_heap_alloc(&mcfg->malloc_heaps[socket], type,
- size, 0, align == 0 ? 1 : align, 0);
- if (ret != NULL || socket_arg != SOCKET_ID_ANY)
- return ret;
-
- /* try other heaps */
- for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
- /* we already tried this one */
- if (i == socket)
- continue;
-
- ret = malloc_heap_alloc(&mcfg->malloc_heaps[i], type,
- size, 0, align == 0 ? 1 : align, 0);
- if (ret != NULL)
- return ret;
- }
-
- return NULL;
+ return malloc_heap_alloc(type, size, socket_arg, 0,
+ align == 0 ? 1 : align, 0);
}

/*
--
2.7.4
Olivier Matz
2018-03-19 17:46:11 UTC
Permalink
Post by Anatoly Burakov
This set of changes enables rte_malloc to allocate and free memory
as needed. The way it works is, first malloc checks if there is
enough memory already allocated to satisfy user's request. If there
isn't, we try and allocate more memory. The reverse happens with
free - we free an element, check its size (including free element
merging due to adjacency) and see if it's bigger than hugepage
size and that its start and end span a hugepage or more. Then we
remove the area from malloc heap (adjusting element lengths where
appropriate), and deallocate the page.
For legacy mode, runtime alloc/free of pages is disabled.
It is worth noting that memseg lists are being sorted by page size,
and that we try our best to satisfy user's request. That is, if
the user requests an element from a 2MB page memory, we will check
if we can satisfy that request from existing memory, if not we try
and allocate more 2MB pages. If that fails and user also specified
a "size is hint" flag, we then check other page sizes and try to
allocate from there. If that fails too, then, depending on flags,
we may try allocating from other sockets. In other words, we try
our best to give the user what they asked for, but going to other
sockets is last resort - first we try to allocate more memory on
the same socket.
[...]
Post by Anatoly Burakov
@@ -123,48 +125,356 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
* scan fails. Once the new memseg is added, it re-scans and should return
* the new element after releasing the lock.
*/
-void *
-malloc_heap_alloc(struct malloc_heap *heap,
- const char *type __attribute__((unused)), size_t size, unsigned flags,
- size_t align, size_t bound)
+static void *
+heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,
+ unsigned int flags, size_t align, size_t bound)
{
struct malloc_elem *elem;
size = RTE_CACHE_LINE_ROUNDUP(size);
align = RTE_CACHE_LINE_ROUNDUP(align);
- rte_spinlock_lock(&heap->lock);
-
elem = find_suitable_element(heap, size, flags, align, bound);
if (elem != NULL) {
elem = malloc_elem_alloc(elem, size, align, bound);
+
/* increase heap's count of allocated elements */
heap->alloc_count++;
}
- rte_spinlock_unlock(&heap->lock);
return elem == NULL ? NULL : (void *)(&elem[1]);
}
The comment on top of the function says "after releasing the lock" but
it seems it's not relevant anymore because the lock is removed.

[...]
Post by Anatoly Burakov
int
malloc_heap_free(struct malloc_elem *elem)
{
struct malloc_heap *heap;
- struct malloc_elem *ret;
+ void *start, *aligned_start, *end, *aligned_end;
+ size_t len, aligned_len;
+ struct rte_memseg_list *msl;
+ int n_pages, page_idx, max_page_idx, ret;
if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
return -1;
/* elem may be merged with previous element, so keep heap address */
heap = elem->heap;
+ msl = elem->msl;
rte_spinlock_lock(&(heap->lock));
- ret = malloc_elem_free(elem);
+ elem = malloc_elem_free(elem);
- rte_spinlock_unlock(&(heap->lock));
+ /* anything after this is a bonus */
+ ret = 0;
+
The fact that there were previously two rte_spinlock_unlock() calls
looks strange to me. Is there something wrong in a previous patch?
Anatoly Burakov
2018-03-03 13:46:21 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
drivers/net/avf/avf_ethdev.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/avf/avf_ethdev.c b/drivers/net/avf/avf_ethdev.c
index 4df6617..f69d697 100644
--- a/drivers/net/avf/avf_ethdev.c
+++ b/drivers/net/avf/avf_ethdev.c
@@ -1365,7 +1365,7 @@ avf_allocate_dma_mem_d(__rte_unused struct avf_hw *hw,
return AVF_ERR_PARAM;

snprintf(z_name, sizeof(z_name), "avf_dma_%"PRIu64, rte_rand());
- mz = rte_memzone_reserve_bounded(z_name, size, SOCKET_ID_ANY, 0,
+ mz = rte_memzone_reserve_bounded_contig(z_name, size, SOCKET_ID_ANY, 0,
alignment, RTE_PGSIZE_2M);
if (!mz)
return AVF_ERR_NO_MEMORY;
--
2.7.4
Anatoly Burakov
2018-03-03 13:46:12 UTC
Permalink
Currently it is not possible to use memory that is not owned by DPDK to
perform DMA. This scenario might be used in vhost applications (like
SPDK) where the guest sends its own memory table. To fill this gap,
provide an API to allow registering an arbitrary address in the VFIO
container.
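
A hypothetical usage sketch of the new API (the function names match the
declarations added below; the region passed in must already be backed by
real memory, e.g. a guest-provided table in the vhost case, and the
virtual/IOVA addresses and length here are placeholders):

#include <stdint.h>
#include <rte_vfio.h>

static int
register_external_region(void *va, uint64_t iova, uint64_t len)
{
	/* make the region visible to the IOMMU so a device can DMA to it */
	if (rte_vfio_dma_map((uint64_t)(uintptr_t)va, iova, len) < 0)
		return -1;

	/* ... the region may now be used for DMA ... */

	/* remove the mapping once the memory is no longer needed */
	return rte_vfio_dma_unmap((uint64_t)(uintptr_t)va, iova, len);
}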

Signed-off-by: Pawel Wodkowski <***@intel.com>
Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/bsdapp/eal/eal.c | 16 ++++
lib/librte_eal/common/include/rte_vfio.h | 39 ++++++++
lib/librte_eal/linuxapp/eal/eal_vfio.c | 153 ++++++++++++++++++++++++++-----
lib/librte_eal/linuxapp/eal/eal_vfio.h | 11 +++
4 files changed, 196 insertions(+), 23 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 3b06e21..5a7f436 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -755,6 +755,8 @@ int rte_vfio_enable(const char *modname);
int rte_vfio_is_enabled(const char *modname);
int rte_vfio_noiommu_is_enabled(void);
int rte_vfio_clear_group(int vfio_group_fd);
+int rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len);
+int rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len);

int rte_vfio_setup_device(__rte_unused const char *sysfs_base,
__rte_unused const char *dev_addr,
@@ -790,3 +792,17 @@ int rte_vfio_clear_group(__rte_unused int vfio_group_fd)
{
return 0;
}
+
+int
+rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova,
+ __rte_unused uint64_t len)
+{
+ return -1;
+}
+
+int
+rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
+ __rte_unused uint64_t len)
+{
+ return -1;
+}
diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
index e981a62..093c309 100644
--- a/lib/librte_eal/common/include/rte_vfio.h
+++ b/lib/librte_eal/common/include/rte_vfio.h
@@ -123,6 +123,45 @@ int rte_vfio_noiommu_is_enabled(void);
int
rte_vfio_clear_group(int vfio_group_fd);

+/**
+ * Map memory region for use with VFIO.
+ *
+ * @param vaddr
+ * Starting virtual address of memory to be mapped.
+ *
+ * @param iova
+ * Starting IOVA address of memory to be mapped.
+ *
+ * @param len
+ * Length of memory segment being mapped.
+ *
+ * @return
+ * 0 if success.
+ * -1 on error.
+ */
+int
+rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len);
+
+
+/**
+ * Unmap memory region from VFIO.
+ *
+ * @param vaddr
+ * Starting virtual address of memory to be unmapped.
+ *
+ * @param iova
+ * Starting IOVA address of memory to be unmapped.
+ *
+ * @param len
+ * Length of memory segment being unmapped.
+ *
+ * @return
+ * 0 if success.
+ * -1 on error.
+ */
+int
+rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len);
+
#endif /* VFIO_PRESENT */

#endif /* _RTE_VFIO_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 5192763..8fe8984 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -22,17 +22,35 @@
static struct vfio_config vfio_cfg;

static int vfio_type1_dma_map(int);
+static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
static int vfio_spapr_dma_map(int);
static int vfio_noiommu_dma_map(int);
+static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);

/* IOMMU types we support */
static const struct vfio_iommu_type iommu_types[] = {
/* x86 IOMMU, otherwise known as type 1 */
- { RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map},
+ {
+ .type_id = RTE_VFIO_TYPE1,
+ .name = "Type 1",
+ .dma_map_func = &vfio_type1_dma_map,
+ .dma_user_map_func = &vfio_type1_dma_mem_map
+ },
/* ppc64 IOMMU, otherwise known as spapr */
- { RTE_VFIO_SPAPR, "sPAPR", &vfio_spapr_dma_map},
+ {
+ .type_id = RTE_VFIO_SPAPR,
+ .name = "sPAPR",
+ .dma_map_func = &vfio_spapr_dma_map,
+ .dma_user_map_func = NULL
+ // TODO: work with PPC64 people on enabling this, window size!
+ },
/* IOMMU-less mode */
- { RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map},
+ {
+ .type_id = RTE_VFIO_NOIOMMU,
+ .name = "No-IOMMU",
+ .dma_map_func = &vfio_noiommu_dma_map,
+ .dma_user_map_func = &vfio_noiommu_dma_mem_map
+ },
};

int
@@ -333,9 +351,10 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
*/
if (internal_config.process_type == RTE_PROC_PRIMARY &&
vfio_cfg.vfio_active_groups == 1) {
+ const struct vfio_iommu_type *t;
+
/* select an IOMMU type which we will be using */
- const struct vfio_iommu_type *t =
- vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
+ t = vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
if (!t) {
RTE_LOG(ERR, EAL,
" %s failed to select IOMMU type\n",
@@ -353,6 +372,8 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
rte_vfio_clear_group(vfio_group_fd);
return -1;
}
+
+ vfio_cfg.vfio_iommu_type = t;
}
}

@@ -665,13 +686,54 @@ vfio_get_group_no(const char *sysfs_base,
}

static int
+vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
+ uint64_t len, int do_map)
+{
+ struct vfio_iommu_type1_dma_map dma_map;
+ struct vfio_iommu_type1_dma_unmap dma_unmap;
+ int ret;
+
+ if (do_map != 0) {
+ memset(&dma_map, 0, sizeof(dma_map));
+ dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+ dma_map.vaddr = vaddr;
+ dma_map.size = len;
+ dma_map.iova = iova;
+ dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+ VFIO_DMA_MAP_FLAG_WRITE;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ } else {
+ memset(&dma_unmap, 0, sizeof(dma_unmap));
+ dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
+ dma_unmap.size = len;
+ dma_unmap.iova = iova;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
+ &dma_unmap);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int
vfio_type1_dma_map(int vfio_container_fd)
{
- int i, ret;
+ int i;

/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
- struct vfio_iommu_type1_dma_map dma_map;
struct rte_memseg_list *msl;
struct rte_fbarray *arr;
int ms_idx, next_idx;
@@ -697,23 +759,9 @@ vfio_type1_dma_map(int vfio_container_fd)
len = ms->hugepage_sz;
hw_addr = ms->iova;

- memset(&dma_map, 0, sizeof(dma_map));
- dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
- dma_map.vaddr = addr;
- dma_map.size = len;
- dma_map.iova = hw_addr;
- dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
- VFIO_DMA_MAP_FLAG_WRITE;
-
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA,
- &dma_map);
-
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
- "error %i (%s)\n", errno,
- strerror(errno));
+ if (vfio_type1_dma_mem_map(vfio_container_fd, addr,
+ hw_addr, len, 1))
return -1;
- }
}
}

@@ -865,6 +913,49 @@ vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
return 0;
}

+static int
+vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd,
+ uint64_t __rte_unused vaddr,
+ uint64_t __rte_unused iova, uint64_t __rte_unused len,
+ int __rte_unused do_map)
+{
+ /* No-IOMMU mode does not need DMA mapping */
+ return 0;
+}
+
+static int
+vfio_dma_mem_map(uint64_t vaddr, uint64_t iova, uint64_t len, int do_map)
+{
+ const struct vfio_iommu_type *t = vfio_cfg.vfio_iommu_type;
+
+ if (!t) {
+ RTE_LOG(ERR, EAL, " VFIO support not initialized\n");
+ return -1;
+ }
+
+ if (!t->dma_user_map_func) {
+ RTE_LOG(ERR, EAL,
+ " VFIO custom DMA region maping not supported by IOMMU %s\n",
+ t->name);
+ return -1;
+ }
+
+ return t->dma_user_map_func(vfio_cfg.vfio_container_fd, vaddr, iova,
+ len, do_map);
+}
+
+int
+rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len)
+{
+ return vfio_dma_mem_map(vaddr, iova, len, 1);
+}
+
+int
+rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len)
+{
+ return vfio_dma_mem_map(vaddr, iova, len, 0);
+}
+
int
rte_vfio_noiommu_is_enabled(void)
{
@@ -897,4 +988,20 @@ rte_vfio_noiommu_is_enabled(void)
return c == 'Y';
}

+#else
+
+int
+rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova,
+ __rte_unused uint64_t len)
+{
+ return -1;
+}
+
+int
+rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
+ __rte_unused uint64_t len)
+{
+ return -1;
+}
+
#endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 8059577..b68703e 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -19,6 +19,7 @@

#ifdef VFIO_PRESENT

+#include <stdint.h>
#include <linux/vfio.h>

#define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU
@@ -110,6 +111,7 @@ struct vfio_config {
int vfio_enabled;
int vfio_container_fd;
int vfio_active_groups;
+ const struct vfio_iommu_type *vfio_iommu_type;
struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
};

@@ -119,9 +121,18 @@ struct vfio_config {
* */
typedef int (*vfio_dma_func_t)(int);

+/* Custom memory region DMA mapping function prototype.
+ * Takes VFIO container fd, virtual address, physical address, length and
+ * operation type (0 to unmap, 1 to map) as parameters.
+ * Returns 0 on success, -1 on error.
+ **/
+typedef int (*vfio_dma_user_func_t)(int fd, uint64_t vaddr, uint64_t iova,
+ uint64_t len, int do_map);
+
struct vfio_iommu_type {
int type_id;
const char *name;
+ vfio_dma_user_func_t dma_user_map_func;
vfio_dma_func_t dma_map_func;
};
--
2.7.4
Anatoly Burakov
2018-03-03 13:46:29 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
Not sure if DMA-capable memzones are needed for vmxnet3.
Corrections welcome.

drivers/net/vmxnet3/vmxnet3_ethdev.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_ethdev.c b/drivers/net/vmxnet3/vmxnet3_ethdev.c
index 4e68aae..c787379 100644
--- a/drivers/net/vmxnet3/vmxnet3_ethdev.c
+++ b/drivers/net/vmxnet3/vmxnet3_ethdev.c
@@ -150,14 +150,15 @@ gpa_zone_reserve(struct rte_eth_dev *dev, uint32_t size,
if (!reuse) {
if (mz)
rte_memzone_free(mz);
- return rte_memzone_reserve_aligned(z_name, size, socket_id,
- 0, align);
+ return rte_memzone_reserve_aligned_contig(z_name, size,
+ socket_id, 0, align);
}

if (mz)
return mz;

- return rte_memzone_reserve_aligned(z_name, size, socket_id, 0, align);
+ return rte_memzone_reserve_aligned_contig(z_name, size, socket_id, 0,
+ align);
}

/**
--
2.7.4
Anatoly Burakov
2018-03-03 13:46:25 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
It is not 100% clear that second call to memzone_reserve
is allocating DMA memory. Corrections welcome.

drivers/net/enic/enic_main.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/enic/enic_main.c b/drivers/net/enic/enic_main.c
index ec9d343..cb2a7ba 100644
--- a/drivers/net/enic/enic_main.c
+++ b/drivers/net/enic/enic_main.c
@@ -319,7 +319,7 @@ enic_alloc_consistent(void *priv, size_t size,
struct enic *enic = (struct enic *)priv;
struct enic_memzone_entry *mze;

- rz = rte_memzone_reserve_aligned((const char *)name,
+ rz = rte_memzone_reserve_aligned_contig((const char *)name,
size, SOCKET_ID_ANY, 0, ENIC_ALIGN);
if (!rz) {
pr_err("%s : Failed to allocate memory requested for %s\n",
@@ -787,7 +787,7 @@ int enic_alloc_wq(struct enic *enic, uint16_t queue_idx,
"vnic_cqmsg-%s-%d-%d", enic->bdf_name, queue_idx,
instance++);

- wq->cqmsg_rz = rte_memzone_reserve_aligned((const char *)name,
+ wq->cqmsg_rz = rte_memzone_reserve_aligned_contig((const char *)name,
sizeof(uint32_t),
SOCKET_ID_ANY, 0,
ENIC_ALIGN);
--
2.7.4
John Daley (johndale)
2018-03-05 19:45:07 UTC
Permalink
Hi Anatoly,
Looks good, see inline for details.
Acked-by: John Daley <***@cisco.com>

Thanks,
John
-----Original Message-----
Sent: Saturday, March 03, 2018 5:46 AM
Subject: [PATCH 37/41] net/enic: use contiguous allocation for DMA memory
---
It is not 100% clear that second call to memzone_reserve
is allocating DMA memory. Corrections welcome.
The 2nd call is allocating DMA memory so I believe your patch is correct.
drivers/net/enic/enic_main.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/net/enic/enic_main.c b/drivers/net/enic/enic_main.c index
ec9d343..cb2a7ba 100644
--- a/drivers/net/enic/enic_main.c
+++ b/drivers/net/enic/enic_main.c
@@ -319,7 +319,7 @@ enic_alloc_consistent(void *priv, size_t size,
struct enic *enic = (struct enic *)priv;
struct enic_memzone_entry *mze;
- rz = rte_memzone_reserve_aligned((const char *)name,
+ rz = rte_memzone_reserve_aligned_contig((const char *)name,
size, SOCKET_ID_ANY, 0,
ENIC_ALIGN);
if (!rz) {
pr_err("%s : Failed to allocate memory requested for %s\n",
@@ -787,7 +787,7 @@ int enic_alloc_wq(struct enic *enic, uint16_t queue_idx,
"vnic_cqmsg-%s-%d-%d", enic->bdf_name, queue_idx,
instance++);
- wq->cqmsg_rz = rte_memzone_reserve_aligned((const char *)name,
+ wq->cqmsg_rz = rte_memzone_reserve_aligned_contig((const char *)name,
sizeof(uint32_t),
SOCKET_ID_ANY, 0,
ENIC_ALIGN);
This is a send completion landing spot which is DMA'd to by the NIC so it does have to be contiguous. However the size is only 4 bytes so it might not matter.
--
2.7.4
Burakov, Anatoly
2018-03-06 11:04:54 UTC
Permalink
This patchset introduces dynamic memory allocation for DPDK (aka memory
hotplug). Based upon RFC submitted in December [1].
For those testing this patch, there's a deadlock-at-startup issue when
DPDK is started with no memory. This will be fixed in v2 (as well as
dependent IPC patches), but for now the workaround is to start DPDK with
-m/--socket-mem switches.
--
Thanks,
Anatoly
Nélio Laranjeiro
2018-03-07 15:27:01 UTC
Permalink
Hi Anatoly,

I am trying to run some tests with this series, but it seems to be based
on some other commits of yours. I have already identified the following
one [1], but it seems I am missing some others.

Is it possible to have a list of commits to apply on the current master
branch [2] before this series?

Thanks,

[1] https://dpdk.org/patch/35043
[2] https://dpdk.org/browse/dpdk/commit/?id=c06ddf9698e0c2a9653cfa971f9ddc205065662c
--
Nélio Laranjeiro
6WIND
Burakov, Anatoly
2018-03-07 16:05:26 UTC
Permalink
Post by John Daley (johndale)
Hi Anatoly,
I am trying to run some test with this series, but it seems to be based
on some other commits of yours. I have already identified the following
one [1] it seems I am missing some others.
It is possible to have a list of commits to apply on the current master
branch [2] before this series?
Thanks,
[1] https://dpdk.org/patch/35043
[2] https://dpdk.org/browse/dpdk/commit/?id=c06ddf9698e0c2a9653cfa971f9ddc205065662c
Hi Nelio,

Yes, my apologies. I'm aware of the apply issues. The issue is due to me
missing a rebase on one of the dependent patchsets. I'm preparing a v2
that will fix the issue (pending some internal processes).
--
Thanks,
Anatoly
Burakov, Anatoly
2018-03-08 09:37:27 UTC
Permalink
Post by Burakov, Anatoly
Post by John Daley (johndale)
Hi Anatoly,
I am trying to run some test with this series, but it seems to be based
on some other commits of yours. I have already identified the following
one [1] it seems I am missing some others.
It is possible to have a list of commits to apply on the current master
branch [2] before this series?
Thanks,
[1] https://dpdk.org/patch/35043
[2]
https://dpdk.org/browse/dpdk/commit/?id=c06ddf9698e0c2a9653cfa971f9ddc205065662c
Hi Nelio,
Yes, my apologies. I'm aware of the apply issues. The issue is due to me
missing a rebase on one of the dependent patchsets. I'm preparing a v2
that will fix the issue (pending some internal processes).
Hi Nelio,

The v2 is now up, with corrected rebase. You can see the list of
dependent patches in the cover letter [1]. Once again apologies for
incorrect rebase in v1. Looking forward to your feedback!

[1] http://dpdk.org/ml/archives/dev/2018-March/092070.html
--
Thanks,
Anatoly
Nélio Laranjeiro
2018-03-08 10:53:26 UTC
Permalink
Post by Burakov, Anatoly
Post by Burakov, Anatoly
Post by John Daley (johndale)
Hi Anatoly,
I am trying to run some test with this series, but it seems to be based
on some other commits of yours. I have already identified the following
one [1] it seems I am missing some others.
It is possible to have a list of commits to apply on the current master
branch [2] before this series?
Thanks,
[1] https://dpdk.org/patch/35043
[2] https://dpdk.org/browse/dpdk/commit/?id=c06ddf9698e0c2a9653cfa971f9ddc205065662c
Hi Nelio,
Yes, my apologies. I'm aware of the apply issues. The issue is due to me
missing a rebase on one of the dependent patchsets. I'm preparing a v2
that will fix the issue (pending some internal processes).
Hi Nelio,
The v2 is now up, with corrected rebase. You can see the list of dependent
patches in the cover letter [1]. Once again apologies for incorrect rebase
in v1. Looking forward to your feedback!
[1] http://dpdk.org/ml/archives/dev/2018-March/092070.html
Hi Anatoly,

First feedback: I have some issues when compiling it on desktop/server
machines with clang and GCC, maybe due to different configuration items
depending on the machine compiling it.

Clang error
-----------

dpdk.org/lib/librte_eal/linuxapp/eal/eal_memalloc.c:590:6: error: variable 'have_numa' is used uninitialized whenever 'if' condition is true [-Werror,-Wsometimes-uninitialized]
if (!hi) {
^~~
CC eal_lcore.o
dpdk.org/lib/librte_eal/linuxapp/eal/eal_memalloc.c:693:6: note: uninitialized use occurs here
if (have_numa)
^~~~~~~~~
dpdk.org/lib/librte_eal/linuxapp/eal/eal_memalloc.c:590:2: note: remove the 'if' if its condition is always false
if (!hi) {
^~~~~~~~~~
dpdk.org/lib/librte_eal/linuxapp/eal/eal_memalloc.c:580:6: error: variable 'have_numa' is used uninitialized whenever 'if' condition is true [-Werror,-Wsometimes-uninitialized]
if (internal_config.legacy_mem)
^~~~~~~~~~~~~~~~~~~~~~~~~~
dpdk.org/lib/librte_eal/linuxapp/eal/eal_memalloc.c:693:6: note: uninitialized use occurs here
if (have_numa)
^~~~~~~~~
dpdk.org/lib/librte_eal/linuxapp/eal/eal_memalloc.c:580:2: note: remove the 'if' if its condition is always false
if (internal_config.legacy_mem)
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
dpdk.org/lib/librte_eal/linuxapp/eal/eal_memalloc.c:573:16: note: initialize the variable 'have_numa' to silence this warning
bool have_numa;
^
= false

GCC errors
----------

/root/dpdk/lib/librte_eal/common/eal_common_memzone.c: In function ‘rte_memzone_free’:
/root/dpdk/lib/librte_eal/common/eal_common_memzone.c:355:2: error: ‘addr’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
rte_free(addr);
^~~~~~~~~~~~~~
/root/dpdk/lib/librte_eal/linuxapp/eal/eal_memalloc.c: In function ‘eal_memalloc_alloc_page_bulk’:
/root/dpdk/lib/librte_eal/linuxapp/eal/eal_memalloc.c:693:5: error: ‘have_numa’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
if (have_numa)
^
cc1: all warnings being treated as errors
/root/dpdk/mk/internal/rte.compile-pre.mk:114: recipe for target 'eal_common_memzone.o' failed
make[5]: *** [eal_common_memzone.o] Error 1
make[5]: *** Waiting for unfinished jobs....
cc1: all warnings being treated as errors
/root/dpdk/mk/internal/rte.compile-pre.mk:114: recipe for target 'eal_memalloc.o' failed
make[5]: *** [eal_memalloc.o] Error 1
/root/dpdk/lib/librte_eal/linuxapp/eal/eal_memory.c: In function ‘rte_eal_hugepage_attach’:
/root/dpdk/lib/librte_eal/linuxapp/eal/eal_memory.c:1556:7: error: ‘max_seg’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
if (cur_seg >= max_seg)
^
/root/dpdk/lib/librte_eal/linuxapp/eal/eal_memory.c:1391:24: note: ‘max_seg’ was declared here
unsigned int cur_seg, max_seg;
^~~~~~~
/root/dpdk/lib/librte_eal/linuxapp/eal/eal_memory.c:1389:15: error: ‘i’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
unsigned int i;
^

It would be worth asking Thomas for a dedicated repository/branch on DPDK,
otherwise it will be a nightmare for anyone who wants to test this if we
need to apply 54 patches each time.

Can you check with him?

Thanks,
--
Nélio Laranjeiro
6WIND
Burakov, Anatoly
2018-03-08 12:12:15 UTC
Permalink
Post by John Daley (johndale)
Post by Burakov, Anatoly
Post by Burakov, Anatoly
Post by John Daley (johndale)
Hi Anatoly,
I am trying to run some test with this series, but it seems to be based
on some other commits of yours. I have already identified the following
one [1] it seems I am missing some others.
It is possible to have a list of commits to apply on the current master
branch [2] before this series?
Thanks,
[1] https://dpdk.org/patch/35043
[2] https://dpdk.org/browse/dpdk/commit/?id=c06ddf9698e0c2a9653cfa971f9ddc205065662c
Hi Nelio,
Yes, my apologies. I'm aware of the apply issues. The issue is due to me
missing a rebase on one of the dependent patchsets. I'm preparing a v2
that will fix the issue (pending some internal processes).
Hi Nelio,
The v2 is now up, with corrected rebase. You can see the list of dependent
patches in the cover letter [1]. Once again apologies for incorrect rebase
in v1. Looking forward to your feedback!
[1] http://dpdk.org/ml/archives/dev/2018-March/092070.html
Hi Anatoly,
First feedbacks, I have some issue when compiling it on desktop/server
machine with clang and GCC, maybe due some different configuration items
depending on the machine compile it.
Clang error
-----------
dpdk.org/lib/librte_eal/linuxapp/eal/eal_memalloc.c:590:6: error: variable 'have_numa' is used uninitialized whenever 'if' condition is true [-Werror,-Wsometimes-uninitialized]
if (!hi) {
^~~
CC eal_lcore.o
dpdk.org/lib/librte_eal/linuxapp/eal/eal_memalloc.c:693:6: note: uninitialized use occurs here
if (have_numa)
^~~~~~~~~
dpdk.org/lib/librte_eal/linuxapp/eal/eal_memalloc.c:590:2: note: remove the 'if' if its condition is always false
if (!hi) {
^~~~~~~~~~
dpdk.org/lib/librte_eal/linuxapp/eal/eal_memalloc.c:580:6: error: variable 'have_numa' is used uninitialized whenever 'if' condition is true [-Werror,-Wsometimes-uninitialized]
if (internal_config.legacy_mem)
^~~~~~~~~~~~~~~~~~~~~~~~~~
dpdk.org/lib/librte_eal/linuxapp/eal/eal_memalloc.c:693:6: note: uninitialized use occurs here
if (have_numa)
^~~~~~~~~
dpdk.org/lib/librte_eal/linuxapp/eal/eal_memalloc.c:580:2: note: remove the 'if' if its condition is always false
if (internal_config.legacy_mem)
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
dpdk.org/lib/librte_eal/linuxapp/eal/eal_memalloc.c:573:16: note: initialize the variable 'have_numa' to silence this warning
bool have_numa;
^
= false
GCC errors
----------
/root/dpdk/lib/librte_eal/common/eal_common_memzone.c:355:2: error: ‘addr’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
rte_free(addr);
^~~~~~~~~~~~~~
/root/dpdk/lib/librte_eal/linuxapp/eal/eal_memalloc.c:693:5: error: ‘have_numa’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
if (have_numa)
^
cc1: all warnings being treated as errors
/root/dpdk/mk/internal/rte.compile-pre.mk:114: recipe for target 'eal_common_memzone.o' failed
make[5]: *** [eal_common_memzone.o] Error 1
make[5]: *** Waiting for unfinished jobs....
cc1: all warnings being treated as errors
/root/dpdk/mk/internal/rte.compile-pre.mk:114: recipe for target 'eal_memalloc.o' failed
make[5]: *** [eal_memalloc.o] Error 1
/root/dpdk/lib/librte_eal/linuxapp/eal/eal_memory.c:1556:7: error: ‘max_seg’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
if (cur_seg >= max_seg)
^
/root/dpdk/lib/librte_eal/linuxapp/eal/eal_memory.c:1391:24: note: ‘max_seg’ was declared here
unsigned int cur_seg, max_seg;
^~~~~~~
/root/dpdk/lib/librte_eal/linuxapp/eal/eal_memory.c:1389:15: error: ‘i’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
unsigned int i;
^
It worse to ask Thomas to have a dedicated repository/branch on DPDK,
otherwise it will be a nightmare for anyone who want to test if we need
each time to apply 54 patches.
Can you see it with him?
Thanks,
Hi Nelio,

Thanks for your feedback.

We're working on merging dependencies into the main tree. I've spoken
with Thomas about this, and he suggested creating a GitHub repo for
this patchset, so I'll be looking into this as well.
--
Thanks,
Anatoly
Bruce Richardson
2018-03-08 12:14:54 UTC
Permalink
Post by Burakov, Anatoly
Post by John Daley (johndale)
Post by Burakov, Anatoly
Post by Burakov, Anatoly
Post by John Daley (johndale)
Hi Anatoly,
I am trying to run some test with this series, but it seems to be based
on some other commits of yours. I have already identified the following
one [1] it seems I am missing some others.
It is possible to have a list of commits to apply on the current master
branch [2] before this series?
Thanks,
[1] https://dpdk.org/patch/35043
[2] https://dpdk.org/browse/dpdk/commit/?id=c06ddf9698e0c2a9653cfa971f9ddc205065662c
Hi Nelio,
Yes, my apologies. I'm aware of the apply issues. The issue is due to me
missing a rebase on one of the dependent patchsets. I'm preparing a v2
that will fix the issue (pending some internal processes).
Hi Nelio,
The v2 is now up, with corrected rebase. You can see the list of dependent
patches in the cover letter [1]. Once again apologies for incorrect rebase
in v1. Looking forward to your feedback!
[1] http://dpdk.org/ml/archives/dev/2018-March/092070.html
Hi Anatoly,
First feedbacks, I have some issue when compiling it on desktop/server
machine with clang and GCC, maybe due some different configuration items
depending on the machine compile it.
Clang error
-----------
dpdk.org/lib/librte_eal/linuxapp/eal/eal_memalloc.c:590:6: error: variable 'have_numa' is used uninitialized whenever 'if' condition is true [-Werror,-Wsometimes-uninitialized]
if (!hi) {
^~~
CC eal_lcore.o
dpdk.org/lib/librte_eal/linuxapp/eal/eal_memalloc.c:693:6: note: uninitialized use occurs here
if (have_numa)
^~~~~~~~~
dpdk.org/lib/librte_eal/linuxapp/eal/eal_memalloc.c:590:2: note: remove the 'if' if its condition is always false
if (!hi) {
^~~~~~~~~~
dpdk.org/lib/librte_eal/linuxapp/eal/eal_memalloc.c:580:6: error: variable 'have_numa' is used uninitialized whenever 'if' condition is true [-Werror,-Wsometimes-uninitialized]
if (internal_config.legacy_mem)
^~~~~~~~~~~~~~~~~~~~~~~~~~
dpdk.org/lib/librte_eal/linuxapp/eal/eal_memalloc.c:693:6: note: uninitialized use occurs here
if (have_numa)
^~~~~~~~~
dpdk.org/lib/librte_eal/linuxapp/eal/eal_memalloc.c:580:2: note: remove the 'if' if its condition is always false
if (internal_config.legacy_mem)
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
dpdk.org/lib/librte_eal/linuxapp/eal/eal_memalloc.c:573:16: note: initialize the variable 'have_numa' to silence this warning
bool have_numa;
^
= false
GCC errors
----------
/root/dpdk/lib/librte_eal/common/eal_common_memzone.c:355:2: error: ‘addr’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
rte_free(addr);
^~~~~~~~~~~~~~
/root/dpdk/lib/librte_eal/linuxapp/eal/eal_memalloc.c:693:5: error: ‘have_numa’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
if (have_numa)
^
cc1: all warnings being treated as errors
/root/dpdk/mk/internal/rte.compile-pre.mk:114: recipe for target 'eal_common_memzone.o' failed
make[5]: *** [eal_common_memzone.o] Error 1
make[5]: *** Waiting for unfinished jobs....
cc1: all warnings being treated as errors
/root/dpdk/mk/internal/rte.compile-pre.mk:114: recipe for target 'eal_memalloc.o' failed
make[5]: *** [eal_memalloc.o] Error 1
/root/dpdk/lib/librte_eal/linuxapp/eal/eal_memory.c:1556:7: error: ‘max_seg’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
if (cur_seg >= max_seg)
^
/root/dpdk/lib/librte_eal/linuxapp/eal/eal_memory.c:1391:24: note: ‘max_seg’ was declared here
unsigned int cur_seg, max_seg;
^~~~~~~
/root/dpdk/lib/librte_eal/linuxapp/eal/eal_memory.c:1389:15: error: ‘i’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
unsigned int i;
^
It would be worth asking Thomas for a dedicated repository/branch on DPDK,
otherwise it will be a nightmare for anyone who wants to test if we need
to apply 54 patches each time.
Can you check with him?
Thanks,
Hi Nelio,
Thanks for your feedback.
We're working on merging dependencies into the main tree. I've spoken with
Thomas about this, and he suggested to create a GitHub repo for this
patchset, so i'll be looking into this as well.
I think some of the dependent patches are already acked, so perhaps they
could be applied to the main tree soon? That would a) help test them and
b) make life easier for everyone testing this big memory rework set.
I still think we have a big issue with all patches being applied in a
"big bang" near the end of the release cycle.

/Bruce
Burakov, Anatoly
2018-03-07 16:11:26 UTC
Permalink
Post by John Daley (johndale)
Hi Anatoly,
I am trying to run some tests with this series, but it seems to be based
on some other commits of yours. I have already identified the following
one [1], but it seems I am missing some others.
Is it possible to have a list of commits to apply on the current master
branch [2] before this series?
Thanks,
[1] https://dpdk.org/patch/35043
[2] https://dpdk.org/browse/dpdk/commit/?id=c06ddf9698e0c2a9653cfa971f9ddc205065662c
Also, the cover letter you're responding to lists the dependent patches as
well :) it's just that the current patchset does not apply cleanly atop
them due to rebase errors on my side.
--
Thanks,
Anatoly
Patil, Harish
2018-03-07 22:55:08 UTC
Permalink
-----Original Message-----
From: Anatoly Burakov <***@intel.com>
Date: Wednesday, March 7, 2018 at 8:57 AM
To: "***@dpdk.org" <***@dpdk.org>
Cc: "Mody, Rasesh" <***@cavium.com>, Harish Patil
<***@cavium.com>, "Shaikh, Shahed" <***@cavium.com>,
"***@intel.com" <***@intel.com>, "***@intel.com"
<***@intel.com>, "***@ericsson.com"
<***@ericsson.com>, "***@ericsson.com"
<***@ericsson.com>, "***@intel.com"
<***@intel.com>, "***@intel.com"
<***@intel.com>, "***@monjalon.net" <***@monjalon.net>,
"***@intel.com" <***@intel.com>,
"***@intel.com"
<***@intel.com>, "***@intel.com"
<***@intel.com>, "***@6wind.com"
<***@6wind.com>, "***@mellanox.com" <***@mellanox.com>,
"***@japf.ch" <***@japf.ch>, "Jacob, Jerin"
<***@cavium.com>, "***@nxp.com"
<***@nxp.com>, "***@6wind.com" <***@6wind.com>
Subject: [PATCH v2 39/41] net/qede: use contiguous allocation for DMA
memory
Post by Anatoly Burakov
---
Doing "grep -R rte_memzone_reserve drivers/net/qede" returns the
drivers/net/qede/qede_fdir.c: mz =
rte_memzone_reserve_aligned(mz_name, QEDE_MAX_FDIR_PKT_LEN,
drivers/net/qede/base/bcm_osal.c: mz =
rte_memzone_reserve_aligned_contig(mz_name, size,
drivers/net/qede/base/bcm_osal.c: mz =
rte_memzone_reserve_aligned_contig(mz_name, size, socket_id, 0,
I took a brief look at memzone in qede_fdir and it didn't look like
memzone
was used for DMA, so i left it alone. Corrections welcome.
That’s right.
Post by Anatoly Burakov
drivers/net/qede/base/bcm_osal.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/net/qede/base/bcm_osal.c
b/drivers/net/qede/base/bcm_osal.c
index fe42f32..707d553 100644
--- a/drivers/net/qede/base/bcm_osal.c
+++ b/drivers/net/qede/base/bcm_osal.c
@@ -135,7 +135,7 @@ void *osal_dma_alloc_coherent(struct ecore_dev *p_dev,
if (core_id == (unsigned int)LCORE_ID_ANY)
core_id = 0;
socket_id = rte_lcore_to_socket_id(core_id);
- mz = rte_memzone_reserve_aligned(mz_name, size,
+ mz = rte_memzone_reserve_aligned_contig(mz_name, size,
socket_id, 0, RTE_CACHE_LINE_SIZE);
if (!mz) {
DP_ERR(p_dev, "Unable to allocate DMA memory "
@@ -174,7 +174,8 @@ void *osal_dma_alloc_coherent_aligned(struct
ecore_dev *p_dev,
if (core_id == (unsigned int)LCORE_ID_ANY)
core_id = 0;
socket_id = rte_lcore_to_socket_id(core_id);
- mz = rte_memzone_reserve_aligned(mz_name, size, socket_id, 0, align);
+ mz = rte_memzone_reserve_aligned_contig(mz_name, size, socket_id, 0,
+ align);
if (!mz) {
DP_ERR(p_dev, "Unable to allocate DMA memory "
"of size %zu bytes - %s\n",
--
2.7.4
Acked-by: Harish Patil <***@cavium.com>
Michał Krawczyk
2018-03-08 09:40:07 UTC
Permalink
Post by Anatoly Burakov
---
drivers/net/ena/base/ena_plat_dpdk.h | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ena/base/ena_plat_dpdk.h b/drivers/net/ena/base/ena_plat_dpdk.h
index 8cba319..c1ebf00 100644
--- a/drivers/net/ena/base/ena_plat_dpdk.h
+++ b/drivers/net/ena/base/ena_plat_dpdk.h
@@ -188,7 +188,8 @@ typedef uint64_t dma_addr_t;
ENA_TOUCH(dmadev); ENA_TOUCH(handle); \
snprintf(z_name, sizeof(z_name), \
"ena_alloc_%d", ena_alloc_cnt++); \
- mz = rte_memzone_reserve(z_name, size, SOCKET_ID_ANY, 0); \
+ mz = rte_memzone_reserve_contig(z_name, \
+ size, SOCKET_ID_ANY, 0); \
memset(mz->addr, 0, size); \
virt = mz->addr; \
phys = mz->iova; \
@@ -206,7 +207,7 @@ typedef uint64_t dma_addr_t;
ENA_TOUCH(dmadev); ENA_TOUCH(dev_node); \
snprintf(z_name, sizeof(z_name), \
"ena_alloc_%d", ena_alloc_cnt++); \
- mz = rte_memzone_reserve(z_name, size, node, 0); \
+ mz = rte_memzone_reserve_contig(z_name, size, node, 0); \
memset(mz->addr, 0, size); \
virt = mz->addr; \
phys = mz->iova; \
@@ -219,7 +220,7 @@ typedef uint64_t dma_addr_t;
ENA_TOUCH(dmadev); ENA_TOUCH(dev_node); \
snprintf(z_name, sizeof(z_name), \
"ena_alloc_%d", ena_alloc_cnt++); \
- mz = rte_memzone_reserve(z_name, size, node, 0); \
+ mz = rte_memzone_reserve_contig(z_name, size, node, 0); \
memset(mz->addr, 0, size); \
virt = mz->addr; \
} while (0)
--
2.7.4
Pavan Nikhilesh
2018-03-08 10:18:06 UTC
Permalink
Hi Anatoly,

I am trying to verify this patchset and have encountered a few issues.

A few -Werror=maybe-uninitialized errors in the eal_memalloc.c/eal_memory.c/
eal_common_memzone.c files:

diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index a7cfdaf03..ad4413507 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -321,7 +321,7 @@ rte_memzone_free(const struct rte_memzone *mz)
struct rte_fbarray *arr;
struct rte_memzone *found_mz;
int ret = 0;
- void *addr;
+ void *addr = NULL;
unsigned idx;

if (mz == NULL)
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index 1008faed6..32b0d5133 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -570,7 +570,7 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
unsigned int msl_idx;
int cur_idx, start_idx, end_idx, i, j, ret = -1;
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
- bool have_numa;
+ bool have_numa = false;
int oldpolicy;
struct bitmask *oldmask = numa_allocate_nodemask();
#endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index f74291fb6..d37b4a59b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -1386,9 +1386,9 @@ eal_legacy_hugepage_attach(void)
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct hugepage_file *hp = NULL;
unsigned int num_hp = 0;
- unsigned int i;
+ unsigned int i = 0;
int ms_idx, msl_idx;
- unsigned int cur_seg, max_seg;
+ unsigned int cur_seg, max_seg = 0;
off_t size = 0;
int fd, fd_hugepage = -1;



@Hemanth
Also, this patchset breaks the dpaa/dpaa2 bus drivers (they rely on
`rte_eal_get_physmem_layout`, which is deprecated:
http://dpdk.org/dev/patchwork/patch/34002/)
So, generic arm64 linuxapp build is broken.
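For illustration only, here is a minimal sketch of how a bus driver could walk
memory without rte_eal_get_physmem_layout(), assuming the memseg-walk callback
API proposed in this rework keeps roughly the shape below (rte_memseg_walk and
its callback signature are assumptions taken from the patchset, and this is not
the dpaa/dpaa2 code itself):

#include <inttypes.h>
#include <rte_log.h>
#include <rte_memory.h>

/* callback invoked once per memseg; returning non-zero stops the walk */
static int
count_memseg(const struct rte_memseg *ms, void *arg)
{
	unsigned int *count = arg;

	RTE_LOG(DEBUG, EAL, "memseg %u: va=%p iova=0x%" PRIx64 " len=%zu\n",
		*count, ms->addr, ms->iova, ms->len);
	(*count)++;
	return 0;
}

static unsigned int
scan_memsegs(void)
{
	unsigned int count = 0;

	/* with this rework the old fixed-size physmem layout array is no
	 * longer a reliable view of memory, so iterate via a callback */
	rte_memseg_walk(count_memseg, &count);
	return count;
}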

Regards,
Pavan.
This patchset introduces dynamic memory allocation for DPDK (aka memory
hotplug). Based upon RFC submitted in December [1].
- IPC bugfixes patchset [2]
- IPC improvements patchset [3]
- IPC asynchronous request API patch [4]
- Function to return number of sockets [5]
<snip>
--
2.7.4
Burakov, Anatoly
2018-03-08 10:46:46 UTC
Permalink
Post by John Daley (johndale)
Hi Anatoly,
I am trying to verify this patchset and have encountered few issues.
Few -Werror=maybe-uninitialized errors in eal_memalloc.c/eal_memory.c/
eal_common_memzone.c files.
Thanks for the heads up, i'll fix those in the next revision. Out of
curiosity, which compiler version are you using?
Post by John Daley (johndale)
diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index a7cfdaf03..ad4413507 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -321,7 +321,7 @@ rte_memzone_free(const struct rte_memzone *mz)
struct rte_fbarray *arr;
struct rte_memzone *found_mz;
int ret = 0;
- void *addr;
+ void *addr = NULL;
unsigned idx;
if (mz == NULL)
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index 1008faed6..32b0d5133 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -570,7 +570,7 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
unsigned int msl_idx;
int cur_idx, start_idx, end_idx, i, j, ret = -1;
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
- bool have_numa;
+ bool have_numa = false;
int oldpolicy;
struct bitmask *oldmask = numa_allocate_nodemask();
#endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index f74291fb6..d37b4a59b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -1386,9 +1386,9 @@ eal_legacy_hugepage_attach(void)
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct hugepage_file *hp = NULL;
unsigned int num_hp = 0;
- unsigned int i;
+ unsigned int i = 0;
int ms_idx, msl_idx;
- unsigned int cur_seg, max_seg;
+ unsigned int cur_seg, max_seg = 0;
off_t size = 0;
int fd, fd_hugepage = -1;
@Hemanth
Also, this patchset breaks dpaa/dpaa2 bus drivers (they rely on
`rte_eal_get_physmem_layout` that is depricated
http://dpdk.org/dev/patchwork/patch/34002/)
So, generic arm64 linuxapp build is broken.
Should the deprecation notice have been accompanied by marking that
function as __rte_deprecated?
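(For reference, a minimal sketch of what that marking would look like in the
header, assuming the usual __rte_deprecated attribute macro from rte_common.h
and the existing prototype:)

#include <rte_common.h>  /* __rte_deprecated */
#include <rte_memory.h>

/* keep the symbol for now, but make every caller emit a compile-time
 * deprecation warning until the API is actually removed */
__rte_deprecated
const struct rte_memseg *rte_eal_get_physmem_layout(void);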
Post by John Daley (johndale)
Regards,
Pavan.
This patchset introduces dynamic memory allocation for DPDK (aka memory
hotplug). Based upon RFC submitted in December [1].
- IPC bugfixes patchset [2]
- IPC improvements patchset [3]
- IPC asynchronous request API patch [4]
- Function to return number of sockets [5]
<snip>
--
2.7.4
--
Thanks,
Anatoly
Pavan Nikhilesh
2018-03-08 11:13:38 UTC
Permalink
Post by Burakov, Anatoly
Post by John Daley (johndale)
Hi Anatoly,
I am trying to verify this patchset and have encountered few issues.
Few -Werror=maybe-uninitialized errors in eal_memalloc.c/eal_memory.c/
eal_common_memzone.c files.
Thanks for the heads up, i'll fix those in the next revision. Out of
curiousity, which compiler version are you using?
I'm using gcc 5.3.0.
Post by Burakov, Anatoly
Post by John Daley (johndale)
diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index a7cfdaf03..ad4413507 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -321,7 +321,7 @@ rte_memzone_free(const struct rte_memzone *mz)
struct rte_fbarray *arr;
struct rte_memzone *found_mz;
int ret = 0;
- void *addr;
+ void *addr = NULL;
unsigned idx;
if (mz == NULL)
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index 1008faed6..32b0d5133 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -570,7 +570,7 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
unsigned int msl_idx;
int cur_idx, start_idx, end_idx, i, j, ret = -1;
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
- bool have_numa;
+ bool have_numa = false;
int oldpolicy;
struct bitmask *oldmask = numa_allocate_nodemask();
#endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index f74291fb6..d37b4a59b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -1386,9 +1386,9 @@ eal_legacy_hugepage_attach(void)
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct hugepage_file *hp = NULL;
unsigned int num_hp = 0;
- unsigned int i;
+ unsigned int i = 0;
int ms_idx, msl_idx;
- unsigned int cur_seg, max_seg;
+ unsigned int cur_seg, max_seg = 0;
off_t size = 0;
int fd, fd_hugepage = -1;
@Hemanth
Also, this patchset breaks dpaa/dpaa2 bus drivers (they rely on
`rte_eal_get_physmem_layout` that is depricated
http://dpdk.org/dev/patchwork/patch/34002/)
So, generic arm64 linuxapp build is broken.
Should the deprecation notice have been accompanied with marking that
function as __rte_deprecated?
Yup that's the general sequence.
Post by Burakov, Anatoly
Post by John Daley (johndale)
Regards,
Pavan.
This patchset introduces dynamic memory allocation for DPDK (aka memory
hotplug). Based upon RFC submitted in December [1].
- IPC bugfixes patchset [2]
- IPC improvements patchset [3]
- IPC asynchronous request API patch [4]
- Function to return number of sockets [5]
<snip>
--
2.7.4
--
Thanks,
Anatoly
Pavan Nikhilesh
2018-03-08 13:36:13 UTC
Permalink
Hi Anatoly,

We are currently facing issues with running testpmd on the thunderx platform.
The issue seems to be with VFIO:

EAL: Detected 24 lcore(s)
EAL: Detected 1 NUMA nodes
EAL: No free hugepages reported in hugepages-2048kB
EAL: Multi-process socket /var/run/.rte_unix
EAL: Probing VFIO support...
EAL: VFIO support initialized
EAL: VFIO support not initialized

<snip>

EAL: probe driver: 177d:a053 octeontx_fpavf
EAL: PCI device 0001:01:00.1 on NUMA socket 0
EAL: probe driver: 177d:a034 net_thunderx
EAL: using IOMMU type 1 (Type 1)
EAL: cannot set up DMA remapping, error 22 (Invalid argument)
EAL: 0001:01:00.1 DMA remapping failed, error 22 (Invalid argument)
EAL: Requested device 0001:01:00.1 cannot be used
EAL: PCI device 0001:01:00.2 on NUMA socket 0
<snip>
testpmd: No probed ethernet devices
testpmd: create a new mbuf pool <mbuf_pool_socket_0>: n=251456, size=2176, socket=0
testpmd: preferred mempool ops selected: ring_mp_mc
EAL: VFIO support not initialized
EAL: VFIO support not initialized
EAL: VFIO support not initialized
Done


This is because rte_service_init() calls rte_calloc() before
rte_bus_probe() and vfio_dma_mem_map fails because iommu type is not set.

Call stack:
gdb) bt
#0 vfio_dma_mem_map (vaddr=281439006359552, iova=11274289152, len=536870912, do_map=1) at /root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c:967
#1 0x00000000004fd974 in rte_vfio_dma_map (vaddr=281439006359552, iova=11274289152, len=536870912) at /root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c:988
#2 0x00000000004fbe78 in vfio_mem_event_callback (type=RTE_MEM_EVENT_ALLOC, addr=0xfff7a0000000, len=536870912) at /root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c:240
#3 0x00000000005070ac in eal_memalloc_notify (event=RTE_MEM_EVENT_ALLOC, start=0xfff7a0000000, len=536870912) at /root/clean/dpdk/lib/librte_eal/common/eal_common_memalloc.c:177
#4 0x0000000000515c98 in try_expand_heap_primary (heap=0xffffb7fb167c, pg_sz=536870912, elt_size=8192, socket=0, flags=0, align=128, bound=0, contig=false) at /root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:247
#5 0x0000000000515e94 in try_expand_heap (heap=0xffffb7fb167c, pg_sz=536870912, elt_size=8192, socket=0, flags=0, align=128, bound=0, contig=false) at /root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:327
#6 0x00000000005163a0 in alloc_more_mem_on_socket (heap=0xffffb7fb167c, size=8192, socket=0, flags=0, align=128, bound=0, contig=false) at /root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:455
#7 0x0000000000516514 in heap_alloc_on_socket (type=0x85bf90 "rte_services", size=8192, socket=0, flags=0, align=128, bound=0, contig=false) at /root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:491
#8 0x0000000000516664 in malloc_heap_alloc (type=0x85bf90 "rte_services", size=8192, socket_arg=-1, flags=0, align=128, bound=0, contig=false) at /root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:527
#9 0x0000000000513b54 in rte_malloc_socket (type=0x85bf90 "rte_services", size=8192, align=128, socket_arg=-1) at /root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:54
#10 0x0000000000513bc8 in rte_zmalloc_socket (type=0x85bf90 "rte_services", size=8192, align=128, socket=-1) at /root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:72
#11 0x0000000000513c00 in rte_zmalloc (type=0x85bf90 "rte_services", size=8192, align=128) at /root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:81
#12 0x0000000000513c90 in rte_calloc (type=0x85bf90 "rte_services", num=64, size=128, align=128) at /root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:99
#13 0x0000000000518cec in rte_service_init () at /root/clean/dpdk/lib/librte_eal/common/rte_service.c:81
#14 0x00000000004f55f4 in rte_eal_init (argc=3, argv=0xfffffffff488) at /root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal.c:959
#15 0x000000000045af5c in main (argc=3, argv=0xfffffffff488) at /root/clean/dpdk/app/test-pmd/testpmd.c:2483


Also, I have tried running with --legacy-mem but I'm stuck in the
`pci_find_max_end_va` loop because `rte_fbarray_find_next_used` always returns
0.
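(For what it's worth, the kind of scan I'd expect that code path to do looks
roughly like the sketch below. It is written against the rte_fbarray API from
this series, not copied from the actual pci_find_max_end_va(); if
find_next_used() keeps handing back index 0, a loop of this shape never
terminates.)

#include <rte_common.h>
#include <rte_fbarray.h>
#include <rte_memory.h>

/* sketch: find the highest end-of-segment VA across all used entries */
static void *
find_max_end_va(struct rte_fbarray *arr)
{
	void *max_va = NULL;
	int idx = 0;

	while ((idx = rte_fbarray_find_next_used(arr, idx)) >= 0) {
		const struct rte_memseg *ms = rte_fbarray_get(arr, idx);
		void *end_va = RTE_PTR_ADD(ms->addr, ms->len);

		if (end_va > max_va)
			max_va = end_va;
		idx++; /* advance past this entry, or the loop spins forever */
	}
	return max_va;
}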

HugePages_Total: 15
HugePages_Free: 11
HugePages_Rsvd: 0
HugePages_Surp: 0
Hugepagesize: 524288 kB

Call Stack:
(gdb) bt
#0 find_next (arr=0xffffb7fb009c, start=0, used=true) at /root/clean/dpdk/lib/librte_eal/common/eal_common_fbarray.c:248
#1 0x00000000005132a8 in rte_fbarray_find_next_used (arr=0xffffb7fb009c, start=0) at /root/clean/dpdk/lib/librte_eal/common/eal_common_fbarray.c:700
#2 0x000000000052d030 in pci_find_max_end_va () at /root/clean/dpdk/drivers/bus/pci/linux/pci.c:138
#3 0x0000000000530ab8 in pci_vfio_map_resource_primary (dev=0xeae700) at /root/clean/dpdk/drivers/bus/pci/linux/pci_vfio.c:499
#4 0x0000000000530ffc in pci_vfio_map_resource (dev=0xeae700) at /root/clean/dpdk/drivers/bus/pci/linux/pci_vfio.c:601
#5 0x000000000052ce90 in rte_pci_map_device (dev=0xeae700) at /root/clean/dpdk/drivers/bus/pci/linux/pci.c:75
#6 0x0000000000531a20 in rte_pci_probe_one_driver (dr=0x997e20 <rte_nicvf_pmd>, dev=0xeae700) at /root/clean/dpdk/drivers/bus/pci/pci_common.c:164
#7 0x0000000000531c68 in pci_probe_all_drivers (dev=0xeae700) at /root/clean/dpdk/drivers/bus/pci/pci_common.c:249
#8 0x0000000000531f68 in rte_pci_probe () at /root/clean/dpdk/drivers/bus/pci/pci_common.c:359
#9 0x000000000050a140 in rte_bus_probe () at /root/clean/dpdk/lib/librte_eal/common/eal_common_bus.c:98
#10 0x00000000004f55f4 in rte_eal_init (argc=1, argv=0xfffffffff498) at /root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal.c:967
#11 0x000000000045af5c in main (argc=1, argv=0xfffffffff498) at /root/clean/dpdk/app/test-pmd/testpmd.c:2483

Am I missing something here?


Thanks,
Pavan.
Post by Pavan Nikhilesh
Post by Burakov, Anatoly
Post by John Daley (johndale)
Hi Anatoly,
I am trying to verify this patchset and have encountered few issues.
Few -Werror=maybe-uninitialized errors in eal_memalloc.c/eal_memory.c/
eal_common_memzone.c files.
Thanks for the heads up, i'll fix those in the next revision. Out of
curiousity, which compiler version are you using?
I'm using gcc 5.3.0.
Post by Burakov, Anatoly
Post by John Daley (johndale)
diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index a7cfdaf03..ad4413507 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -321,7 +321,7 @@ rte_memzone_free(const struct rte_memzone *mz)
struct rte_fbarray *arr;
struct rte_memzone *found_mz;
int ret = 0;
- void *addr;
+ void *addr = NULL;
unsigned idx;
if (mz == NULL)
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index 1008faed6..32b0d5133 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -570,7 +570,7 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
unsigned int msl_idx;
int cur_idx, start_idx, end_idx, i, j, ret = -1;
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
- bool have_numa;
+ bool have_numa = false;
int oldpolicy;
struct bitmask *oldmask = numa_allocate_nodemask();
#endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index f74291fb6..d37b4a59b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -1386,9 +1386,9 @@ eal_legacy_hugepage_attach(void)
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct hugepage_file *hp = NULL;
unsigned int num_hp = 0;
- unsigned int i;
+ unsigned int i = 0;
int ms_idx, msl_idx;
- unsigned int cur_seg, max_seg;
+ unsigned int cur_seg, max_seg = 0;
off_t size = 0;
int fd, fd_hugepage = -1;
@Hemanth
Also, this patchset breaks dpaa/dpaa2 bus drivers (they rely on
`rte_eal_get_physmem_layout` that is depricated
http://dpdk.org/dev/patchwork/patch/34002/)
So, generic arm64 linuxapp build is broken.
Should the deprecation notice have been accompanied with marking that
function as __rte_deprecated?
Yup that's the general sequence.
Post by Burakov, Anatoly
Post by John Daley (johndale)
Regards,
Pavan.
This patchset introduces dynamic memory allocation for DPDK (aka memory
hotplug). Based upon RFC submitted in December [1].
- IPC bugfixes patchset [2]
- IPC improvements patchset [3]
- IPC asynchronous request API patch [4]
- Function to return number of sockets [5]
<snip>
--
2.7.4
--
Thanks,
Anatoly
Burakov, Anatoly
2018-03-08 14:36:42 UTC
Permalink
Post by John Daley (johndale)
Hi Anatoly,
We are currently facing issues with running testpmd on thunderx platform.
The issue seems to be with vfio
EAL: Detected 24 lcore(s)
EAL: Detected 1 NUMA nodes
EAL: No free hugepages reported in hugepages-2048kB
EAL: Multi-process socket /var/run/.rte_unix
EAL: Probing VFIO support...
EAL: VFIO support initialized
EAL: VFIO support not initialized
<snip>
EAL: probe driver: 177d:a053 octeontx_fpavf
EAL: PCI device 0001:01:00.1 on NUMA socket 0
EAL: probe driver: 177d:a034 net_thunderx
EAL: using IOMMU type 1 (Type 1)
EAL: cannot set up DMA remapping, error 22 (Invalid argument)
EAL: 0001:01:00.1 DMA remapping failed, error 22 (Invalid argument)
EAL: Requested device 0001:01:00.1 cannot be used
EAL: PCI device 0001:01:00.2 on NUMA socket 0
<snip>
testpmd: No probed ethernet devices
testpmd: create a new mbuf pool <mbuf_pool_socket_0>: n=251456, size=2176, socket=0
testpmd: preferred mempool ops selected: ring_mp_mc
EAL: VFIO support not initialized
EAL: VFIO support not initialized
EAL: VFIO support not initialized
Done
This is because rte_service_init() calls rte_calloc() before
rte_bus_probe() and vfio_dma_mem_map fails because iommu type is not set.
gdb) bt
#0 vfio_dma_mem_map (vaddr=281439006359552, iova=11274289152, len=536870912, do_map=1) at /root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c:967
#1 0x00000000004fd974 in rte_vfio_dma_map (vaddr=281439006359552, iova=11274289152, len=536870912) at /root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c:988
#2 0x00000000004fbe78 in vfio_mem_event_callback (type=RTE_MEM_EVENT_ALLOC, addr=0xfff7a0000000, len=536870912) at /root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c:240
#3 0x00000000005070ac in eal_memalloc_notify (event=RTE_MEM_EVENT_ALLOC, start=0xfff7a0000000, len=536870912) at /root/clean/dpdk/lib/librte_eal/common/eal_common_memalloc.c:177
#4 0x0000000000515c98 in try_expand_heap_primary (heap=0xffffb7fb167c, pg_sz=536870912, elt_size=8192, socket=0, flags=0, align=128, bound=0, contig=false) at /root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:247
#5 0x0000000000515e94 in try_expand_heap (heap=0xffffb7fb167c, pg_sz=536870912, elt_size=8192, socket=0, flags=0, align=128, bound=0, contig=false) at /root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:327
#6 0x00000000005163a0 in alloc_more_mem_on_socket (heap=0xffffb7fb167c, size=8192, socket=0, flags=0, align=128, bound=0, contig=false) at /root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:455
#7 0x0000000000516514 in heap_alloc_on_socket (type=0x85bf90 "rte_services", size=8192, socket=0, flags=0, align=128, bound=0, contig=false) at /root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:491
#8 0x0000000000516664 in malloc_heap_alloc (type=0x85bf90 "rte_services", size=8192, socket_arg=-1, flags=0, align=128, bound=0, contig=false) at /root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:527
#9 0x0000000000513b54 in rte_malloc_socket (type=0x85bf90 "rte_services", size=8192, align=128, socket_arg=-1) at /root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:54
#10 0x0000000000513bc8 in rte_zmalloc_socket (type=0x85bf90 "rte_services", size=8192, align=128, socket=-1) at /root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:72
#11 0x0000000000513c00 in rte_zmalloc (type=0x85bf90 "rte_services", size=8192, align=128) at /root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:81
#12 0x0000000000513c90 in rte_calloc (type=0x85bf90 "rte_services", num=64, size=128, align=128) at /root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:99
#13 0x0000000000518cec in rte_service_init () at /root/clean/dpdk/lib/librte_eal/common/rte_service.c:81
#14 0x00000000004f55f4 in rte_eal_init (argc=3, argv=0xfffffffff488) at /root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal.c:959
#15 0x000000000045af5c in main (argc=3, argv=0xfffffffff488) at /root/clean/dpdk/app/test-pmd/testpmd.c:2483
Also, I have tried running with --legacy-mem but I'm stuck in
`pci_find_max_end_va` loop because `rte_fbarray_find_next_used` always return
0. >
HugePages_Total: 15
HugePages_Free: 11
HugePages_Rsvd: 0
HugePages_Surp: 0
Hugepagesize: 524288 kB
(gdb) bt
#0 find_next (arr=0xffffb7fb009c, start=0, used=true) at /root/clean/dpdk/lib/librte_eal/common/eal_common_fbarray.c:248
#1 0x00000000005132a8 in rte_fbarray_find_next_used (arr=0xffffb7fb009c, start=0) at /root/clean/dpdk/lib/librte_eal/common/eal_common_fbarray.c:700
#2 0x000000000052d030 in pci_find_max_end_va () at /root/clean/dpdk/drivers/bus/pci/linux/pci.c:138
#3 0x0000000000530ab8 in pci_vfio_map_resource_primary (dev=0xeae700) at /root/clean/dpdk/drivers/bus/pci/linux/pci_vfio.c:499
#4 0x0000000000530ffc in pci_vfio_map_resource (dev=0xeae700) at /root/clean/dpdk/drivers/bus/pci/linux/pci_vfio.c:601
#5 0x000000000052ce90 in rte_pci_map_device (dev=0xeae700) at /root/clean/dpdk/drivers/bus/pci/linux/pci.c:75
#6 0x0000000000531a20 in rte_pci_probe_one_driver (dr=0x997e20 <rte_nicvf_pmd>, dev=0xeae700) at /root/clean/dpdk/drivers/bus/pci/pci_common.c:164
#7 0x0000000000531c68 in pci_probe_all_drivers (dev=0xeae700) at /root/clean/dpdk/drivers/bus/pci/pci_common.c:249
#8 0x0000000000531f68 in rte_pci_probe () at /root/clean/dpdk/drivers/bus/pci/pci_common.c:359
#9 0x000000000050a140 in rte_bus_probe () at /root/clean/dpdk/lib/librte_eal/common/eal_common_bus.c:98
#10 0x00000000004f55f4 in rte_eal_init (argc=1, argv=0xfffffffff498) at /root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal.c:967
#11 0x000000000045af5c in main (argc=1, argv=0xfffffffff498) at /root/clean/dpdk/app/test-pmd/testpmd.c:2483
Am I missing something here?
I'll look into those, thanks!

Btw, i've now set up a github repo with the patchset applied:

https://github.com/anatolyburakov/dpdk

I will be pushing quick fixes there before spinning new revisions, so we
can discover and fix bugs more rapidly. I'll fix compile issues reported
earlier, then i'll take a look at your issues. The latter one seems like
a typo, the former is probably a matter of moving things around a bit.

(also, pull requests welcome if you find it easier to fix things
yourself and submit patches against my tree!)

Thanks for testing.
--
Thanks,
Anatoly
Burakov, Anatoly
2018-03-08 20:11:58 UTC
Permalink
Post by Burakov, Anatoly
Post by John Daley (johndale)
Hi Anatoly,
We are currently facing issues with running testpmd on thunderx platform.
The issue seems to be with vfio
EAL: Detected 24 lcore(s)
EAL: Detected 1 NUMA nodes
EAL: No free hugepages reported in hugepages-2048kB
EAL: Multi-process socket /var/run/.rte_unix
EAL: Probing VFIO support...
EAL: VFIO support initialized
EAL:   VFIO support not initialized
<snip>
EAL:   probe driver: 177d:a053 octeontx_fpavf
EAL: PCI device 0001:01:00.1 on NUMA socket 0
EAL:   probe driver: 177d:a034 net_thunderx
EAL:   using IOMMU type 1 (Type 1)
EAL:   cannot set up DMA remapping, error 22 (Invalid argument)
EAL:   0001:01:00.1 DMA remapping failed, error 22 (Invalid argument)
EAL: Requested device 0001:01:00.1 cannot be used
EAL: PCI device 0001:01:00.2 on NUMA socket 0
<snip>
testpmd: No probed ethernet devices
testpmd: create a new mbuf pool <mbuf_pool_socket_0>: n=251456, size=2176, socket=0
testpmd: preferred mempool ops selected: ring_mp_mc
EAL:   VFIO support not initialized
EAL:   VFIO support not initialized
EAL:   VFIO support not initialized
Done
This is because rte_service_init() calls rte_calloc() before
rte_bus_probe() and vfio_dma_mem_map fails because iommu type is not set.
gdb) bt
#0  vfio_dma_mem_map (vaddr=281439006359552, iova=11274289152,
len=536870912, do_map=1) at
/root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c:967
#1  0x00000000004fd974 in rte_vfio_dma_map (vaddr=281439006359552,
iova=11274289152, len=536870912) at
/root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c:988
#2  0x00000000004fbe78 in vfio_mem_event_callback
(type=RTE_MEM_EVENT_ALLOC, addr=0xfff7a0000000, len=536870912) at
/root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c:240
#3  0x00000000005070ac in eal_memalloc_notify
(event=RTE_MEM_EVENT_ALLOC, start=0xfff7a0000000, len=536870912) at
/root/clean/dpdk/lib/librte_eal/common/eal_common_memalloc.c:177
#4  0x0000000000515c98 in try_expand_heap_primary
(heap=0xffffb7fb167c, pg_sz=536870912, elt_size=8192, socket=0,
flags=0, align=128, bound=0, contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:247
#5  0x0000000000515e94 in try_expand_heap (heap=0xffffb7fb167c,
pg_sz=536870912, elt_size=8192, socket=0, flags=0, align=128, bound=0,
contig=false) at /root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:327
#6  0x00000000005163a0 in alloc_more_mem_on_socket
(heap=0xffffb7fb167c, size=8192, socket=0, flags=0, align=128,
bound=0, contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:455
#7  0x0000000000516514 in heap_alloc_on_socket (type=0x85bf90
"rte_services", size=8192, socket=0, flags=0, align=128, bound=0,
contig=false) at /root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:491
#8  0x0000000000516664 in malloc_heap_alloc (type=0x85bf90
"rte_services", size=8192, socket_arg=-1, flags=0, align=128, bound=0,
contig=false) at /root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:527
#9  0x0000000000513b54 in rte_malloc_socket (type=0x85bf90
"rte_services", size=8192, align=128, socket_arg=-1) at
/root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:54
#10 0x0000000000513bc8 in rte_zmalloc_socket (type=0x85bf90
"rte_services", size=8192, align=128, socket=-1) at
/root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:72
#11 0x0000000000513c00 in rte_zmalloc (type=0x85bf90 "rte_services",
size=8192, align=128) at
/root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:81
#12 0x0000000000513c90 in rte_calloc (type=0x85bf90 "rte_services",
num=64, size=128, align=128) at
/root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:99
#13 0x0000000000518cec in rte_service_init () at
/root/clean/dpdk/lib/librte_eal/common/rte_service.c:81
#14 0x00000000004f55f4 in rte_eal_init (argc=3, argv=0xfffffffff488)
at /root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal.c:959
#15 0x000000000045af5c in main (argc=3, argv=0xfffffffff488) at
/root/clean/dpdk/app/test-pmd/testpmd.c:2483
Also, I have tried running with --legacy-mem but I'm stuck in
`pci_find_max_end_va` loop  because `rte_fbarray_find_next_used`
always return
0. >
HugePages_Total:      15
HugePages_Free:       11
HugePages_Rsvd:        0
HugePages_Surp:        0
Hugepagesize:     524288 kB
(gdb) bt
#0  find_next (arr=0xffffb7fb009c, start=0, used=true) at
/root/clean/dpdk/lib/librte_eal/common/eal_common_fbarray.c:248
#1  0x00000000005132a8 in rte_fbarray_find_next_used
(arr=0xffffb7fb009c, start=0) at
/root/clean/dpdk/lib/librte_eal/common/eal_common_fbarray.c:700
#2  0x000000000052d030 in pci_find_max_end_va () at
/root/clean/dpdk/drivers/bus/pci/linux/pci.c:138
#3  0x0000000000530ab8 in pci_vfio_map_resource_primary (dev=0xeae700)
at /root/clean/dpdk/drivers/bus/pci/linux/pci_vfio.c:499
#4  0x0000000000530ffc in pci_vfio_map_resource (dev=0xeae700) at
/root/clean/dpdk/drivers/bus/pci/linux/pci_vfio.c:601
#5  0x000000000052ce90 in rte_pci_map_device (dev=0xeae700) at
/root/clean/dpdk/drivers/bus/pci/linux/pci.c:75
#6  0x0000000000531a20 in rte_pci_probe_one_driver (dr=0x997e20
<rte_nicvf_pmd>, dev=0xeae700) at
/root/clean/dpdk/drivers/bus/pci/pci_common.c:164
#7  0x0000000000531c68 in pci_probe_all_drivers (dev=0xeae700) at
/root/clean/dpdk/drivers/bus/pci/pci_common.c:249
#8  0x0000000000531f68 in rte_pci_probe () at
/root/clean/dpdk/drivers/bus/pci/pci_common.c:359
#9  0x000000000050a140 in rte_bus_probe () at
/root/clean/dpdk/lib/librte_eal/common/eal_common_bus.c:98
#10 0x00000000004f55f4 in rte_eal_init (argc=1, argv=0xfffffffff498)
at /root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal.c:967
#11 0x000000000045af5c in main (argc=1, argv=0xfffffffff498) at
/root/clean/dpdk/app/test-pmd/testpmd.c:2483
Am I missing something here?
I'll look into those, thanks!
https://github.com/anatolyburakov/dpdk
I will be pushing quick fixes there before spinning new revisions, so we
can discover and fix bugs more rapidly. I'll fix compile issues reported
earlier, then i'll take a look at your issues. The latter one seems like
a typo, the former is probably a matter of moving things around a bit.
(also, pull requests welcome if you find it easier to fix things
yourself and submit patches against my tree!)
Thanks for testing.
I've looked into the failures.

The VFIO one is not actually a failure. It only prints out errors
because rte_malloc is called before VFIO is initialized. However, once
VFIO *is* initialized, all of that memory would be added to VFIO, so
these error messages are harmless. Regardless, i've added a check to see
if init is finished before printing out those errors, so they won't be
printed out any more.
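(To illustrate the shape of that check: the sketch below only reuses names that
appear in the backtraces above; the init-state helper is a placeholder for
illustration and not necessarily what was pushed to the tree, and IOVA == VA is
assumed purely for brevity.)

#include <stdbool.h>
#include <stdint.h>
#include <rte_memory.h>
#include <rte_vfio.h>

/* placeholder for whatever "is EAL init finished?" check the tree ends up
 * using -- not a real EAL API */
extern bool eal_init_is_complete(void);

/* sketch: in the VFIO mem event handler, stay quiet while EAL init is
 * still in progress -- the memory gets mapped once the container and
 * IOMMU type are actually set up */
static void
vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len)
{
	uint64_t va = (uint64_t)(uintptr_t)addr;

	if (!eal_init_is_complete())
		return; /* too early: nothing to map yet, and nothing to log */

	if (type == RTE_MEM_EVENT_ALLOC)
		rte_vfio_dma_map(va, va, len); /* IOVA == VA assumed here */
	/* RTE_MEM_EVENT_FREE would be handled analogously with an unmap */
}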

Second one is a typo on my part that got lost in one of the rebases.

I've pushed fixes for both into the github repo.
--
Thanks,
Anatoly
Burakov, Anatoly
2018-03-08 20:33:21 UTC
Permalink
Post by Burakov, Anatoly
Post by Burakov, Anatoly
Post by John Daley (johndale)
Hi Anatoly,
We are currently facing issues with running testpmd on thunderx platform.
The issue seems to be with vfio
EAL: Detected 24 lcore(s)
EAL: Detected 1 NUMA nodes
EAL: No free hugepages reported in hugepages-2048kB
EAL: Multi-process socket /var/run/.rte_unix
EAL: Probing VFIO support...
EAL: VFIO support initialized
EAL:   VFIO support not initialized
<snip>
EAL:   probe driver: 177d:a053 octeontx_fpavf
EAL: PCI device 0001:01:00.1 on NUMA socket 0
EAL:   probe driver: 177d:a034 net_thunderx
EAL:   using IOMMU type 1 (Type 1)
EAL:   cannot set up DMA remapping, error 22 (Invalid argument)
EAL:   0001:01:00.1 DMA remapping failed, error 22 (Invalid argument)
EAL: Requested device 0001:01:00.1 cannot be used
EAL: PCI device 0001:01:00.2 on NUMA socket 0
<snip>
testpmd: No probed ethernet devices
testpmd: create a new mbuf pool <mbuf_pool_socket_0>: n=251456, size=2176, socket=0
testpmd: preferred mempool ops selected: ring_mp_mc
EAL:   VFIO support not initialized
EAL:   VFIO support not initialized
EAL:   VFIO support not initialized
Done
This is because rte_service_init() calls rte_calloc() before
rte_bus_probe() and vfio_dma_mem_map fails because iommu type is not set.
gdb) bt
#0  vfio_dma_mem_map (vaddr=281439006359552, iova=11274289152,
len=536870912, do_map=1) at
/root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c:967
#1  0x00000000004fd974 in rte_vfio_dma_map (vaddr=281439006359552,
iova=11274289152, len=536870912) at
/root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c:988
#2  0x00000000004fbe78 in vfio_mem_event_callback
(type=RTE_MEM_EVENT_ALLOC, addr=0xfff7a0000000, len=536870912) at
/root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c:240
#3  0x00000000005070ac in eal_memalloc_notify
(event=RTE_MEM_EVENT_ALLOC, start=0xfff7a0000000, len=536870912) at
/root/clean/dpdk/lib/librte_eal/common/eal_common_memalloc.c:177
#4  0x0000000000515c98 in try_expand_heap_primary
(heap=0xffffb7fb167c, pg_sz=536870912, elt_size=8192, socket=0,
flags=0, align=128, bound=0, contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:247
#5  0x0000000000515e94 in try_expand_heap (heap=0xffffb7fb167c,
pg_sz=536870912, elt_size=8192, socket=0, flags=0, align=128,
bound=0, contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:327
#6  0x00000000005163a0 in alloc_more_mem_on_socket
(heap=0xffffb7fb167c, size=8192, socket=0, flags=0, align=128,
bound=0, contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:455
#7  0x0000000000516514 in heap_alloc_on_socket (type=0x85bf90
"rte_services", size=8192, socket=0, flags=0, align=128, bound=0,
contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:491
#8  0x0000000000516664 in malloc_heap_alloc (type=0x85bf90
"rte_services", size=8192, socket_arg=-1, flags=0, align=128,
bound=0, contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:527
#9  0x0000000000513b54 in rte_malloc_socket (type=0x85bf90
"rte_services", size=8192, align=128, socket_arg=-1) at
/root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:54
#10 0x0000000000513bc8 in rte_zmalloc_socket (type=0x85bf90
"rte_services", size=8192, align=128, socket=-1) at
/root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:72
#11 0x0000000000513c00 in rte_zmalloc (type=0x85bf90 "rte_services",
size=8192, align=128) at
/root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:81
#12 0x0000000000513c90 in rte_calloc (type=0x85bf90 "rte_services",
num=64, size=128, align=128) at
/root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:99
#13 0x0000000000518cec in rte_service_init () at
/root/clean/dpdk/lib/librte_eal/common/rte_service.c:81
#14 0x00000000004f55f4 in rte_eal_init (argc=3, argv=0xfffffffff488)
at /root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal.c:959
#15 0x000000000045af5c in main (argc=3, argv=0xfffffffff488) at
/root/clean/dpdk/app/test-pmd/testpmd.c:2483
Also, I have tried running with --legacy-mem but I'm stuck in
`pci_find_max_end_va` loop  because `rte_fbarray_find_next_used`
always return
0. >
HugePages_Total:      15
HugePages_Free:       11
HugePages_Rsvd:        0
HugePages_Surp:        0
Hugepagesize:     524288 kB
(gdb) bt
#0  find_next (arr=0xffffb7fb009c, start=0, used=true) at
/root/clean/dpdk/lib/librte_eal/common/eal_common_fbarray.c:248
#1  0x00000000005132a8 in rte_fbarray_find_next_used
(arr=0xffffb7fb009c, start=0) at
/root/clean/dpdk/lib/librte_eal/common/eal_common_fbarray.c:700
#2  0x000000000052d030 in pci_find_max_end_va () at
/root/clean/dpdk/drivers/bus/pci/linux/pci.c:138
#3  0x0000000000530ab8 in pci_vfio_map_resource_primary
(dev=0xeae700) at /root/clean/dpdk/drivers/bus/pci/linux/pci_vfio.c:499
#4  0x0000000000530ffc in pci_vfio_map_resource (dev=0xeae700) at
/root/clean/dpdk/drivers/bus/pci/linux/pci_vfio.c:601
#5  0x000000000052ce90 in rte_pci_map_device (dev=0xeae700) at
/root/clean/dpdk/drivers/bus/pci/linux/pci.c:75
#6  0x0000000000531a20 in rte_pci_probe_one_driver (dr=0x997e20
<rte_nicvf_pmd>, dev=0xeae700) at
/root/clean/dpdk/drivers/bus/pci/pci_common.c:164
#7  0x0000000000531c68 in pci_probe_all_drivers (dev=0xeae700) at
/root/clean/dpdk/drivers/bus/pci/pci_common.c:249
#8  0x0000000000531f68 in rte_pci_probe () at
/root/clean/dpdk/drivers/bus/pci/pci_common.c:359
#9  0x000000000050a140 in rte_bus_probe () at
/root/clean/dpdk/lib/librte_eal/common/eal_common_bus.c:98
#10 0x00000000004f55f4 in rte_eal_init (argc=1, argv=0xfffffffff498)
at /root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal.c:967
#11 0x000000000045af5c in main (argc=1, argv=0xfffffffff498) at
/root/clean/dpdk/app/test-pmd/testpmd.c:2483
Am I missing something here?
I'll look into those, thanks!
https://github.com/anatolyburakov/dpdk
I will be pushing quick fixes there before spinning new revisions, so
we can discover and fix bugs more rapidly. I'll fix compile issues
reported earlier, then i'll take a look at your issues. The latter one
seems like a typo, the former is probably a matter of moving things
around a bit.
(also, pull requests welcome if you find it easier to fix things
yourself and submit patches against my tree!)
Thanks for testing.
I've looked into the failures.
The VFIO one is not actually a failure. It only prints out errors
because rte_malloc is called before VFIO is initialized. However, once
VFIO *is* initialized, all of that memory would be added to VFIO, so
these error messages are harmless. Regardless, i've added a check to see
if init is finished before printing out those errors, so they won't be
printed out any more.
Second one is a typo on my part that got lost in one of the rebases.
I've pushed fixes for both into the github repo.
Although i do wonder where the DMA remapping errors come from. The
error message says "invalid argument", so that doesn't come from
rte_service or anything to do with rte_malloc - this is us not providing
valid arguments to VFIO. I'm not seeing these errors on my system. I'll
check on others to be sure.
--
Thanks,
Anatoly
Pavan Nikhilesh
2018-03-09 09:15:15 UTC
Permalink
Post by Burakov, Anatoly
Post by Burakov, Anatoly
Post by John Daley (johndale)
Hi Anatoly,
We are currently facing issues with running testpmd on thunderx platform.
The issue seems to be with vfio
EAL: Detected 24 lcore(s)
EAL: Detected 1 NUMA nodes
EAL: No free hugepages reported in hugepages-2048kB
EAL: Multi-process socket /var/run/.rte_unix
EAL: Probing VFIO support...
EAL: VFIO support initialized
EAL:   VFIO support not initialized
<snip>
EAL:   probe driver: 177d:a053 octeontx_fpavf
EAL: PCI device 0001:01:00.1 on NUMA socket 0
EAL:   probe driver: 177d:a034 net_thunderx
EAL:   using IOMMU type 1 (Type 1)
EAL:   cannot set up DMA remapping, error 22 (Invalid argument)
EAL:   0001:01:00.1 DMA remapping failed, error 22 (Invalid argument)
EAL: Requested device 0001:01:00.1 cannot be used
EAL: PCI device 0001:01:00.2 on NUMA socket 0
<snip>
testpmd: No probed ethernet devices
testpmd: create a new mbuf pool <mbuf_pool_socket_0>: n=251456,
size=2176, socket=0
testpmd: preferred mempool ops selected: ring_mp_mc
EAL:   VFIO support not initialized
EAL:   VFIO support not initialized
EAL:   VFIO support not initialized
Done
This is because rte_service_init() calls rte_calloc() before
rte_bus_probe() and vfio_dma_mem_map fails because iommu type is not set.
gdb) bt
#0  vfio_dma_mem_map (vaddr=281439006359552, iova=11274289152,
len=536870912, do_map=1) at
/root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c:967
#1  0x00000000004fd974 in rte_vfio_dma_map
(vaddr=281439006359552, iova=11274289152, len=536870912) at
/root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c:988
#2  0x00000000004fbe78 in vfio_mem_event_callback
(type=RTE_MEM_EVENT_ALLOC, addr=0xfff7a0000000, len=536870912)
at /root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c:240
#3  0x00000000005070ac in eal_memalloc_notify
(event=RTE_MEM_EVENT_ALLOC, start=0xfff7a0000000, len=536870912) at
/root/clean/dpdk/lib/librte_eal/common/eal_common_memalloc.c:177
#4  0x0000000000515c98 in try_expand_heap_primary
(heap=0xffffb7fb167c, pg_sz=536870912, elt_size=8192, socket=0,
flags=0, align=128, bound=0, contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:247
#5  0x0000000000515e94 in try_expand_heap (heap=0xffffb7fb167c,
pg_sz=536870912, elt_size=8192, socket=0, flags=0, align=128,
bound=0, contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:327
#6  0x00000000005163a0 in alloc_more_mem_on_socket
(heap=0xffffb7fb167c, size=8192, socket=0, flags=0, align=128,
bound=0, contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:455
#7  0x0000000000516514 in heap_alloc_on_socket (type=0x85bf90
"rte_services", size=8192, socket=0, flags=0, align=128,
bound=0, contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:491
#8  0x0000000000516664 in malloc_heap_alloc (type=0x85bf90
"rte_services", size=8192, socket_arg=-1, flags=0, align=128,
bound=0, contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:527
#9  0x0000000000513b54 in rte_malloc_socket (type=0x85bf90
"rte_services", size=8192, align=128, socket_arg=-1) at
/root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:54
#10 0x0000000000513bc8 in rte_zmalloc_socket (type=0x85bf90
"rte_services", size=8192, align=128, socket=-1) at
/root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:72
#11 0x0000000000513c00 in rte_zmalloc (type=0x85bf90
"rte_services", size=8192, align=128) at
/root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:81
#12 0x0000000000513c90 in rte_calloc (type=0x85bf90
"rte_services", num=64, size=128, align=128) at
/root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:99
#13 0x0000000000518cec in rte_service_init () at
/root/clean/dpdk/lib/librte_eal/common/rte_service.c:81
#14 0x00000000004f55f4 in rte_eal_init (argc=3,
argv=0xfffffffff488) at
/root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal.c:959
#15 0x000000000045af5c in main (argc=3, argv=0xfffffffff488) at
/root/clean/dpdk/app/test-pmd/testpmd.c:2483
Also, I have tried running with --legacy-mem but I'm stuck in
`pci_find_max_end_va` loop  because `rte_fbarray_find_next_used`
always return
0. >
HugePages_Total:      15
HugePages_Free:       11
HugePages_Rsvd:        0
HugePages_Surp:        0
Hugepagesize:     524288 kB
(gdb) bt
#0  find_next (arr=0xffffb7fb009c, start=0, used=true) at
/root/clean/dpdk/lib/librte_eal/common/eal_common_fbarray.c:248
#1  0x00000000005132a8 in rte_fbarray_find_next_used
(arr=0xffffb7fb009c, start=0) at
/root/clean/dpdk/lib/librte_eal/common/eal_common_fbarray.c:700
#2  0x000000000052d030 in pci_find_max_end_va () at
/root/clean/dpdk/drivers/bus/pci/linux/pci.c:138
#3  0x0000000000530ab8 in pci_vfio_map_resource_primary
(dev=0xeae700) at
/root/clean/dpdk/drivers/bus/pci/linux/pci_vfio.c:499
#4  0x0000000000530ffc in pci_vfio_map_resource (dev=0xeae700)
at /root/clean/dpdk/drivers/bus/pci/linux/pci_vfio.c:601
#5  0x000000000052ce90 in rte_pci_map_device (dev=0xeae700) at
/root/clean/dpdk/drivers/bus/pci/linux/pci.c:75
#6  0x0000000000531a20 in rte_pci_probe_one_driver (dr=0x997e20
<rte_nicvf_pmd>, dev=0xeae700) at
/root/clean/dpdk/drivers/bus/pci/pci_common.c:164
#7  0x0000000000531c68 in pci_probe_all_drivers (dev=0xeae700)
at /root/clean/dpdk/drivers/bus/pci/pci_common.c:249
#8  0x0000000000531f68 in rte_pci_probe () at
/root/clean/dpdk/drivers/bus/pci/pci_common.c:359
#9  0x000000000050a140 in rte_bus_probe () at
/root/clean/dpdk/lib/librte_eal/common/eal_common_bus.c:98
#10 0x00000000004f55f4 in rte_eal_init (argc=1,
argv=0xfffffffff498) at
/root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal.c:967
#11 0x000000000045af5c in main (argc=1, argv=0xfffffffff498) at
/root/clean/dpdk/app/test-pmd/testpmd.c:2483
Am I missing something here?
I'll look into those, thanks!
https://github.com/anatolyburakov/dpdk
I will be pushing quick fixes there before spinning new revisions,
so we can discover and fix bugs more rapidly. I'll fix compile
issues reported earlier, then i'll take a look at your issues. The
latter one seems like a typo, the former is probably a matter of
moving things around a bit.
(also, pull requests welcome if you find it easier to fix things
yourself and submit patches against my tree!)
Thanks for testing.
I've looked into the failures.
The VFIO one is not actually a failure. It only prints out errors
because rte_malloc is called before VFIO is initialized. However, once
VFIO *is* initialized, all of that memory would be added to VFIO, so
these error messages are harmless. Regardless, i've added a check to see
if init is finished before printing out those errors, so they won't be
printed out any more.
Second one is a typo on my part that got lost in one of the rebases.
I've pushed fixes for both into the github repo.
Although i do wonder where do the DMA remapping errors come from. The error
message says "invalid argument", so that doesn't come from rte_service or
anything to do with rte_malloc - this is us not providing valid arguments to
VFIO. I'm not seeing these errors on my system. I'll check on others to be
sure.
I have taken a look at the github tree; the issues with VFIO are gone, although
compilation issues with dpaa/dpaa2 are still present due to their dependency on
`rte_eal_get_physmem_layout`.
--
Thanks,
Anatoly
Thanks,
Pavan
Burakov, Anatoly
2018-03-09 10:42:03 UTC
Permalink
Post by Pavan Nikhilesh
Post by Burakov, Anatoly
Post by Burakov, Anatoly
Post by John Daley (johndale)
Hi Anatoly,
We are currently facing issues with running testpmd on thunderx platform.
The issue seems to be with vfio
EAL: Detected 24 lcore(s)
EAL: Detected 1 NUMA nodes
EAL: No free hugepages reported in hugepages-2048kB
EAL: Multi-process socket /var/run/.rte_unix
EAL: Probing VFIO support...
EAL: VFIO support initialized
EAL:   VFIO support not initialized
<snip>
EAL:   probe driver: 177d:a053 octeontx_fpavf
EAL: PCI device 0001:01:00.1 on NUMA socket 0
EAL:   probe driver: 177d:a034 net_thunderx
EAL:   using IOMMU type 1 (Type 1)
EAL:   cannot set up DMA remapping, error 22 (Invalid argument)
EAL:   0001:01:00.1 DMA remapping failed, error 22 (Invalid argument)
EAL: Requested device 0001:01:00.1 cannot be used
EAL: PCI device 0001:01:00.2 on NUMA socket 0
<snip>
testpmd: No probed ethernet devices
testpmd: create a new mbuf pool <mbuf_pool_socket_0>: n=251456, size=2176, socket=0
testpmd: preferred mempool ops selected: ring_mp_mc
EAL:   VFIO support not initialized
EAL:   VFIO support not initialized
EAL:   VFIO support not initialized
Done
This is because rte_service_init() calls rte_calloc() before
rte_bus_probe() and vfio_dma_mem_map fails because iommu type is not set.
gdb) bt
#0  vfio_dma_mem_map (vaddr=281439006359552, iova=11274289152,
len=536870912, do_map=1) at
/root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c:967
#1  0x00000000004fd974 in rte_vfio_dma_map
(vaddr=281439006359552, iova=11274289152, len=536870912) at
/root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c:988
#2  0x00000000004fbe78 in vfio_mem_event_callback
(type=RTE_MEM_EVENT_ALLOC, addr=0xfff7a0000000, len=536870912)
at /root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c:240
#3  0x00000000005070ac in eal_memalloc_notify
(event=RTE_MEM_EVENT_ALLOC, start=0xfff7a0000000, len=536870912) at
/root/clean/dpdk/lib/librte_eal/common/eal_common_memalloc.c:177
#4  0x0000000000515c98 in try_expand_heap_primary
(heap=0xffffb7fb167c, pg_sz=536870912, elt_size=8192, socket=0,
flags=0, align=128, bound=0, contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:247
#5  0x0000000000515e94 in try_expand_heap (heap=0xffffb7fb167c,
pg_sz=536870912, elt_size=8192, socket=0, flags=0, align=128,
bound=0, contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:327
#6  0x00000000005163a0 in alloc_more_mem_on_socket
(heap=0xffffb7fb167c, size=8192, socket=0, flags=0, align=128,
bound=0, contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:455
#7  0x0000000000516514 in heap_alloc_on_socket (type=0x85bf90
"rte_services", size=8192, socket=0, flags=0, align=128,
bound=0, contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:491
#8  0x0000000000516664 in malloc_heap_alloc (type=0x85bf90
"rte_services", size=8192, socket_arg=-1, flags=0, align=128,
bound=0, contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:527
#9  0x0000000000513b54 in rte_malloc_socket (type=0x85bf90
"rte_services", size=8192, align=128, socket_arg=-1) at
/root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:54
#10 0x0000000000513bc8 in rte_zmalloc_socket (type=0x85bf90
"rte_services", size=8192, align=128, socket=-1) at
/root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:72
#11 0x0000000000513c00 in rte_zmalloc (type=0x85bf90
"rte_services", size=8192, align=128) at
/root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:81
#12 0x0000000000513c90 in rte_calloc (type=0x85bf90
"rte_services", num=64, size=128, align=128) at
/root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:99
#13 0x0000000000518cec in rte_service_init () at
/root/clean/dpdk/lib/librte_eal/common/rte_service.c:81
#14 0x00000000004f55f4 in rte_eal_init (argc=3,
argv=0xfffffffff488) at
/root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal.c:959
#15 0x000000000045af5c in main (argc=3, argv=0xfffffffff488) at
/root/clean/dpdk/app/test-pmd/testpmd.c:2483
Also, I have tried running with --legacy-mem but I'm stuck in
`pci_find_max_end_va` loop  because `rte_fbarray_find_next_used`
always return
0. >
HugePages_Total:      15
HugePages_Free:       11
HugePages_Rsvd:        0
HugePages_Surp:        0
Hugepagesize:     524288 kB
(gdb) bt
#0  find_next (arr=0xffffb7fb009c, start=0, used=true) at
/root/clean/dpdk/lib/librte_eal/common/eal_common_fbarray.c:248
#1  0x00000000005132a8 in rte_fbarray_find_next_used
(arr=0xffffb7fb009c, start=0) at
/root/clean/dpdk/lib/librte_eal/common/eal_common_fbarray.c:700
#2  0x000000000052d030 in pci_find_max_end_va () at
/root/clean/dpdk/drivers/bus/pci/linux/pci.c:138
#3  0x0000000000530ab8 in pci_vfio_map_resource_primary
(dev=0xeae700) at
/root/clean/dpdk/drivers/bus/pci/linux/pci_vfio.c:499
#4  0x0000000000530ffc in pci_vfio_map_resource (dev=0xeae700)
at /root/clean/dpdk/drivers/bus/pci/linux/pci_vfio.c:601
#5  0x000000000052ce90 in rte_pci_map_device (dev=0xeae700) at
/root/clean/dpdk/drivers/bus/pci/linux/pci.c:75
#6  0x0000000000531a20 in rte_pci_probe_one_driver (dr=0x997e20
<rte_nicvf_pmd>, dev=0xeae700) at
/root/clean/dpdk/drivers/bus/pci/pci_common.c:164
#7  0x0000000000531c68 in pci_probe_all_drivers (dev=0xeae700)
at /root/clean/dpdk/drivers/bus/pci/pci_common.c:249
#8  0x0000000000531f68 in rte_pci_probe () at
/root/clean/dpdk/drivers/bus/pci/pci_common.c:359
#9  0x000000000050a140 in rte_bus_probe () at
/root/clean/dpdk/lib/librte_eal/common/eal_common_bus.c:98
#10 0x00000000004f55f4 in rte_eal_init (argc=1,
argv=0xfffffffff498) at
/root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal.c:967
#11 0x000000000045af5c in main (argc=1, argv=0xfffffffff498) at
/root/clean/dpdk/app/test-pmd/testpmd.c:2483
Am I missing something here?
I'll look into those, thanks!
https://github.com/anatolyburakov/dpdk
I will be pushing quick fixes there before spinning new revisions,
so we can discover and fix bugs more rapidly. I'll fix compile
issues reported earlier, then i'll take a look at your issues. The
latter one seems like a typo, the former is probably a matter of
moving things around a bit.
(also, pull requests welcome if you find it easier to fix things
yourself and submit patches against my tree!)
Thanks for testing.
I've looked into the failures.
The VFIO one is not actually a failure. It only prints out errors
because rte_malloc is called before VFIO is initialized. However, once
VFIO *is* initialized, all of that memory would be added to VFIO, so
these error messages are harmless. Regardless, i've added a check to see
if init is finished before printing out those errors, so they won't be
printed out any more.
Second one is a typo on my part that got lost in one of the rebases.
I've pushed fixes for both into the github repo.
Although i do wonder where do the DMA remapping errors come from. The error
message says "invalid argument", so that doesn't come from rte_service or
anything to do with rte_malloc - this is us not providing valid arguments to
VFIO. I'm not seeing these errors on my system. I'll check on others to be
sure.
I have taken a look at the github tree the issues with VFIO are gone, Although
compilation issues with dpaa/dpaa2 are still present due to their dependency on
`rte_eal_get_physmem_layout`.
I've fixed the dpaa compile issue and pushed it to github. I've tried to
keep the semantics the same as before, but i can't compile-test (let
alone test-test) them as i don't have access to a system with dpaa bus.

Also, you might want to know that the dpaa bus driver references
RTE_LIBRTE_DPAA_MAX_CRYPTODEV, which is only found in
config/common_armv8a_linuxapp but is not present in the base config. Not
sure if that's an issue.
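(i.e. something along the lines of the sketch below in config/common_base,
mirroring what config/common_armv8a_linuxapp already defines -- the value shown
is an assumption, not checked against the tree:)

# assumed sketch for config/common_base: give the dpaa crypto option a
# default so builds that pull in the dpaa bus outside armv8 still see it
CONFIG_RTE_LIBRTE_DPAA_MAX_CRYPTODEV=4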
--
Thanks,
Anatoly
Shreyansh Jain
2018-03-13 05:17:16 UTC
Permalink
Hello Anatoly,

On Fri, Mar 9, 2018 at 4:12 PM, Burakov, Anatoly
[...]
Post by Burakov, Anatoly
Post by Pavan Nikhilesh
I have taken a look at the github tree the issues with VFIO are gone, Although
compilation issues with dpaa/dpaa2 are still present due to their dependency on
`rte_eal_get_physmem_layout`.
I've fixed the dpaa compile issue and pushed it to github. I've tried to
keep the semantics the same as before, but i can't compile-test (let alone
test-test) them as i don't have access to a system with dpaa bus.
Thanks. I will have a look at this.
Post by Burakov, Anatoly
Also, you might want to know that dpaa bus driver references
RTE_LIBRTE_DPAA_MAX_CRYPTODEV which is only found in
config/common_armv8a_linuxapp but is not present in base config. Not sure if
that's an issue.
This might be an issue as very recently some patches updated the base
config. I will cross check this as well.

-
Shreyansh
Shreyansh Jain
2018-03-15 14:01:15 UTC
Permalink
Hello Anatoly,
Post by Shreyansh Jain
Hello Anatoly,
On Fri, Mar 9, 2018 at 4:12 PM, Burakov, Anatoly
[...]
Post by Burakov, Anatoly
Post by Pavan Nikhilesh
I have taken a look at the github tree the issues with VFIO are gone, Although
compilation issues with dpaa/dpaa2 are still present due to their dependency on
`rte_eal_get_physmem_layout`.
I've fixed the dpaa compile issue and pushed it to github. I've tried to
keep the semantics the same as before, but i can't compile-test (let alone
test-test) them as i don't have access to a system with dpaa bus.
Thanks. I will have a look at this.
Just a heads-up, DPAA2 is broken on top-of-tree (github:
784e041f6b520) as of now:

--->8---
***@ls2088ardb:~/shreyansh/07_dpdk_memory#
./arm64-dpaa2-linuxapp-gcc/app/testpmd -c 0xE -n 1 --log-level=eal,8
--log-level=mem,8 -- -i --portmask=0x3
EAL: Detected lcore 0 as core 0 on socket 0
EAL: Detected lcore 1 as core 1 on socket 0
EAL: Detected lcore 2 as core 0 on socket 0
EAL: Detected lcore 3 as core 1 on socket 0
EAL: Detected lcore 4 as core 0 on socket 0
EAL: Detected lcore 5 as core 1 on socket 0
EAL: Detected lcore 6 as core 0 on socket 0
EAL: Detected lcore 7 as core 1 on socket 0
EAL: Support maximum 16 logical core(s) by configuration.
EAL: Detected 8 lcore(s)
EAL: Detected 1 NUMA nodes
EAL: VFIO PCI modules not loaded
EAL: DPAA Bus not present. Skipping.
EAL: Container: dprc.2 has VFIO iommu group id = 4
EAL: fslmc: Bus scan completed
EAL: Module /sys/module/rte_kni not found! error 2 (No such file or directory)
EAL: Multi-process socket /var/run/.rte_unix
EAL: Probing VFIO support...
EAL: IOMMU type 1 (Type 1) is supported
EAL: IOMMU type 7 (sPAPR) is not supported
EAL: IOMMU type 8 (No-IOMMU) is not supported
EAL: VFIO support initialized
EAL: Mem event callback 'vfio_mem_event_clb' registered
EAL: Ask a virtual area of 0x2e000 bytes
EAL: Virtual area found at 0xffff86cae000 (size = 0x2e000)
EAL: Setting up physically contiguous memory...
EAL: Ask a virtual area of 0x1000 bytes
EAL: Virtual area found at 0xffff8873f000 (size = 0x1000)
EAL: Memseg list allocated: 0x100000kB at socket 0
EAL: Ask a virtual area of 0x800000000 bytes
EAL: Virtual area found at 0xfff780000000 (size = 0x800000000)
EAL: Ask a virtual area of 0x1000 bytes
EAL: Virtual area found at 0xffff8873e000 (size = 0x1000)
EAL: Memseg list allocated: 0x100000kB at socket 0
EAL: Ask a virtual area of 0x800000000 bytes
EAL: Virtual area found at 0xffef40000000 (size = 0x800000000)
EAL: Ask a virtual area of 0x1000 bytes
EAL: Virtual area found at 0xffff8873d000 (size = 0x1000)
EAL: Memseg list allocated: 0x100000kB at socket 0
EAL: Ask a virtual area of 0x800000000 bytes
EAL: Virtual area found at 0xffe700000000 (size = 0x800000000)
EAL: Ask a virtual area of 0x1000 bytes
EAL: Virtual area found at 0xffff8873c000 (size = 0x1000)
EAL: Memseg list allocated: 0x100000kB at socket 0
EAL: Ask a virtual area of 0x800000000 bytes
EAL: Virtual area found at 0xffdec0000000 (size = 0x800000000)
EAL: TSC frequency is ~25000 KHz
EAL: Master lcore 1 is ready (tid=88742110;cpuset=[1])
EAL: lcore 3 is ready (tid=85cab910;cpuset=[3])
EAL: lcore 2 is ready (tid=864ab910;cpuset=[2])
EAL: eal_memalloc_alloc_page_bulk(): couldn't find suitable memseg_list
error allocating rte services array
EAL: FATAL: rte_service_init() failed

EAL: rte_service_init() failed

PANIC in main():
Cannot init EAL
1: [./arm64-dpaa2-linuxapp-gcc/app/testpmd(rte_dump_stack+0x38) [0x4f37a8]]
Aborted
--->8--

Above is an initial output - still investigating. I will keep you posted.
Post by Shreyansh Jain
Post by Burakov, Anatoly
Also, you might want to know that dpaa bus driver references
RTE_LIBRTE_DPAA_MAX_CRYPTODEV which is only found in
config/common_armv8a_linuxapp but is not present in base config. Not sure if
that's an issue.
A recent patch from Hemant has fixed this (yet to be merged in master).

[...]

-
Shreyansh
Shreyansh Jain
2018-03-21 13:45:57 UTC
Permalink
Hello Anatoly,

This is not necessarily the right chain to reply to, but I am reusing this
email for another issue in DPAA2 so that all issues can be in a single
place.
Post by Shreyansh Jain
Hello Anatoly,
Post by Shreyansh Jain
Hello Anatoly,
On Fri, Mar 9, 2018 at 4:12 PM, Burakov, Anatoly
[...]
Post by Burakov, Anatoly
Post by Pavan Nikhilesh
I have taken a look at the github tree the issues with VFIO are gone, Although
compilation issues with dpaa/dpaa2 are still present due to their dependency on
`rte_eal_get_physmem_layout`.
I've fixed the dpaa compile issue and pushed it to github. I've tried to
keep the semantics the same as before, but i can't compile-test (let alone
test-test) them as i don't have access to a system with dpaa bus.
Thanks. I will have a look at this.
--->8---
./arm64-dpaa2-linuxapp-gcc/app/testpmd -c 0xE -n 1 --log-level=eal,8
--log-level=mem,8 -- -i --portmask=0x3
EAL: Detected lcore 0 as core 0 on socket 0
EAL: Detected lcore 1 as core 1 on socket 0
EAL: Detected lcore 2 as core 0 on socket 0
EAL: Detected lcore 3 as core 1 on socket 0
EAL: Detected lcore 4 as core 0 on socket 0
EAL: Detected lcore 5 as core 1 on socket 0
EAL: Detected lcore 6 as core 0 on socket 0
EAL: Detected lcore 7 as core 1 on socket 0
EAL: Support maximum 16 logical core(s) by configuration.
EAL: Detected 8 lcore(s)
EAL: Detected 1 NUMA nodes
EAL: VFIO PCI modules not loaded
EAL: DPAA Bus not present. Skipping.
EAL: Container: dprc.2 has VFIO iommu group id = 4
EAL: fslmc: Bus scan completed
EAL: Module /sys/module/rte_kni not found! error 2 (No such file or directory)
EAL: Multi-process socket /var/run/.rte_unix
EAL: Probing VFIO support...
EAL: IOMMU type 1 (Type 1) is supported
EAL: IOMMU type 7 (sPAPR) is not supported
EAL: IOMMU type 8 (No-IOMMU) is not supported
EAL: VFIO support initialized
EAL: Mem event callback 'vfio_mem_event_clb' registered
EAL: Ask a virtual area of 0x2e000 bytes
EAL: Virtual area found at 0xffff86cae000 (size = 0x2e000)
EAL: Setting up physically contiguous memory...
EAL: Ask a virtual area of 0x1000 bytes
EAL: Virtual area found at 0xffff8873f000 (size = 0x1000)
EAL: Memseg list allocated: 0x100000kB at socket 0
EAL: Ask a virtual area of 0x800000000 bytes
EAL: Virtual area found at 0xfff780000000 (size = 0x800000000)
EAL: Ask a virtual area of 0x1000 bytes
EAL: Virtual area found at 0xffff8873e000 (size = 0x1000)
EAL: Memseg list allocated: 0x100000kB at socket 0
EAL: Ask a virtual area of 0x800000000 bytes
EAL: Virtual area found at 0xffef40000000 (size = 0x800000000)
EAL: Ask a virtual area of 0x1000 bytes
EAL: Virtual area found at 0xffff8873d000 (size = 0x1000)
EAL: Memseg list allocated: 0x100000kB at socket 0
EAL: Ask a virtual area of 0x800000000 bytes
EAL: Virtual area found at 0xffe700000000 (size = 0x800000000)
EAL: Ask a virtual area of 0x1000 bytes
EAL: Virtual area found at 0xffff8873c000 (size = 0x1000)
EAL: Memseg list allocated: 0x100000kB at socket 0
EAL: Ask a virtual area of 0x800000000 bytes
EAL: Virtual area found at 0xffdec0000000 (size = 0x800000000)
EAL: TSC frequency is ~25000 KHz
EAL: Master lcore 1 is ready (tid=88742110;cpuset=[1])
EAL: lcore 3 is ready (tid=85cab910;cpuset=[3])
EAL: lcore 2 is ready (tid=864ab910;cpuset=[2])
EAL: eal_memalloc_alloc_page_bulk(): couldn't find suitable memseg_list
error allocating rte services array
EAL: FATAL: rte_service_init() failed
EAL: rte_service_init() failed
Cannot init EAL
1: [./arm64-dpaa2-linuxapp-gcc/app/testpmd(rte_dump_stack+0x38) [0x4f37a8]]
Aborted
--->8--
Above is an initial output - still investigating. I will keep you posted.
While working on the issue reported in [1], I have found another issue
with which I might need your help.

[1] http://dpdk.org/ml/archives/dev/2018-March/093202.html

For [1], I worked around it by changing the mempool_add_elem code for the
time being - it now allows non-contiguous (i.e. not explicitly demanded as
contiguous) allocations to go through rte_mempool_populate_iova. With
that, I was able to get DPAA2 working.

The problem is:
1. When I am working with 1GB pages, I/O works fine.
2. When using 2MB pages (1024 of them), initialization fails somewhere
after the VFIO layer.

All with IOVA=VA mode.

Some logs:

This is the output of the virtual memory layout demanded by DPDK:

--->8---
EAL: Ask a virtual area of 0x2e000 bytes
EAL: Virtual area found at 0xffffb6561000 (size = 0x2e000)
EAL: Setting up physically contiguous memory...
EAL: Ask a virtual area of 0x59000 bytes
EAL: Virtual area found at 0xffffb6508000 (size = 0x59000)
EAL: Memseg list allocated: 0x800kB at socket 0
EAL: Ask a virtual area of 0x400000000 bytes
EAL: Virtual area found at 0xfffbb6400000 (size = 0x400000000)
EAL: Ask a virtual area of 0x59000 bytes
EAL: Virtual area found at 0xfffbb62af000 (size = 0x59000)
EAL: Memseg list allocated: 0x800kB at socket 0
EAL: Ask a virtual area of 0x400000000 bytes
EAL: Virtual area found at 0xfff7b6200000 (size = 0x400000000)
EAL: Ask a virtual area of 0x59000 bytes
EAL: Virtual area found at 0xfff7b6056000 (size = 0x59000)
EAL: Memseg list allocated: 0x800kB at socket 0
EAL: Ask a virtual area of 0x400000000 bytes
EAL: Virtual area found at 0xfff3b6000000 (size = 0x400000000)
EAL: Ask a virtual area of 0x59000 bytes
EAL: Virtual area found at 0xfff3b5dfd000 (size = 0x59000)
EAL: Memseg list allocated: 0x800kB at socket 0
EAL: Ask a virtual area of 0x400000000 bytes
EAL: Virtual area found at 0xffefb5c00000 (size = 0x400000000)
--->8---

Then, somehow VFIO mapping is able to find only a single page to map

--->8---
EAL: Device (dpci.1) abstracted from VFIO
EAL: -->Initial SHM Virtual ADDR FFFBB6400000
EAL: -----> DMA size 0x200000
EAL: Total 1 segments found.
--->8---

Then these logs appear, probably when the DPAA2 code requests memory.
I am not sure why it repeats the same '...expanded by 10MB'.

--->8---
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 2MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
LPM or EM none selected, default LPM on
Initializing port 0 ...
--->8---

l3fwd is stuck at this point. What I observe is that the DPAA2 driver has
gone ahead and registered the queues (queue_setup) with the hardware, and
the memory has either overrun (a smaller than requested size was mapped)
or the addresses are corrupt (that is, not DMA-able). (I get SMMU faults,
indicating one of these cases.)

There is a change from you in the fslmc/fslmc_vfio.c file
(rte_fslmc_vfio_dmamap()). Ideally, that code should have walked over
all the available pages for mapping, but that didn't happen and only a
single virtual area got DMA-mapped.

--->8---
EAL: Device (dpci.1) abstracted from VFIO
EAL: -->Initial SHM Virtual ADDR FFFBB6400000
EAL: -----> DMA size 0x200000
EAL: Total 1 segments found.
--->8---

I am looking into this, but if some hint comes to your mind, it might
help.

Regards,
Shreyansh
Burakov, Anatoly
2018-03-21 14:48:26 UTC
Permalink
Post by Shreyansh Jain
Hello Anatoly,
This is not necessarily right chain to reply to, but reusing this
email for another issue in DPAA2 so that all issues can be at a single
place.
Post by Shreyansh Jain
Hello Anatoly,
Post by Shreyansh Jain
Hello Anatoly,
On Fri, Mar 9, 2018 at 4:12 PM, Burakov, Anatoly
[...]
Post by Burakov, Anatoly
Post by Pavan Nikhilesh
I have taken a look at the github tree the issues with VFIO are gone, Although
compilation issues with dpaa/dpaa2 are still present due to their dependency on
`rte_eal_get_physmem_layout`.
I've fixed the dpaa compile issue and pushed it to github. I've tried to
keep the semantics the same as before, but i can't compile-test (let alone
test-test) them as i don't have access to a system with dpaa bus.
Thanks. I will have a look at this.
--->8---
./arm64-dpaa2-linuxapp-gcc/app/testpmd -c 0xE -n 1 --log-level=eal,8
--log-level=mem,8 -- -i --portmask=0x3
EAL: Detected lcore 0 as core 0 on socket 0
EAL: Detected lcore 1 as core 1 on socket 0
EAL: Detected lcore 2 as core 0 on socket 0
EAL: Detected lcore 3 as core 1 on socket 0
EAL: Detected lcore 4 as core 0 on socket 0
EAL: Detected lcore 5 as core 1 on socket 0
EAL: Detected lcore 6 as core 0 on socket 0
EAL: Detected lcore 7 as core 1 on socket 0
EAL: Support maximum 16 logical core(s) by configuration.
EAL: Detected 8 lcore(s)
EAL: Detected 1 NUMA nodes
EAL: VFIO PCI modules not loaded
EAL: DPAA Bus not present. Skipping.
EAL: Container: dprc.2 has VFIO iommu group id = 4
EAL: fslmc: Bus scan completed
EAL: Module /sys/module/rte_kni not found! error 2 (No such file or directory)
EAL: Multi-process socket /var/run/.rte_unix
EAL: Probing VFIO support...
EAL: IOMMU type 1 (Type 1) is supported
EAL: IOMMU type 7 (sPAPR) is not supported
EAL: IOMMU type 8 (No-IOMMU) is not supported
EAL: VFIO support initialized
EAL: Mem event callback 'vfio_mem_event_clb' registered
EAL: Ask a virtual area of 0x2e000 bytes
EAL: Virtual area found at 0xffff86cae000 (size = 0x2e000)
EAL: Setting up physically contiguous memory...
EAL: Ask a virtual area of 0x1000 bytes
EAL: Virtual area found at 0xffff8873f000 (size = 0x1000)
EAL: Memseg list allocated: 0x100000kB at socket 0
EAL: Ask a virtual area of 0x800000000 bytes
EAL: Virtual area found at 0xfff780000000 (size = 0x800000000)
EAL: Ask a virtual area of 0x1000 bytes
EAL: Virtual area found at 0xffff8873e000 (size = 0x1000)
EAL: Memseg list allocated: 0x100000kB at socket 0
EAL: Ask a virtual area of 0x800000000 bytes
EAL: Virtual area found at 0xffef40000000 (size = 0x800000000)
EAL: Ask a virtual area of 0x1000 bytes
EAL: Virtual area found at 0xffff8873d000 (size = 0x1000)
EAL: Memseg list allocated: 0x100000kB at socket 0
EAL: Ask a virtual area of 0x800000000 bytes
EAL: Virtual area found at 0xffe700000000 (size = 0x800000000)
EAL: Ask a virtual area of 0x1000 bytes
EAL: Virtual area found at 0xffff8873c000 (size = 0x1000)
EAL: Memseg list allocated: 0x100000kB at socket 0
EAL: Ask a virtual area of 0x800000000 bytes
EAL: Virtual area found at 0xffdec0000000 (size = 0x800000000)
EAL: TSC frequency is ~25000 KHz
EAL: Master lcore 1 is ready (tid=88742110;cpuset=[1])
EAL: lcore 3 is ready (tid=85cab910;cpuset=[3])
EAL: lcore 2 is ready (tid=864ab910;cpuset=[2])
EAL: eal_memalloc_alloc_page_bulk(): couldn't find suitable memseg_list
error allocating rte services array
EAL: FATAL: rte_service_init() failed
EAL: rte_service_init() failed
Cannot init EAL
1: [./arm64-dpaa2-linuxapp-gcc/app/testpmd(rte_dump_stack+0x38) [0x4f37a8]]
Aborted
--->8--
Above is an initial output - still investigating. I will keep you posted.
While working on issue reported in [1], I have found another issue
which I might need you help.
[1] http://dpdk.org/ml/archives/dev/2018-March/093202.html
For [1], I bypassed by changing the mempool_add_elem code for time
being - it now allows non-contiguous (not explicitly demanded
contiguous) allocations to go through rte_mempool_populate_iova. With
that, I was able to get DPAA2 working.
1. When I am working with 1GB pages, I/O is working fine.
2. When using 2MB pages (1024 num), the initialization somewhere after
VFIO layer fails.
All with IOVA=VA mode.
--->8---
EAL: Ask a virtual area of 0x2e000 bytes
EAL: Virtual area found at 0xffffb6561000 (size = 0x2e000)
EAL: Setting up physically contiguous memory...
EAL: Ask a virtual area of 0x59000 bytes
EAL: Virtual area found at 0xffffb6508000 (size = 0x59000)
EAL: Memseg list allocated: 0x800kB at socket 0
EAL: Ask a virtual area of 0x400000000 bytes
EAL: Virtual area found at 0xfffbb6400000 (size = 0x400000000)
EAL: Ask a virtual area of 0x59000 bytes
EAL: Virtual area found at 0xfffbb62af000 (size = 0x59000)
EAL: Memseg list allocated: 0x800kB at socket 0
EAL: Ask a virtual area of 0x400000000 bytes
EAL: Virtual area found at 0xfff7b6200000 (size = 0x400000000)
EAL: Ask a virtual area of 0x59000 bytes
EAL: Virtual area found at 0xfff7b6056000 (size = 0x59000)
EAL: Memseg list allocated: 0x800kB at socket 0
EAL: Ask a virtual area of 0x400000000 bytes
EAL: Virtual area found at 0xfff3b6000000 (size = 0x400000000)
EAL: Ask a virtual area of 0x59000 bytes
EAL: Virtual area found at 0xfff3b5dfd000 (size = 0x59000)
EAL: Memseg list allocated: 0x800kB at socket 0
EAL: Ask a virtual area of 0x400000000 bytes
EAL: Virtual area found at 0xffefb5c00000 (size = 0x400000000)
--->8---
Then, somehow VFIO mapping is able to find only a single page to map
--->8---
EAL: Device (dpci.1) abstracted from VFIO
EAL: -->Initial SHM Virtual ADDR FFFBB6400000
EAL: -----> DMA size 0x200000
EAL: Total 1 segments found.
--->8---
Then, these logs appear probably when DPAA2 code requests for memory.
I am not sure why it repeats the same '...expanded by 10MB'.
--->8---
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 2MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
LPM or EM none selected, default LPM on
Initializing port 0 ...
--->8---
l3fwd is stuck at this point. What I observe is that DPAA2 driver has
gone ahead to register the queues (queue_setup) with hardware and the
memory has either overrun (smaller than requested size mapped) or the
addresses are corrupt (that is, not dma-able). (I get SMMU faults,
indicating one of these cases)
There is some change from you in the fslmc/fslmc_vfio.c file
(rte_fslmc_vfio_dmamap()). Ideally, that code should have walked over
all the available pages for mapping but that didn't happen and only a
single virtual area got dma-mapped.
--->8---
EAL: Device (dpci.1) abstracted from VFIO
EAL: -->Initial SHM Virtual ADDR FFFBB6400000
EAL: -----> DMA size 0x200000
EAL: Total 1 segments found.
--->8---
I am looking into this but if there is some hint which come to your
mind, it might help.
Regards,
Shreyansh
Hi Shreyansh,

Thanks for the feedback.

The "heap on socket 0 was expanded by 10MB" has to do with
synchronization requests in primary/secondary processes. I can see
you're allocating LPM tables - that's most likely what these allocations
are about (it's hotplugging memory).

I think I might have an idea of what is going on. I am assuming that you
are starting up your DPDK application without any -m or --socket-mem
flags, which means you are starting with an empty heap.

During initialization, certain DPDK features (such as service cores and
PMDs) allocate memory. Most likely you have essentially started up with
one 2M page, which is what you see in the fslmc logs: this page gets
mapped for VFIO.

Then, you allocate a bunch of LPM tables, which trigger more memory
allocation and in turn trigger the memory allocation callbacks registered
through rte_mem_event_register_callback(). One of these callbacks is a
VFIO callback, which is registered in eal_vfio.c:rte_vfio_enable(). However,
since the fslmc bus has its own VFIO implementation that is independent of
what happens in the EAL VFIO code, what probably happens is that the fslmc
bus misses the necessary messages from the memory hotplug to map
additional resources for DMA.

Try adding a rte_mem_event_register_callback() somewhere in the fslmc init
so that it calls the necessary map function.
eal_vfio.c:vfio_mem_event_callback() should provide a good template on
how to approach creating such a callback. Let me know if this works!
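As a rough sketch of what such a callback could look like (modelled on
vfio_mem_event_callback(); the exact callback prototype and the
RTE_MEM_EVENT_* values are assumptions based on this patchset, and the
fslmc map/unmap helpers below are purely hypothetical):

--->8---
/* hypothetical fslmc memory event callback - a sketch, not real code */
static void
fslmc_mem_event_cb(enum rte_mem_event type, const void *addr, size_t len)
{
	if (type == RTE_MEM_EVENT_ALLOC)
		/* map the newly allocated pages into the fslmc VFIO container */
		fslmc_vfio_dma_map(addr, len);		/* hypothetical helper */
	else
		/* undo the mapping before the pages are freed */
		fslmc_vfio_dma_unmap(addr, len);	/* hypothetical helper */
}

/* somewhere in the fslmc bus/VFIO init code: */
rte_mem_event_register_callback("fslmc_mem_event_cb", fslmc_mem_event_cb);
--->8---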

(as a side note, how can we extend VFIO to move this stuff back into EAL
and expose it as an API?)
--
Thanks,
Anatoly
Shreyansh Jain
2018-03-22 05:09:51 UTC
Permalink
Hello Anatoly,
-----Original Message-----
Sent: Wednesday, March 21, 2018 8:18 PM
Subject: Re: [dpdk-dev] [PATCH v2 00/41] Memory Hotplug for DPDK
[...]
Post by Shreyansh Jain
While working on issue reported in [1], I have found another issue
which I might need you help.
[1] http://dpdk.org/ml/archives/dev/2018-March/093202.html
Post by Shreyansh Jain
For [1], I bypassed by changing the mempool_add_elem code for time
being - it now allows non-contiguous (not explicitly demanded
contiguous) allocations to go through rte_mempool_populate_iova. With
that, I was able to get DPAA2 working.
1. When I am working with 1GB pages, I/O is working fine.
2. When using 2MB pages (1024 num), the initialization somewhere after
VFIO layer fails.
All with IOVA=VA mode.
--->8---
EAL: Ask a virtual area of 0x2e000 bytes
EAL: Virtual area found at 0xffffb6561000 (size = 0x2e000)
EAL: Setting up physically contiguous memory...
EAL: Ask a virtual area of 0x59000 bytes
EAL: Virtual area found at 0xffffb6508000 (size = 0x59000)
EAL: Memseg list allocated: 0x800kB at socket 0
EAL: Ask a virtual area of 0x400000000 bytes
EAL: Virtual area found at 0xfffbb6400000 (size = 0x400000000)
EAL: Ask a virtual area of 0x59000 bytes
EAL: Virtual area found at 0xfffbb62af000 (size = 0x59000)
EAL: Memseg list allocated: 0x800kB at socket 0
EAL: Ask a virtual area of 0x400000000 bytes
EAL: Virtual area found at 0xfff7b6200000 (size = 0x400000000)
EAL: Ask a virtual area of 0x59000 bytes
EAL: Virtual area found at 0xfff7b6056000 (size = 0x59000)
EAL: Memseg list allocated: 0x800kB at socket 0
EAL: Ask a virtual area of 0x400000000 bytes
EAL: Virtual area found at 0xfff3b6000000 (size = 0x400000000)
EAL: Ask a virtual area of 0x59000 bytes
EAL: Virtual area found at 0xfff3b5dfd000 (size = 0x59000)
EAL: Memseg list allocated: 0x800kB at socket 0
EAL: Ask a virtual area of 0x400000000 bytes
EAL: Virtual area found at 0xffefb5c00000 (size = 0x400000000)
--->8---
Then, somehow VFIO mapping is able to find only a single page to map
--->8---
EAL: Device (dpci.1) abstracted from VFIO
EAL: -->Initial SHM Virtual ADDR FFFBB6400000
EAL: -----> DMA size 0x200000
EAL: Total 1 segments found.
--->8---
Then, these logs appear probably when DPAA2 code requests for memory.
I am not sure why it repeats the same '...expanded by 10MB'.
--->8---
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 2MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
LPM or EM none selected, default LPM on
Initializing port 0 ...
--->8---
l3fwd is stuck at this point. What I observe is that DPAA2 driver has
gone ahead to register the queues (queue_setup) with hardware and the
memory has either overrun (smaller than requested size mapped) or the
addresses are corrupt (that is, not dma-able). (I get SMMU faults,
indicating one of these cases)
There is some change from you in the fslmc/fslmc_vfio.c file
(rte_fslmc_vfio_dmamap()). Ideally, that code should have walked over
all the available pages for mapping but that didn't happen and only a
single virtual area got dma-mapped.
--->8---
EAL: Device (dpci.1) abstracted from VFIO
EAL: -->Initial SHM Virtual ADDR FFFBB6400000
EAL: -----> DMA size 0x200000
EAL: Total 1 segments found.
--->8---
I am looking into this but if there is some hint which come to your
mind, it might help.
Regards,
Shreyansh
Hi Shreyansh,
Thanks for the feedback.
The "heap on socket 0 was expanded by 10MB" has to do with
synchronization requests in primary/secondary processes. I can see
you're allocating LPM tables - that's most likely what these allocations
are about (it's hotplugging memory).
I get that, but why the same message multiple times without any change in the expansion? Further, I don't have multiple processes - in fact, I'm working with a single datapath thread.
Anyways, I will look through the code for this.
I think i might have an idea what is going on. I am assuming that you
are starting up your DPDK application without any -m or --socket-mem
flags, which means you are starting with empty heap.
Yes, no specific --socket-mem passed as argument.
During initialization, certain DPDK features (such as service cores,
PMD's) allocate memory. Most likely you have essentially started up with
1 2M page, which is what you see in fslmc logs: this page gets mapped
for VFIO.
Agree.
Then, you allocate a bunch of LPM tables, which trigger more memory
allocation, and trigger memory allocation callbacks registered through
rte_mem_event_register_callback(). One of these callbacks is a VFIO
callback, which is registered in eal_vfio.c:rte_vfio_enable(). However,
since fslmc bus has its own VFIO implementation that is independent of
what happens in EAL VFIO code, what probably happens is that the fslmc
bus misses the necessary messages from the memory hotplug to map
additional resources for DMA.
Makes sense
Try adding a rte_mem_event_register_callback() somewhere in fslmc init
so that it calls necessary map function.
eal_vfio.c:vfio_mem_event_callback() should provide a good template on
how to approach creating such a callback. Let me know if this works!
OK. I will give this a try and update you.
(as a side note, how can we extend VFIO to move this stuff back into EAL
and expose it as an API?)
The problem is that the FSLMC VFIO driver is slightly different from the generic VFIO layer, in the sense that a device in a VFIO container is actually another level of container. Anyways, I will have a look at how much generalization is possible. Otherwise, I will work with the vfio_mem_event_callback() as suggested above.

Thanks for suggestions.
--
Thanks,
Burakov, Anatoly
2018-03-22 09:24:34 UTC
Permalink
Post by Shreyansh Jain
Hello Anatoly,
-----Original Message-----
Sent: Wednesday, March 21, 2018 8:18 PM
Subject: Re: [dpdk-dev] [PATCH v2 00/41] Memory Hotplug for DPDK
[...]
Post by Shreyansh Jain
While working on issue reported in [1], I have found another issue
which I might need you help.
[1] http://dpdk.org/ml/archives/dev/2018-March/093202.html
Post by Shreyansh Jain
For [1], I bypassed by changing the mempool_add_elem code for time
being - it now allows non-contiguous (not explicitly demanded
contiguous) allocations to go through rte_mempool_populate_iova. With
that, I was able to get DPAA2 working.
1. When I am working with 1GB pages, I/O is working fine.
2. When using 2MB pages (1024 num), the initialization somewhere after
VFIO layer fails.
All with IOVA=VA mode.
--->8---
EAL: Ask a virtual area of 0x2e000 bytes
EAL: Virtual area found at 0xffffb6561000 (size = 0x2e000)
EAL: Setting up physically contiguous memory...
EAL: Ask a virtual area of 0x59000 bytes
EAL: Virtual area found at 0xffffb6508000 (size = 0x59000)
EAL: Memseg list allocated: 0x800kB at socket 0
EAL: Ask a virtual area of 0x400000000 bytes
EAL: Virtual area found at 0xfffbb6400000 (size = 0x400000000)
EAL: Ask a virtual area of 0x59000 bytes
EAL: Virtual area found at 0xfffbb62af000 (size = 0x59000)
EAL: Memseg list allocated: 0x800kB at socket 0
EAL: Ask a virtual area of 0x400000000 bytes
EAL: Virtual area found at 0xfff7b6200000 (size = 0x400000000)
EAL: Ask a virtual area of 0x59000 bytes
EAL: Virtual area found at 0xfff7b6056000 (size = 0x59000)
EAL: Memseg list allocated: 0x800kB at socket 0
EAL: Ask a virtual area of 0x400000000 bytes
EAL: Virtual area found at 0xfff3b6000000 (size = 0x400000000)
EAL: Ask a virtual area of 0x59000 bytes
EAL: Virtual area found at 0xfff3b5dfd000 (size = 0x59000)
EAL: Memseg list allocated: 0x800kB at socket 0
EAL: Ask a virtual area of 0x400000000 bytes
EAL: Virtual area found at 0xffefb5c00000 (size = 0x400000000)
--->8---
Then, somehow VFIO mapping is able to find only a single page to map
--->8---
EAL: Device (dpci.1) abstracted from VFIO
EAL: -->Initial SHM Virtual ADDR FFFBB6400000
EAL: -----> DMA size 0x200000
EAL: Total 1 segments found.
--->8---
Then, these logs appear probably when DPAA2 code requests for memory.
I am not sure why it repeats the same '...expanded by 10MB'.
--->8---
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 2MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
EAL: Calling mem event callback vfio_mem_event_clbEAL: request: mp_malloc_sync
EAL: Heap on socket 0 was expanded by 10MB
LPM or EM none selected, default LPM on
Initializing port 0 ...
--->8---
l3fwd is stuck at this point. What I observe is that DPAA2 driver has
gone ahead to register the queues (queue_setup) with hardware and the
memory has either overrun (smaller than requested size mapped) or the
addresses are corrupt (that is, not dma-able). (I get SMMU faults,
indicating one of these cases)
There is some change from you in the fslmc/fslmc_vfio.c file
(rte_fslmc_vfio_dmamap()). Ideally, that code should have walked over
all the available pages for mapping but that didn't happen and only a
single virtual area got dma-mapped.
--->8---
EAL: Device (dpci.1) abstracted from VFIO
EAL: -->Initial SHM Virtual ADDR FFFBB6400000
EAL: -----> DMA size 0x200000
EAL: Total 1 segments found.
--->8---
I am looking into this but if there is some hint which come to your
mind, it might help.
Regards,
Shreyansh
Hi Shreyansh,
Thanks for the feedback.
The "heap on socket 0 was expanded by 10MB" has to do with
synchronization requests in primary/secondary processes. I can see
you're allocating LPM tables - that's most likely what these allocations
are about (it's hotplugging memory).
I get that but why same message multiple times without any change in the expansion. Further, I don't have multiple process - in fact, I'm working with a single datapath thread.
Anyways, I will look through the code for this.
Hi Shreyansh,

I misspoke - this has nothing to do with multiprocess. The "request:
mp_malloc_sync" message does, but it's an attempt to notify other processes
of the allocation - if there are no secondary processes, nothing happens.

However, multiple heap expansions do correspond to multiple allocations.
If you allocate an LPM table that takes up 10MB of hugepage memory, you
expand the heap by 10MB. If you do that multiple times (e.g. per NIC?), you
get multiple heap expansions. This message is triggered on every heap
expansion.
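To make that concrete, something along these lines (purely illustrative -
the table sizes and names below are made up) would produce one
"expanded by ..." message per table created on an initially empty heap:

--->8---
#include <stdio.h>
#include <rte_lpm.h>
#include <rte_memory.h>

static void
create_per_port_lpm(unsigned int nb_ports)
{
	struct rte_lpm_config cfg = {
		.max_rules = 1024,
		.number_tbl8s = 256,
		.flags = 0,
	};
	char name[32];
	unsigned int i;

	for (i = 0; i < nb_ports; i++) {
		snprintf(name, sizeof(name), "lpm_port_%u", i);
		/* each create allocates hugepage memory, so the heap may
		 * be expanded again, triggering another log message */
		if (rte_lpm_create(name, SOCKET_ID_ANY, &cfg) == NULL)
			printf("LPM creation failed for port %u\n", i);
	}
}
--->8---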
Post by Shreyansh Jain
I think i might have an idea what is going on. I am assuming that you
are starting up your DPDK application without any -m or --socket-mem
flags, which means you are starting with empty heap.
Yes, no specific --socket-mem passed as argument.
During initialization, certain DPDK features (such as service cores,
PMD's) allocate memory. Most likely you have essentially started up with
1 2M page, which is what you see in fslmc logs: this page gets mapped
for VFIO.
Agree.
Then, you allocate a bunch of LPM tables, which trigger more memory
allocation, and trigger memory allocation callbacks registered through
rte_mem_event_register_callback(). One of these callbacks is a VFIO
callback, which is registered in eal_vfio.c:rte_vfio_enable(). However,
since fslmc bus has its own VFIO implementation that is independent of
what happens in EAL VFIO code, what probably happens is that the fslmc
bus misses the necessary messages from the memory hotplug to map
additional resources for DMA.
Makes sense
Try adding a rte_mem_event_register_callback() somewhere in fslmc init
so that it calls necessary map function.
eal_vfio.c:vfio_mem_event_callback() should provide a good template on
how to approach creating such a callback. Let me know if this works!
OK. I will give this a try and update you.
(as a side note, how can we extend VFIO to move this stuff back into EAL
and expose it as an API?)
The problem is that FSLMC VFIO driver is slightly different from generic VFIO layer in the sense that device in a VFIO container is actually another level of container. Anyways, I will have a look how much generalization is possible. Or else, I will work with the vfio_mem_event_callback() as suggested above.
This can wait :) The callback is probably the proper way to do it right now.
Post by Shreyansh Jain
Thanks for suggestions.
--
Thanks,
Anatoly
--
Thanks,
Anatoly
Burakov, Anatoly
2018-03-08 14:40:44 UTC
Permalink
This patchset introduces dynamic memory allocation for DPDK (aka memory
hotplug). Based upon RFC submitted in December [1].
- IPC bugfixes patchset [2]
- IPC improvements patchset [3]
- IPC asynchronous request API patch [4]
- Function to return number of sockets [5]
- General outline of memory hotplug changes [6]
- EAL NUMA node count changes [7]
The vast majority of changes are in the EAL and malloc, the external API
disruption is minimal: a new set of API's are added for contiguous memory
allocation for rte_memzone, and a few API additions in rte_memory due to
switch to memseg_lists as opposed to memsegs. Every other API change is
internal to EAL, and all of the memory allocation/freeing is handled
through rte_malloc, with no externally visible API changes.
* Malloc heap adjusted to handle holes in address space
* Single memseg list replaced by multiple memseg lists
* VA space for hugepages is preallocated in advance
* Added alloc/free for pages happening as needed on rte_malloc/rte_free
* Added contiguous memory allocation API's for rte_memzone
* Integrated Pawel Wodkowski's patch for registering/unregistering memory
with VFIO [8]
* Callbacks for registering memory allocations
* Multiprocess support done via DPDK IPC introduced in 18.02
The biggest difference is a "memseg" now represents a single page (as opposed to
being a big contiguous block of pages). As a consequence, both memzones and
malloc elements are no longer guaranteed to be physically contiguous, unless
the user asks for it at reserve time. To preserve whatever functionality that
was dependent on previous behavior, a legacy memory option is also provided,
however it is expected (or perhaps vainly hoped) to be temporary solution.
Why multiple memseg lists instead of one? Since memseg is a single page now,
the list of memsegs will get quite big, and we need to locate pages somehow
when we allocate and free them. We could of course just walk the list and
allocate one contiguous chunk of VA space for memsegs, but this
implementation uses separate lists instead in order to speed up many
operations with memseg lists.
- FreeBSD does not even compile, let alone run
- No 32-bit support
- There are some minor quality-of-life improvements planned that aren't
ready yet and will be part of v2
- VFIO support is only smoke-tested (but is expected to work), VFIO support
with secondary processes is not tested; work is ongoing to validate VFIO
for all use cases
- Dynamic mapping/unmapping memory with VFIO is not supported in sPAPR
IOMMU mode - help from sPAPR maintainers requested
Nevertheless, this patchset should be testable under 64-bit Linux, and
should work for all use cases bar those mentioned above.
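To give a flavour of the new contiguous allocation API (illustrative only -
the prototype follows the rte_memzone_reserve_aligned_contig() call used in
the mempool patch of this series, and the zone name and size below are
made up):

--->8---
#include <stdio.h>
#include <rte_memzone.h>
#include <rte_memory.h>

static const struct rte_memzone *
reserve_dma_ring(void)
{
	/* 2MB, cache-line aligned, on any socket, IOVA-contiguous */
	const struct rte_memzone *mz = rte_memzone_reserve_aligned_contig(
			"dma_ring", 2 * 1024 * 1024, SOCKET_ID_ANY, 0,
			RTE_CACHE_LINE_SIZE);
	if (mz == NULL)
		printf("contiguous reservation failed\n");
	return mz;
}
--->8---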
[1] http://dpdk.org/dev/patchwork/bundle/aburakov/Memory_RFC/
[2] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Fixes/
[3] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Improvements/
[4] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Async_Request/
[5] http://dpdk.org/dev/patchwork/bundle/aburakov/Num_Sockets/
[6] http://dpdk.org/dev/patchwork/patch/34002/
[7] http://dpdk.org/dev/patchwork/patch/33853/
[8] http://dpdk.org/dev/patchwork/patch/24484/
For those wanting to test this patchset, there is now a github tree with
all of the dependent patches applied:

https://github.com/anatolyburakov/dpdk
--
Thanks,
Anatoly
Shreyansh Jain
2018-03-19 08:58:30 UTC
Permalink
Hi Anatoly,

On Wed, Mar 7, 2018 at 10:26 PM, Anatoly Burakov
This patchset introduces dynamic memory allocation for DPDK (aka memory
hotplug). Based upon RFC submitted in December [1].
- IPC bugfixes patchset [2]
- IPC improvements patchset [3]
- IPC asynchronous request API patch [4]
- Function to return number of sockets [5]
- General outline of memory hotplug changes [6]
- EAL NUMA node count changes [7]
The vast majority of changes are in the EAL and malloc, the external API
disruption is minimal: a new set of API's are added for contiguous memory
allocation for rte_memzone, and a few API additions in rte_memory due to
switch to memseg_lists as opposed to memsegs. Every other API change is
internal to EAL, and all of the memory allocation/freeing is handled
through rte_malloc, with no externally visible API changes.
* Malloc heap adjusted to handle holes in address space
* Single memseg list replaced by multiple memseg lists
* VA space for hugepages is preallocated in advance
* Added alloc/free for pages happening as needed on rte_malloc/rte_free
* Added contiguous memory allocation API's for rte_memzone
* Integrated Pawel Wodkowski's patch for registering/unregistering memory
with VFIO [8]
* Callbacks for registering memory allocations
* Multiprocess support done via DPDK IPC introduced in 18.02
The biggest difference is a "memseg" now represents a single page (as opposed to
being a big contiguous block of pages). As a consequence, both memzones and
malloc elements are no longer guaranteed to be physically contiguous, unless
the user asks for it at reserve time. To preserve whatever functionality that
was dependent on previous behavior, a legacy memory option is also provided,
however it is expected (or perhaps vainly hoped) to be temporary solution.
Why multiple memseg lists instead of one? Since memseg is a single page now,
the list of memsegs will get quite big, and we need to locate pages somehow
when we allocate and free them. We could of course just walk the list and
allocate one contiguous chunk of VA space for memsegs, but this
implementation uses separate lists instead in order to speed up many
operations with memseg lists.
- FreeBSD does not even compile, let alone run
- No 32-bit support
I just read on ***@dpdk.org [1] that an early merge of this
series is expected. So, would this limitation be fixed before the merge?
Or has it already been fixed in the github repo?

[1] http://dpdk.org/ml/archives/announce/2018-March/000182.html

[...]

-
Shreyansh
Burakov, Anatoly
2018-03-20 10:07:53 UTC
Permalink
Post by Shreyansh Jain
Hi Anatoly,
On Wed, Mar 7, 2018 at 10:26 PM, Anatoly Burakov
This patchset introduces dynamic memory allocation for DPDK (aka memory
hotplug). Based upon RFC submitted in December [1].
- IPC bugfixes patchset [2]
- IPC improvements patchset [3]
- IPC asynchronous request API patch [4]
- Function to return number of sockets [5]
- General outline of memory hotplug changes [6]
- EAL NUMA node count changes [7]
The vast majority of changes are in the EAL and malloc, the external API
disruption is minimal: a new set of API's are added for contiguous memory
allocation for rte_memzone, and a few API additions in rte_memory due to
switch to memseg_lists as opposed to memsegs. Every other API change is
internal to EAL, and all of the memory allocation/freeing is handled
through rte_malloc, with no externally visible API changes.
* Malloc heap adjusted to handle holes in address space
* Single memseg list replaced by multiple memseg lists
* VA space for hugepages is preallocated in advance
* Added alloc/free for pages happening as needed on rte_malloc/rte_free
* Added contiguous memory allocation API's for rte_memzone
* Integrated Pawel Wodkowski's patch for registering/unregistering memory
with VFIO [8]
* Callbacks for registering memory allocations
* Multiprocess support done via DPDK IPC introduced in 18.02
The biggest difference is a "memseg" now represents a single page (as opposed to
being a big contiguous block of pages). As a consequence, both memzones and
malloc elements are no longer guaranteed to be physically contiguous, unless
the user asks for it at reserve time. To preserve whatever functionality that
was dependent on previous behavior, a legacy memory option is also provided,
however it is expected (or perhaps vainly hoped) to be temporary solution.
Why multiple memseg lists instead of one? Since memseg is a single page now,
the list of memsegs will get quite big, and we need to locate pages somehow
when we allocate and free them. We could of course just walk the list and
allocate one contiguous chunk of VA space for memsegs, but this
implementation uses separate lists instead in order to speed up many
operations with memseg lists.
- FreeBSD does not even compile, let alone run
- No 32-bit support
I just read on ***@dpdk.org [1] that an early merge of this
series is expected. So, would this limitation be fixed before the merge?
Or has it already been fixed in the github repo?
[1] http://dpdk.org/ml/archives/announce/2018-March/000182.html
[...]
-
Shreyansh
Hi Shreyansh,

It will be fixed before merge, yes. I would expect this code to arrive
in Github in the next few days.
--
Thanks,
Anatoly
Olivier Matz
2018-03-19 17:11:31 UTC
Permalink
Hi Anatoly,

Please find some comments below.
Post by Anatoly Burakov
If a user has specified that the zone should have contiguous memory,
use the new _contig allocation API's instead of normal ones.
Otherwise, account for the fact that unless we're in IOVA_AS_VA
mode, we cannot guarantee that the pages would be physically
contiguous, so we calculate the memzone size and alignments as if
we were getting the smallest page size available.
[...]
Post by Anatoly Burakov
@@ -563,10 +585,46 @@ rte_mempool_populate_default(struct rte_mempool *mp)
/* update mempool capabilities */
mp->flags |= mp_flags;
- if (rte_eal_has_hugepages()) {
- pg_shift = 0; /* not needed, zone is physically contiguous */
+ no_contig = mp->flags & MEMPOOL_F_NO_PHYS_CONTIG;
+ force_contig = mp->flags & MEMPOOL_F_CAPA_PHYS_CONTIG;
+
+ /*
+ * there are several considerations for page size and page shift here.
I would add a little word here to describe what page size and page shift
are used for:

These values impact the result of rte_mempool_xmem_size() (*), which
returns the amount of memory that should be allocated to store the
desired number of objects. When the page shift is not zero, it adds
memory for the padding between objects, to ensure that an object does
not cross a page boundary.

(*) it is renamed in Andrew's patchset about mempool_ops API, but it
seems the memory rework may be pushed before.
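To make the effect concrete, a tiny example (this assumes the current
four-argument prototype of rte_mempool_xmem_size(); the object count and
element size are made up):

--->8---
#include <stdio.h>
#include <rte_mempool.h>

int main(void)
{
	uint32_t n = 8192;	/* number of objects (made up) */
	size_t elt_sz = 2176;	/* total element size: header + obj + trailer (made up) */

	/* pg_shift == 0: no padding is added to avoid page crossings */
	size_t no_pad = rte_mempool_xmem_size(n, elt_sz, 0, 0);
	/* pg_shift == 21 (2MB pages): padding may be added so that no
	 * object straddles a page boundary */
	size_t pad_2mb = rte_mempool_xmem_size(n, elt_sz, 21, 0);

	printf("pg_shift=0: %zu bytes, pg_shift=21: %zu bytes\n",
			no_pad, pad_2mb);
	return 0;
}
--->8---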
Post by Anatoly Burakov
+ *
+ * if we don't need our mempools to have physically contiguous objects,
+ * then just set page shift and page size to 0, because the user has
+ * indicated that there's no need to care about anything.
+ *
+ * if we do need contiguous objects, there is also an option to reserve
+ * the entire mempool memory as one contiguous block of memory, in
+ * which case the page shift and alignment wouldn't matter as well.
+ *
+ * if we require contiguous objects, but not necessarily the entire
+ * mempool reserved space to be contiguous, then there are two options.
+ *
+ * if our IO addresses are virtual, not actual physical (IOVA as VA
+ * case), then no page shift needed - our memory allocation will give us
+ * contiguous physical memory as far as the hardware is concerned, so
+ * act as if we're getting contiguous memory.
+ *
+ * if our IO addresses are physical, we may get memory from bigger
+ * pages, or we might get memory from smaller pages, and how much of it
+ * we require depends on whether we want bigger or smaller pages.
+ * However, requesting each and every memory size is too much work, so
+ * what we'll do instead is walk through the page sizes available, pick
+ * the smallest one and set up page shift to match that one. We will be
+ * wasting some space this way, but it's much nicer than looping around
+ * trying to reserve each and every page size.
+ */
This comment is helpful to understand, thanks.

(by the way, reading it makes me think we should rename
MEMPOOL_F_*_PHYS_CONTIG as MEMPOOL_F_*_IOVA_CONTIG)
Post by Anatoly Burakov
+
+ if (no_contig || force_contig || rte_eal_iova_mode() == RTE_IOVA_VA) {
pg_sz = 0;
+ pg_shift = 0;
align = RTE_CACHE_LINE_SIZE;
+ } else if (rte_eal_has_hugepages()) {
+ pg_sz = get_min_page_size();
+ pg_shift = rte_bsf32(pg_sz);
+ align = pg_sz;
} else {
pg_sz = getpagesize();
pg_shift = rte_bsf32(pg_sz);
@@ -585,23 +643,34 @@ rte_mempool_populate_default(struct rte_mempool *mp)
goto fail;
}
- mz = rte_memzone_reserve_aligned(mz_name, size,
- mp->socket_id, mz_flags, align);
- /* not enough memory, retry with the biggest zone we have */
- if (mz == NULL)
- mz = rte_memzone_reserve_aligned(mz_name, 0,
+ if (force_contig) {
+ /*
+ * if contiguous memory for entire mempool memory was
+ * requested, don't try reserving again if we fail.
+ */
+ mz = rte_memzone_reserve_aligned_contig(mz_name, size,
+ mp->socket_id, mz_flags, align);
+ } else {
+ mz = rte_memzone_reserve_aligned(mz_name, size,
mp->socket_id, mz_flags, align);
+ /* not enough memory, retry with the biggest zone we
+ * have
+ */
+ if (mz == NULL)
+ mz = rte_memzone_reserve_aligned(mz_name, 0,
+ mp->socket_id, mz_flags, align);
+ }
This is not wrong, but at first glance I think it is not required,
because we have this in populate_iova():

/* Detect pool area has sufficient space for elements */
if (mp_capa_flags & MEMPOOL_F_CAPA_PHYS_CONTIG) {
if (len < total_elt_sz * mp->size) {
RTE_LOG(ERR, MEMPOOL,
"pool area %" PRIx64 " not enough\n",
(uint64_t)len);
return -ENOSPC;
}
}



Thanks,
Olivier
Andrew Rybchenko
2018-03-21 07:49:55 UTC
Permalink
Post by Olivier Matz
Post by Anatoly Burakov
+ *
+ * if we don't need our mempools to have physically contiguous objects,
+ * then just set page shift and page size to 0, because the user has
+ * indicated that there's no need to care about anything.
+ *
+ * if we do need contiguous objects, there is also an option to reserve
+ * the entire mempool memory as one contiguous block of memory, in
+ * which case the page shift and alignment wouldn't matter as well.
+ *
+ * if we require contiguous objects, but not necessarily the entire
+ * mempool reserved space to be contiguous, then there are two options.
+ *
+ * if our IO addresses are virtual, not actual physical (IOVA as VA
+ * case), then no page shift needed - our memory allocation will give us
+ * contiguous physical memory as far as the hardware is concerned, so
+ * act as if we're getting contiguous memory.
+ *
+ * if our IO addresses are physical, we may get memory from bigger
+ * pages, or we might get memory from smaller pages, and how much of it
+ * we require depends on whether we want bigger or smaller pages.
+ * However, requesting each and every memory size is too much work, so
+ * what we'll do instead is walk through the page sizes available, pick
+ * the smallest one and set up page shift to match that one. We will be
+ * wasting some space this way, but it's much nicer than looping around
+ * trying to reserve each and every page size.
+ */
This comment is helpful to understand, thanks.
(by the way, reading it makes me think we should rename
MEMPOOL_F_*_PHYS_CONTIG as MEMPOOL_F_*_IOVA_CONTIG)
I'll take care of the renaming in my patchset about the mempool_ops API.
Olivier Matz
2018-03-21 08:32:46 UTC
Permalink
Post by Andrew Rybchenko
Post by Olivier Matz
Post by Anatoly Burakov
+ *
+ * if we don't need our mempools to have physically contiguous objects,
+ * then just set page shift and page size to 0, because the user has
+ * indicated that there's no need to care about anything.
+ *
+ * if we do need contiguous objects, there is also an option to reserve
+ * the entire mempool memory as one contiguous block of memory, in
+ * which case the page shift and alignment wouldn't matter as well.
+ *
+ * if we require contiguous objects, but not necessarily the entire
+ * mempool reserved space to be contiguous, then there are two options.
+ *
+ * if our IO addresses are virtual, not actual physical (IOVA as VA
+ * case), then no page shift needed - our memory allocation will give us
+ * contiguous physical memory as far as the hardware is concerned, so
+ * act as if we're getting contiguous memory.
+ *
+ * if our IO addresses are physical, we may get memory from bigger
+ * pages, or we might get memory from smaller pages, and how much of it
+ * we require depends on whether we want bigger or smaller pages.
+ * However, requesting each and every memory size is too much work, so
+ * what we'll do instead is walk through the page sizes available, pick
+ * the smallest one and set up page shift to match that one. We will be
+ * wasting some space this way, but it's much nicer than looping around
+ * trying to reserve each and every page size.
+ */
This comment is helpful to understand, thanks.
(by the way, reading it makes me think we should rename
MEMPOOL_F_*_PHYS_CONTIG as MEMPOOL_F_*_IOVA_CONTIG)
I'll care about renaming in my patchset about mempool_ops API.
Great, thanks!
Please also keep the old ones for now, we will remove them later.
Olivier Matz
2018-03-19 17:30:53 UTC
Permalink
Hi Anatoly,
This patchset introduces dynamic memory allocation for DPDK (aka memory
hotplug). Based upon RFC submitted in December [1].
- IPC bugfixes patchset [2]
- IPC improvements patchset [3]
- IPC asynchronous request API patch [4]
- Function to return number of sockets [5]
- General outline of memory hotplug changes [6]
- EAL NUMA node count changes [7]
The vast majority of changes are in the EAL and malloc, the external API
disruption is minimal: a new set of API's are added for contiguous memory
allocation for rte_memzone, and a few API additions in rte_memory due to
switch to memseg_lists as opposed to memsegs. Every other API change is
internal to EAL, and all of the memory allocation/freeing is handled
through rte_malloc, with no externally visible API changes.
* Malloc heap adjusted to handle holes in address space
* Single memseg list replaced by multiple memseg lists
* VA space for hugepages is preallocated in advance
* Added alloc/free for pages happening as needed on rte_malloc/rte_free
* Added contiguous memory allocation API's for rte_memzone
* Integrated Pawel Wodkowski's patch for registering/unregistering memory
with VFIO [8]
* Callbacks for registering memory allocations
* Multiprocess support done via DPDK IPC introduced in 18.02
The biggest difference is a "memseg" now represents a single page (as opposed to
being a big contiguous block of pages). As a consequence, both memzones and
malloc elements are no longer guaranteed to be physically contiguous, unless
the user asks for it at reserve time. To preserve whatever functionality that
was dependent on previous behavior, a legacy memory option is also provided,
however it is expected (or perhaps vainly hoped) to be temporary solution.
Why multiple memseg lists instead of one? Since memseg is a single page now,
the list of memsegs will get quite big, and we need to locate pages somehow
when we allocate and free them. We could of course just walk the list and
allocate one contiguous chunk of VA space for memsegs, but this
implementation uses separate lists instead in order to speed up many
operations with memseg lists.
- FreeBSD does not even compile, let alone run
- No 32-bit support
- There are some minor quality-of-life improvements planned that aren't
ready yet and will be part of v2
- VFIO support is only smoke-tested (but is expected to work), VFIO support
with secondary processes is not tested; work is ongoing to validate VFIO
for all use cases
- Dynamic mapping/unmapping memory with VFIO is not supported in sPAPR
IOMMU mode - help from sPAPR maintainers requested
Nevertheless, this patchset should be testable under 64-bit Linux, and
should work for all use cases bar those mentioned above.
[1] http://dpdk.org/dev/patchwork/bundle/aburakov/Memory_RFC/
[2] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Fixes/
[3] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Improvements/
[4] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Async_Request/
[5] http://dpdk.org/dev/patchwork/bundle/aburakov/Num_Sockets/
[6] http://dpdk.org/dev/patchwork/patch/34002/
[7] http://dpdk.org/dev/patchwork/patch/33853/
[8] http://dpdk.org/dev/patchwork/patch/24484/
I did a quick pass on your patches (unfortunately, I don't have
the time to really dive into it).

I have a few questions/comments:

- This is really a big patchset. Thank you for working on this topic.
I'll try to test our application with it as soon as possible.

- I see from patch 17 that it is possible that rte_malloc() expands
the heap by requesting more memory from the OS? Did I understand correctly?
Today, a good property of rte_malloc() compared to malloc() is that
it won't interrupt the process (the worst case is a spinlock). This
is appreciable on a dataplane core. Will that change?

- It's not a big issue, but I have the feeling that the "const" qualifier
is often forgotten in the patchset. I think it is helpful for
optimization and documentation, and to detect bugs that modify/free
something that should not be.

I'm sending some other dummy comments as replies to patches.

Thanks,
Olivier
Burakov, Anatoly
2018-03-20 10:27:55 UTC
Permalink
Post by Olivier Matz
Hi Anatoly,
This patchset introduces dynamic memory allocation for DPDK (aka memory
hotplug). Based upon RFC submitted in December [1].
- IPC bugfixes patchset [2]
- IPC improvements patchset [3]
- IPC asynchronous request API patch [4]
- Function to return number of sockets [5]
- General outline of memory hotplug changes [6]
- EAL NUMA node count changes [7]
The vast majority of changes are in the EAL and malloc, the external API
disruption is minimal: a new set of API's are added for contiguous memory
allocation for rte_memzone, and a few API additions in rte_memory due to
switch to memseg_lists as opposed to memsegs. Every other API change is
internal to EAL, and all of the memory allocation/freeing is handled
through rte_malloc, with no externally visible API changes.
* Malloc heap adjusted to handle holes in address space
* Single memseg list replaced by multiple memseg lists
* VA space for hugepages is preallocated in advance
* Added alloc/free for pages happening as needed on rte_malloc/rte_free
* Added contiguous memory allocation API's for rte_memzone
* Integrated Pawel Wodkowski's patch for registering/unregistering memory
with VFIO [8]
* Callbacks for registering memory allocations
* Multiprocess support done via DPDK IPC introduced in 18.02
The biggest difference is a "memseg" now represents a single page (as opposed to
being a big contiguous block of pages). As a consequence, both memzones and
malloc elements are no longer guaranteed to be physically contiguous, unless
the user asks for it at reserve time. To preserve whatever functionality that
was dependent on previous behavior, a legacy memory option is also provided,
however it is expected (or perhaps vainly hoped) to be temporary solution.
Why multiple memseg lists instead of one? Since memseg is a single page now,
the list of memsegs will get quite big, and we need to locate pages somehow
when we allocate and free them. We could of course just walk the list and
allocate one contiguous chunk of VA space for memsegs, but this
implementation uses separate lists instead in order to speed up many
operations with memseg lists.
- FreeBSD does not even compile, let alone run
- No 32-bit support
- There are some minor quality-of-life improvements planned that aren't
ready yet and will be part of v2
- VFIO support is only smoke-tested (but is expected to work), VFIO support
with secondary processes is not tested; work is ongoing to validate VFIO
for all use cases
- Dynamic mapping/unmapping memory with VFIO is not supported in sPAPR
IOMMU mode - help from sPAPR maintainers requested
Nevertheless, this patchset should be testable under 64-bit Linux, and
should work for all use cases bar those mentioned above.
[1] http://dpdk.org/dev/patchwork/bundle/aburakov/Memory_RFC/
[2] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Fixes/
[3] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Improvements/
[4] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Async_Request/
[5] http://dpdk.org/dev/patchwork/bundle/aburakov/Num_Sockets/
[6] http://dpdk.org/dev/patchwork/patch/34002/
[7] http://dpdk.org/dev/patchwork/patch/33853/
[8] http://dpdk.org/dev/patchwork/patch/24484/
I did a quick pass on your patches (unfortunately, I don't have
the time to really dive in it).
- This is really a big patchset. Thank you for working on this topic.
I'll try to test our application with it as soon as possible.
- I see from patch 17 that it is possible that rte_malloc() expands
the heap by requesting more memory to the OS? Did I understand well?
Today, a good property of rte_malloc() compared to malloc() is that
it won't interrupt the process (the worst case is a spinlock). This
is appreciable on a dataplane core. Will it change?
Hi Olivier,

Not sure what you mean by "interrupt the process". The new rte_malloc
will _mostly_ work just like the old one. There are now two levels of
locks: the heap lock, and the system allocation lock. If your rte_malloc
call requests an amount of memory that can be satisfied by already
allocated memory, then only the heap lock is engaged - or, to put it in
other words, things work as before.

When you *don't* have enough memory allocated, previously rte_malloc
would just fail. Now, it will instead take the second lock and try to
allocate more memory from the system. This requires IPC (to ensure all
processes have allocated/freed the same memory), so this will take way
longer (the timeout is set to wait up to 5 seconds, although under normal
circumstances it takes a lot less - depending on how many processes
you have running, but generally under 100ms), and will block other
system allocations (i.e. if another rte_malloc call on another heap is
trying to request more memory from the system).

So, in short - you can't allocate from the same heap in parallel (same
as before), and you can't have parallel system memory allocation
requests (regardless of which heap they come from). The latter
*only* applies to system memory allocations - that is, if one heap is
allocating system memory while another heap receives an allocation request
*and is able to satisfy it from already allocated memory*, it will not
block, because the second lock is never engaged.
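
To put the above into very rough pseudocode - note that this is only an
illustrative sketch; the struct, the helper functions and the global lock
name below are made up for the sake of the explanation, not the actual
implementation:

/* illustrative sketch only - heap_sketch, find_free_element() and
 * grow_heap_via_ipc() are hypothetical, not real DPDK code */
#include <stddef.h>
#include <rte_spinlock.h>

struct heap_sketch {
	rte_spinlock_t lock;    /* level 1: per-heap lock */
	/* ... free lists, stats, etc. ... */
};

static rte_spinlock_t system_alloc_lock = RTE_SPINLOCK_INITIALIZER;

void *find_free_element(struct heap_sketch *heap, size_t size); /* hypothetical */
int grow_heap_via_ipc(struct heap_sketch *heap, size_t size);   /* hypothetical */

void *
heap_alloc_sketch(struct heap_sketch *heap, size_t size)
{
	void *ret;

	rte_spinlock_lock(&heap->lock);
	ret = find_free_element(heap, size);
	if (ret == NULL) {
		/* level 2: only taken when the heap must grow; this path
		 * involves IPC and is the only one that can block for a
		 * noticeable amount of time */
		rte_spinlock_lock(&system_alloc_lock);
		if (grow_heap_via_ipc(heap, size) == 0)
			ret = find_free_element(heap, size);
		rte_spinlock_unlock(&system_alloc_lock);
	}
	rte_spinlock_unlock(&heap->lock);
	return ret;
}
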
Post by John Daley (johndale)
- It's not a big issue, but I have the feeling that the "const" statement
is often forgotten in the patchset. I think it is helpful for both
optimization, documentation and to detect bugs that modifies/free
something that should not.
Generally, if things aren't const, they aren't for a reason :) I made
things const by default and removed constness once I needed to. However,
there may have been a few places where I changed the code around but
forgot to put constness back. I'll look into it.

Thanks for your reviews!
Post by John Daley (johndale)
I'm sending some other dummy comments as replies to patches.
Thanks,
Olivier
--
Thanks,
Anatoly
Olivier Matz
2018-03-20 12:42:55 UTC
Permalink
Post by Burakov, Anatoly
Post by John Daley (johndale)
Hi Anatoly,
This patchset introduces dynamic memory allocation for DPDK (aka memory
hotplug). Based upon RFC submitted in December [1].
- IPC bugfixes patchset [2]
- IPC improvements patchset [3]
- IPC asynchronous request API patch [4]
- Function to return number of sockets [5]
- General outline of memory hotplug changes [6]
- EAL NUMA node count changes [7]
The vast majority of changes are in the EAL and malloc, the external API
disruption is minimal: a new set of API's are added for contiguous memory
allocation for rte_memzone, and a few API additions in rte_memory due to
switch to memseg_lists as opposed to memsegs. Every other API change is
internal to EAL, and all of the memory allocation/freeing is handled
through rte_malloc, with no externally visible API changes.
* Malloc heap adjusted to handle holes in address space
* Single memseg list replaced by multiple memseg lists
* VA space for hugepages is preallocated in advance
* Added alloc/free for pages happening as needed on rte_malloc/rte_free
* Added contiguous memory allocation API's for rte_memzone
* Integrated Pawel Wodkowski's patch for registering/unregistering memory
with VFIO [8]
* Callbacks for registering memory allocations
* Multiprocess support done via DPDK IPC introduced in 18.02
The biggest difference is a "memseg" now represents a single page (as opposed to
being a big contiguous block of pages). As a consequence, both memzones and
malloc elements are no longer guaranteed to be physically contiguous, unless
the user asks for it at reserve time. To preserve whatever functionality that
was dependent on previous behavior, a legacy memory option is also provided,
however it is expected (or perhaps vainly hoped) to be temporary solution.
Why multiple memseg lists instead of one? Since memseg is a single page now,
the list of memsegs will get quite big, and we need to locate pages somehow
when we allocate and free them. We could of course just walk the list and
allocate one contiguous chunk of VA space for memsegs, but this
implementation uses separate lists instead in order to speed up many
operations with memseg lists.
- FreeBSD does not even compile, let alone run
- No 32-bit support
- There are some minor quality-of-life improvements planned that aren't
ready yet and will be part of v2
- VFIO support is only smoke-tested (but is expected to work), VFIO support
with secondary processes is not tested; work is ongoing to validate VFIO
for all use cases
- Dynamic mapping/unmapping memory with VFIO is not supported in sPAPR
IOMMU mode - help from sPAPR maintainers requested
Nevertheless, this patchset should be testable under 64-bit Linux, and
should work for all use cases bar those mentioned above.
[1] http://dpdk.org/dev/patchwork/bundle/aburakov/Memory_RFC/
[2] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Fixes/
[3] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Improvements/
[4] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Async_Request/
[5] http://dpdk.org/dev/patchwork/bundle/aburakov/Num_Sockets/
[6] http://dpdk.org/dev/patchwork/patch/34002/
[7] http://dpdk.org/dev/patchwork/patch/33853/
[8] http://dpdk.org/dev/patchwork/patch/24484/
I did a quick pass on your patches (unfortunately, I don't have
the time to really dive in it).
- This is really a big patchset. Thank you for working on this topic.
I'll try to test our application with it as soon as possible.
- I see from patch 17 that it is possible that rte_malloc() expands
the heap by requesting more memory to the OS? Did I understand well?
Today, a good property of rte_malloc() compared to malloc() is that
it won't interrupt the process (the worst case is a spinlock). This
is appreciable on a dataplane core. Will it change?
Hi Olivier,
Not sure what you mean by "interrupt the process". The new rte_malloc will
_mostly_ work just like the old one. There are now two levels of locks: the
heap lock, and the system allocation lock. If your rte_malloc call requests
amount of memory that can be satisfied by already allocated memory, then
only the heap lock is engaged - or, to put it in other words, things work as
before.
When you *don't* have enough memory allocated, previously rte_malloc would
just fail. Now, it instead will lock the second lock and try to allocate
more memory from the system. This requires IPC (to ensure all processes have
allocated/freed the same memory), so this will take way longer (timeout is
set to wait up to 5 seconds, although under normal circumstances it's taking
a lot less - depending on how many processes you have running, but generally
under 100ms), and will block other system allocations (i.e. if another
rte_malloc call on another heap is trying to request more memory from the
system).
So, in short - you can't allocate from the same heap in parallel (same as
before), and you can't have parallel system memory allocation requests
(regardless of from which heap it comes from). The latter *only* applies to
system memory allocations - that is, if one heap is allocating system memory
while another heap receives allocation request *and is able to satisfy it
from already allocated memory*, it will not block, because the second lock
is never engaged.
OK. Let's imagine you are using rte_malloc() on a dataplane core, and
you run out of memory. Previously, the allocation would just fail. Now,
if my understanding is correct, it can block for a long time, which can
be a problem on a dataplane core, because it will cause packet losses,
especially if it also blocks allocations on other cores during that
time. In this case, it could be useful to make the dynamic heap resizing
feature optional.

I have another question about the patchset. Today, it is not really
possible for an application to allocate a page. If you want a full page
(ex: 2M), you need to allocate 4M because the rte_malloc layer adds a
header before the allocated memory. Therefore, if the memory is
fragmented a lot, with only isolated 2M pages, you cannot allocate them as pages.

Is it possible, with your patchset or in the future, to have access
to a page-based allocator? The use case is for an application to be able
to ask for pages in DPDK memory and remap them into virtually contiguous
memory.

Thanks
Olivier
Burakov, Anatoly
2018-03-20 13:51:31 UTC
Permalink
Post by Olivier Matz
Post by Burakov, Anatoly
Post by John Daley (johndale)
Hi Anatoly,
This patchset introduces dynamic memory allocation for DPDK (aka memory
hotplug). Based upon RFC submitted in December [1].
- IPC bugfixes patchset [2]
- IPC improvements patchset [3]
- IPC asynchronous request API patch [4]
- Function to return number of sockets [5]
- General outline of memory hotplug changes [6]
- EAL NUMA node count changes [7]
The vast majority of changes are in the EAL and malloc, the external API
disruption is minimal: a new set of API's are added for contiguous memory
allocation for rte_memzone, and a few API additions in rte_memory due to
switch to memseg_lists as opposed to memsegs. Every other API change is
internal to EAL, and all of the memory allocation/freeing is handled
through rte_malloc, with no externally visible API changes.
* Malloc heap adjusted to handle holes in address space
* Single memseg list replaced by multiple memseg lists
* VA space for hugepages is preallocated in advance
* Added alloc/free for pages happening as needed on rte_malloc/rte_free
* Added contiguous memory allocation API's for rte_memzone
* Integrated Pawel Wodkowski's patch for registering/unregistering memory
with VFIO [8]
* Callbacks for registering memory allocations
* Multiprocess support done via DPDK IPC introduced in 18.02
The biggest difference is a "memseg" now represents a single page (as opposed to
being a big contiguous block of pages). As a consequence, both memzones and
malloc elements are no longer guaranteed to be physically contiguous, unless
the user asks for it at reserve time. To preserve whatever functionality that
was dependent on previous behavior, a legacy memory option is also provided,
however it is expected (or perhaps vainly hoped) to be temporary solution.
Why multiple memseg lists instead of one? Since memseg is a single page now,
the list of memsegs will get quite big, and we need to locate pages somehow
when we allocate and free them. We could of course just walk the list and
allocate one contiguous chunk of VA space for memsegs, but this
implementation uses separate lists instead in order to speed up many
operations with memseg lists.
- FreeBSD does not even compile, let alone run
- No 32-bit support
- There are some minor quality-of-life improvements planned that aren't
ready yet and will be part of v2
- VFIO support is only smoke-tested (but is expected to work), VFIO support
with secondary processes is not tested; work is ongoing to validate VFIO
for all use cases
- Dynamic mapping/unmapping memory with VFIO is not supported in sPAPR
IOMMU mode - help from sPAPR maintainers requested
Nevertheless, this patchset should be testable under 64-bit Linux, and
should work for all use cases bar those mentioned above.
[1] http://dpdk.org/dev/patchwork/bundle/aburakov/Memory_RFC/
[2] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Fixes/
[3] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Improvements/
[4] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Async_Request/
[5] http://dpdk.org/dev/patchwork/bundle/aburakov/Num_Sockets/
[6] http://dpdk.org/dev/patchwork/patch/34002/
[7] http://dpdk.org/dev/patchwork/patch/33853/
[8] http://dpdk.org/dev/patchwork/patch/24484/
I did a quick pass on your patches (unfortunately, I don't have
the time to really dive in it).
- This is really a big patchset. Thank you for working on this topic.
I'll try to test our application with it as soon as possible.
- I see from patch 17 that it is possible that rte_malloc() expands
the heap by requesting more memory to the OS? Did I understand well?
Today, a good property of rte_malloc() compared to malloc() is that
it won't interrupt the process (the worst case is a spinlock). This
is appreciable on a dataplane core. Will it change?
Hi Olivier,
Not sure what you mean by "interrupt the process". The new rte_malloc will
_mostly_ work just like the old one. There are now two levels of locks: the
heap lock, and the system allocation lock. If your rte_malloc call requests
amount of memory that can be satisfied by already allocated memory, then
only the heap lock is engaged - or, to put it in other words, things work as
before.
When you *don't* have enough memory allocated, previously rte_malloc would
just fail. Now, it instead will lock the second lock and try to allocate
more memory from the system. This requires IPC (to ensure all processes have
allocated/freed the same memory), so this will take way longer (timeout is
set to wait up to 5 seconds, although under normal circumstances it's taking
a lot less - depending on how many processes you have running, but generally
under 100ms), and will block other system allocations (i.e. if another
rte_malloc call on another heap is trying to request more memory from the
system).
So, in short - you can't allocate from the same heap in parallel (same as
before), and you can't have parallel system memory allocation requests
(regardless of from which heap it comes from). The latter *only* applies to
system memory allocations - that is, if one heap is allocating system memory
while another heap receives allocation request *and is able to satisfy it
from already allocated memory*, it will not block, because the second lock
is never engaged.
OK. Let's imagine you are using rte_malloc() on a dataplane core, and
you run out of memory. Previously, the allocation would just fail. Now,
if my understanding is correct, it can block for a long time, which can
be a problem on a dataplane core, because it will cause packet losses,
especially if it also blocks allocations on other cores during that
time. In this case, it could be useful to make the dynamic heap resizing
feature optional.
Why would anyone in their right mind call rte_malloc on the fast path? If
you're referring to mempool allocations/deallocations, then this is a
completely separate subject, as mempool alloc/free is not handled by
rte_malloc but is handled by rte_mempool itself - as far as rte_malloc
is concerned, that memory is already allocated and it will not touch it.

As for "making heap resizing feature optional", i'm working on
functionality that would essentially enable that. Specifically, i'm
adding API's to set allocation limits and a callback which will get
triggered once allocator tries to allocate beyond said limits, with an
option of returning -1 and thus preventing this allocation from
completing. While this is kind of a round-about way of doing it, it
would have similar effect.
Post by Olivier Matz
I have another question about the patchset. Today, it is not really
possible for an application to allocate a page. If you want a full page
(ex: 2M), you need to allocate 4M because the rte_malloc layer adds a
header before the allocated memory. Therefore, if the memory is
fragmented a lot with only 2M pages, you cannot allocate them as pages.
It is possible, with your patchset or in the future, to have an access
to a page-based allocator? The use-case is to be able for an application
to ask for pages in dpdk memory and remap them in a virtually contiguous
memory.
Pages returned from our allocator are already virtually contiguous, so
there is no need to do any remapping. If the user specifies proper size and
alignment (i.e. reserves a memzone with RTE_MEMZONE_2MB and with 2M size
and alignment), it will essentially cause the allocator to return a
memzone that's exactly page-size long. Yes, in the background, it will
allocate another page to store malloc metadata, and yes, memory will
become fragmented if multiple such allocations occur. It is not
possible (neither now nor in the currently planned work) to do what you
describe unless we store malloc data separately from allocated memory
(which can be done, but is a non-trivial amount of work).

Malloc stores its metadata right in the hugepage mostly for multiprocess
purposes - so that the entire heap is always shared between all
processes. If we want to store malloc metadata separately from allocated
memory, a replacement mechanism for shared heap metadata will need to be
put in place (which, again, can be done, but is a non-trivial amount of
work - arguably for questionable gain).

That said, the use case you have described is already possible - just
allocate multiple pages from DPDK as a memzone, and overlay your own
memory allocator over that memory. This will have the same effect.
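
For reference, a minimal sketch of that suggestion (error handling and
cleanup omitted; the wrapper function name is just for illustration):

#include <rte_memzone.h>
#include <rte_memory.h>

/* reserve exactly one 2M page as a memzone: 2M size, 2M alignment and
 * the RTE_MEMZONE_2MB flag to request 2M hugepages */
static const struct rte_memzone *
reserve_one_2m_page(const char *name, int socket_id)
{
	return rte_memzone_reserve_aligned(name, RTE_PGSIZE_2M, socket_id,
			RTE_MEMZONE_2MB, RTE_PGSIZE_2M);
}
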
Post by Olivier Matz
Thanks
Olivier
--
Thanks,
Anatoly
Olivier Matz
2018-03-20 14:18:29 UTC
Permalink
Hi,
Post by Olivier Matz
Post by Burakov, Anatoly
Post by John Daley (johndale)
Hi Anatoly,
This patchset introduces dynamic memory allocation for DPDK (aka memory
hotplug). Based upon RFC submitted in December [1].
- IPC bugfixes patchset [2]
- IPC improvements patchset [3]
- IPC asynchronous request API patch [4]
- Function to return number of sockets [5]
- General outline of memory hotplug changes [6]
- EAL NUMA node count changes [7]
The vast majority of changes are in the EAL and malloc, the external API
disruption is minimal: a new set of API's are added for contiguous memory
allocation for rte_memzone, and a few API additions in rte_memory due to
switch to memseg_lists as opposed to memsegs. Every other API change is
internal to EAL, and all of the memory allocation/freeing is handled
through rte_malloc, with no externally visible API changes.
* Malloc heap adjusted to handle holes in address space
* Single memseg list replaced by multiple memseg lists
* VA space for hugepages is preallocated in advance
* Added alloc/free for pages happening as needed on rte_malloc/rte_free
* Added contiguous memory allocation API's for rte_memzone
* Integrated Pawel Wodkowski's patch for registering/unregistering memory
with VFIO [8]
* Callbacks for registering memory allocations
* Multiprocess support done via DPDK IPC introduced in 18.02
The biggest difference is a "memseg" now represents a single page (as opposed to
being a big contiguous block of pages). As a consequence, both memzones and
malloc elements are no longer guaranteed to be physically contiguous, unless
the user asks for it at reserve time. To preserve whatever functionality that
was dependent on previous behavior, a legacy memory option is also provided,
however it is expected (or perhaps vainly hoped) to be temporary solution.
Why multiple memseg lists instead of one? Since memseg is a single page now,
the list of memsegs will get quite big, and we need to locate pages somehow
when we allocate and free them. We could of course just walk the list and
allocate one contiguous chunk of VA space for memsegs, but this
implementation uses separate lists instead in order to speed up many
operations with memseg lists.
- FreeBSD does not even compile, let alone run
- No 32-bit support
- There are some minor quality-of-life improvements planned that aren't
ready yet and will be part of v2
- VFIO support is only smoke-tested (but is expected to work), VFIO support
with secondary processes is not tested; work is ongoing to validate VFIO
for all use cases
- Dynamic mapping/unmapping memory with VFIO is not supported in sPAPR
IOMMU mode - help from sPAPR maintainers requested
Nevertheless, this patchset should be testable under 64-bit Linux, and
should work for all use cases bar those mentioned above.
[1] http://dpdk.org/dev/patchwork/bundle/aburakov/Memory_RFC/
[2] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Fixes/
[3] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Improvements/
[4] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Async_Request/
[5] http://dpdk.org/dev/patchwork/bundle/aburakov/Num_Sockets/
[6] http://dpdk.org/dev/patchwork/patch/34002/
[7] http://dpdk.org/dev/patchwork/patch/33853/
[8] http://dpdk.org/dev/patchwork/patch/24484/
I did a quick pass on your patches (unfortunately, I don't have
the time to really dive in it).
- This is really a big patchset. Thank you for working on this topic.
I'll try to test our application with it as soon as possible.
- I see from patch 17 that it is possible that rte_malloc() expands
the heap by requesting more memory to the OS? Did I understand well?
Today, a good property of rte_malloc() compared to malloc() is that
it won't interrupt the process (the worst case is a spinlock). This
is appreciable on a dataplane core. Will it change?
Hi Olivier,
Not sure what you mean by "interrupt the process". The new rte_malloc will
_mostly_ work just like the old one. There are now two levels of locks: the
heap lock, and the system allocation lock. If your rte_malloc call requests
amount of memory that can be satisfied by already allocated memory, then
only the heap lock is engaged - or, to put it in other words, things work as
before.
When you *don't* have enough memory allocated, previously rte_malloc would
just fail. Now, it instead will lock the second lock and try to allocate
more memory from the system. This requires IPC (to ensure all processes have
allocated/freed the same memory), so this will take way longer (timeout is
set to wait up to 5 seconds, although under normal circumstances it's taking
a lot less - depending on how many processes you have running, but generally
under 100ms), and will block other system allocations (i.e. if another
rte_malloc call on another heap is trying to request more memory from the
system).
So, in short - you can't allocate from the same heap in parallel (same as
before), and you can't have parallel system memory allocation requests
(regardless of from which heap it comes from). The latter *only* applies to
system memory allocations - that is, if one heap is allocating system memory
while another heap receives allocation request *and is able to satisfy it
from already allocated memory*, it will not block, because the second lock
is never engaged.
OK. Let's imagine you are using rte_malloc() on a dataplane core, and
you run out of memory. Previously, the allocation would just fail. Now,
if my understanding is correct, it can block for a long time, which can
be a problem on a dataplane core, because it will cause packet losses,
especially if it also blocks allocations on other cores during that
time. In this case, it could be useful to make the dynamic heap resizing
feature optional.
Why would anyone in their right mind call rte_malloc on fast path? If you're
referring to mempool allocations/deallocations, then this is a completely
separate subject, as mempool alloc/free is not handled by rte_malloc but is
handled by rte_mempool itself - as far as rte_malloc is concerned, that
memory is already allocated and it will not touch it.
As for "making heap resizing feature optional", i'm working on functionality
that would essentially enable that. Specifically, i'm adding API's to set
allocation limits and a callback which will get triggered once allocator
tries to allocate beyond said limits, with an option of returning -1 and
thus preventing this allocation from completing. While this is kind of a
round-about way of doing it, it would have similar effect.
Calling rte_malloc() in the data path may be required in case the
application needs to allocate an unknown-sized object. I'm not saying
it's a usual or an optimal use case, I'm just saying that it happens.

Waiting for a spinlock is acceptable in the datapath, if it is held by
another dataplane core.
Waiting for several hundred ms is not an option in that case.

If the feature is going to be optional, it's perfectly fine for me.
Post by Olivier Matz
I have another question about the patchset. Today, it is not really
possible for an application to allocate a page. If you want a full page
(ex: 2M), you need to allocate 4M because the rte_malloc layer adds a
header before the allocated memory. Therefore, if the memory is
fragmented a lot with only 2M pages, you cannot allocate them as pages.
It is possible, with your patchset or in the future, to have an access
to a page-based allocator? The use-case is to be able for an application
to ask for pages in dpdk memory and remap them in a virtually contiguous
memory.
Pages returned from our allocator are already virtually contiguous, there is
no need to do any remapping. If user specifies proper size and alignment
(i.e. reserve a memzone with RTE_MEMZONE_2MB and with 2M size and
alignment), it will essentially cause the allocator to return a memzone
that's exactly page-size long. Yes, in the background, it will allocate
another page to store malloc metadata, and yes, memory will become
fragmented if multiple such allocations will occur. It is not possible
(neither now nor in the future planned work) to do what you describe unless
we store malloc data separately from allocated memory (which can be done,
but is a non-trivial amount of work).
Malloc stores its metadata right in the hugepage mostly for multiprocess
purposes - so that the entire heap is always shared between all processes.
If we want to store malloc metadata separately from allocated memory, a
replacement mechanism to shared heap metadata will need to be put in place
(which, again, can be done, but is a non-trivial amount of work - arguably
for questionable gain).
That said, use case you have described is already possible - just allocate
multiple pages from DPDK as a memzone, and overlay your own memory allocator
over that memory. This will have the same effect.
Yes, that's currently what I'm doing: to get one 2M page, I'm allocating
2M with 2M alignment, which actually results in a 4M allocation. My
problem today is when the hugepages are already fragmented at DPDK
start (i.e. only isolated pages), so an allocation of > 2M would fail.

So your patchset mostly solves that issue, because rte_malloc() does not
request physically contiguous memory anymore, which means that
physically isolated hugepages are now virtually contiguous, right? So
rte_malloc(4M) will always be successful until the memory is virtually
fragmented (i.e. after several malloc/free cycles).

Thank you for the clarification.
Burakov, Anatoly
2018-03-20 14:46:33 UTC
Permalink
Post by Olivier Matz
Hi,
Post by Olivier Matz
Post by Burakov, Anatoly
Post by John Daley (johndale)
Hi Anatoly,
This patchset introduces dynamic memory allocation for DPDK (aka memory
hotplug). Based upon RFC submitted in December [1].
- IPC bugfixes patchset [2]
- IPC improvements patchset [3]
- IPC asynchronous request API patch [4]
- Function to return number of sockets [5]
- General outline of memory hotplug changes [6]
- EAL NUMA node count changes [7]
The vast majority of changes are in the EAL and malloc, the external API
disruption is minimal: a new set of API's are added for contiguous memory
allocation for rte_memzone, and a few API additions in rte_memory due to
switch to memseg_lists as opposed to memsegs. Every other API change is
internal to EAL, and all of the memory allocation/freeing is handled
through rte_malloc, with no externally visible API changes.
* Malloc heap adjusted to handle holes in address space
* Single memseg list replaced by multiple memseg lists
* VA space for hugepages is preallocated in advance
* Added alloc/free for pages happening as needed on rte_malloc/rte_free
* Added contiguous memory allocation API's for rte_memzone
* Integrated Pawel Wodkowski's patch for registering/unregistering memory
with VFIO [8]
* Callbacks for registering memory allocations
* Multiprocess support done via DPDK IPC introduced in 18.02
The biggest difference is a "memseg" now represents a single page (as opposed to
being a big contiguous block of pages). As a consequence, both memzones and
malloc elements are no longer guaranteed to be physically contiguous, unless
the user asks for it at reserve time. To preserve whatever functionality that
was dependent on previous behavior, a legacy memory option is also provided,
however it is expected (or perhaps vainly hoped) to be temporary solution.
Why multiple memseg lists instead of one? Since memseg is a single page now,
the list of memsegs will get quite big, and we need to locate pages somehow
when we allocate and free them. We could of course just walk the list and
allocate one contiguous chunk of VA space for memsegs, but this
implementation uses separate lists instead in order to speed up many
operations with memseg lists.
- FreeBSD does not even compile, let alone run
- No 32-bit support
- There are some minor quality-of-life improvements planned that aren't
ready yet and will be part of v2
- VFIO support is only smoke-tested (but is expected to work), VFIO support
with secondary processes is not tested; work is ongoing to validate VFIO
for all use cases
- Dynamic mapping/unmapping memory with VFIO is not supported in sPAPR
IOMMU mode - help from sPAPR maintainers requested
Nevertheless, this patchset should be testable under 64-bit Linux, and
should work for all use cases bar those mentioned above.
[1] http://dpdk.org/dev/patchwork/bundle/aburakov/Memory_RFC/
[2] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Fixes/
[3] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Improvements/
[4] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Async_Request/
[5] http://dpdk.org/dev/patchwork/bundle/aburakov/Num_Sockets/
[6] http://dpdk.org/dev/patchwork/patch/34002/
[7] http://dpdk.org/dev/patchwork/patch/33853/
[8] http://dpdk.org/dev/patchwork/patch/24484/
I did a quick pass on your patches (unfortunately, I don't have
the time to really dive in it).
- This is really a big patchset. Thank you for working on this topic.
I'll try to test our application with it as soon as possible.
- I see from patch 17 that it is possible that rte_malloc() expands
the heap by requesting more memory to the OS? Did I understand well?
Today, a good property of rte_malloc() compared to malloc() is that
it won't interrupt the process (the worst case is a spinlock). This
is appreciable on a dataplane core. Will it change?
Hi Olivier,
Not sure what you mean by "interrupt the process". The new rte_malloc will
_mostly_ work just like the old one. There are now two levels of locks: the
heap lock, and the system allocation lock. If your rte_malloc call requests
amount of memory that can be satisfied by already allocated memory, then
only the heap lock is engaged - or, to put it in other words, things work as
before.
When you *don't* have enough memory allocated, previously rte_malloc would
just fail. Now, it instead will lock the second lock and try to allocate
more memory from the system. This requires IPC (to ensure all processes have
allocated/freed the same memory), so this will take way longer (timeout is
set to wait up to 5 seconds, although under normal circumstances it's taking
a lot less - depending on how many processes you have running, but generally
under 100ms), and will block other system allocations (i.e. if another
rte_malloc call on another heap is trying to request more memory from the
system).
So, in short - you can't allocate from the same heap in parallel (same as
before), and you can't have parallel system memory allocation requests
(regardless of from which heap it comes from). The latter *only* applies to
system memory allocations - that is, if one heap is allocating system memory
while another heap receives allocation request *and is able to satisfy it
from already allocated memory*, it will not block, because the second lock
is never engaged.
OK. Let's imagine you are using rte_malloc() on a dataplane core, and
you run out of memory. Previously, the allocation would just fail. Now,
if my understanding is correct, it can block for a long time, which can
be a problem on a dataplane core, because it will cause packet losses,
especially if it also blocks allocations on other cores during that
time. In this case, it could be useful to make the dynamic heap resizing
feature optional.
Why would anyone in their right mind call rte_malloc on fast path? If you're
referring to mempool allocations/deallocations, then this is a completely
separate subject, as mempool alloc/free is not handled by rte_malloc but is
handled by rte_mempool itself - as far as rte_malloc is concerned, that
memory is already allocated and it will not touch it.
As for "making heap resizing feature optional", i'm working on functionality
that would essentially enable that. Specifically, i'm adding API's to set
allocation limits and a callback which will get triggered once allocator
tries to allocate beyond said limits, with an option of returning -1 and
thus preventing this allocation from completing. While this is kind of a
round-about way of doing it, it would have similar effect.
Calling rte_malloc() in the data path may be required in case the
application needs to allocate an unknown-sized object. I'm not saying
it's a usual or an optimal use case, I just say that it happens.
Waiting for a spinlock is acceptable in datapath, if it is held by
another dataplane core.
Waiting for several hundreds of ms is not an option in that case.
If the feature is going to be optional, it's perfectly fine for me.
Well, there's always an option of running in "legacy mem" mode, which
disables memory hotplug completely and will essentially behave like it
does right now (allocate VA and IOVA-contiguous segments).

But yes, with said allocation limits API you will essentially be able to
control which allocations succeed and which don't. It's not exactly
"making it optional", but you can have control over system memory
allocations that would enable that. For example, at init you allocate
all your necessary data structures, and then you set the memory
allocation limits in such a way that you can neither allocate nor
deallocate any pages whatsoever once you start up your fast-path. This
way, regular malloc will still work, but any page
allocation/deallocation request will not go through.
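
As a very rough sketch of what that could look like from the application
side - keeping in mind that this API does not exist yet, so the callback
signature and the registration call in the comment below are purely
hypothetical:

#include <stdio.h>
#include <stddef.h>

/* hypothetical callback: invoked when the allocator wants to grow the
 * heap on 'socket_id' beyond 'limit' to 'new_total' bytes; returning -1
 * denies the page allocation, while regular allocations from already
 * reserved pages keep working */
static int
deny_heap_growth(int socket_id, size_t limit, size_t new_total)
{
	printf("denying heap growth on socket %d: %zu > limit %zu\n",
			socket_id, new_total, limit);
	return -1;
}

/* hypothetical registration at init time, once all long-lived data
 * structures have been allocated:
 *
 *     rte_mem_set_alloc_limit(socket_id, current_heap_size,
 *             deny_heap_growth);
 */
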
Post by Olivier Matz
Post by Olivier Matz
I have another question about the patchset. Today, it is not really
possible for an application to allocate a page. If you want a full page
(ex: 2M), you need to allocate 4M because the rte_malloc layer adds a
header before the allocated memory. Therefore, if the memory is
fragmented a lot with only 2M pages, you cannot allocate them as pages.
It is possible, with your patchset or in the future, to have an access
to a page-based allocator? The use-case is to be able for an application
to ask for pages in dpdk memory and remap them in a virtually contiguous
memory.
Pages returned from our allocator are already virtually contiguous, there is
no need to do any remapping. If user specifies proper size and alignment
(i.e. reserve a memzone with RTE_MEMZONE_2MB and with 2M size and
alignment), it will essentially cause the allocator to return a memzone
that's exactly page-size long. Yes, in the background, it will allocate
another page to store malloc metadata, and yes, memory will become
fragmented if multiple such allocations will occur. It is not possible
(neither now nor in the future planned work) to do what you describe unless
we store malloc data separately from allocated memory (which can be done,
but is a non-trivial amount of work).
Malloc stores its metadata right in the hugepage mostly for multiprocess
purposes - so that the entire heap is always shared between all processes.
If we want to store malloc metadata separately from allocated memory, a
replacement mechanism to shared heap metadata will need to be put in place
(which, again, can be done, but is a non-trivial amount of work - arguably
for questionable gain).
That said, use case you have described is already possible - just allocate
multiple pages from DPDK as a memzone, and overlay your own memory allocator
over that memory. This will have the same effect.
Yes, that's currently what I'm doing: to get one 2M page, I'm allocating
more 2M with 2M alignement, which actually results in 4M allocation. My
problem today is when the huge pages are already fragmented at dpdk
start (i.e. only isolated pages). So an allocation of > 2M would fail.
So your patchset mostly solves that issue, because rte_malloc() does not
request physically contiguous memory anymore, which means that
physically isolated hugepages are now virtually contiguous, right? So
rte_malloc(4M) will always be succesful until the memory is virtually
fragmented (i.e. after several malloc/free).
Yes, that is correct. We preallocate all VA space in advance, so unless
you fragment your VA space by making multiple allocations in this way up
to a point where you run out of pages, you should be OK.

As I said, it is possible to rewrite the heap in a way that will do away
with storing metadata in-place, and that will solve some of the tricky
issues with the memory allocator (such as pad elements, which require
special handling everywhere). However, this metadata still has to be
stored somewhere in shared memory in order to be shared across
processes, and that poses a problem, because at some point we may hit a
condition where we have plenty of free space but have exhausted our
malloc element list and cannot allocate more (and we can't realloc
because, well, multiprocess). So, such a scenario will come with its own
set of challenges. Sadly, there's no free lunch :(
Post by Olivier Matz
Thank you for the clarification.
--
Thanks,
Anatoly
Shreyansh Jain
2018-03-20 11:35:17 UTC
Permalink
Hello Anatoly,

On Wed, Mar 7, 2018 at 10:26 PM, Anatoly Burakov
Post by Anatoly Burakov
If a user has specified that the zone should have contiguous memory,
use the new _contig allocation API's instead of normal ones.
Otherwise, account for the fact that unless we're in IOVA_AS_VA
mode, we cannot guarantee that the pages would be physically
contiguous, so we calculate the memzone size and alignments as if
we were getting the smallest page size available.
---
[...]
Post by Anatoly Burakov
static void
mempool_add_elem(struct rte_mempool *mp, void *obj, rte_iova_t iova)
{
@@ -549,6 +570,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
unsigned mz_id, n;
unsigned int mp_flags;
int ret;
+ bool force_contig, no_contig;
/* mempool must not be populated */
if (mp->nb_mem_chunks != 0)
@@ -563,10 +585,46 @@ rte_mempool_populate_default(struct rte_mempool *mp)
/* update mempool capabilities */
mp->flags |= mp_flags;
- if (rte_eal_has_hugepages()) {
- pg_shift = 0; /* not needed, zone is physically contiguous */
+ no_contig = mp->flags & MEMPOOL_F_NO_PHYS_CONTIG;
+ force_contig = mp->flags & MEMPOOL_F_CAPA_PHYS_CONTIG;
+
+ /*
+ * there are several considerations for page size and page shift here.
+ *
+ * if we don't need our mempools to have physically contiguous objects,
+ * then just set page shift and page size to 0, because the user has
+ * indicated that there's no need to care about anything.
I think the above case is not handled properly here.
Reason below...
Post by Anatoly Burakov
+ *
+ * if we do need contiguous objects, there is also an option to reserve
+ * the entire mempool memory as one contiguous block of memory, in
+ * which case the page shift and alignment wouldn't matter as well.
+ *
+ * if we require contiguous objects, but not necessarily the entire
+ * mempool reserved space to be contiguous, then there are two options.
+ *
+ * if our IO addresses are virtual, not actual physical (IOVA as VA
+ * case), then no page shift needed - our memory allocation will give us
+ * contiguous physical memory as far as the hardware is concerned, so
+ * act as if we're getting contiguous memory.
+ *
+ * if our IO addresses are physical, we may get memory from bigger
+ * pages, or we might get memory from smaller pages, and how much of it
+ * we require depends on whether we want bigger or smaller pages.
+ * However, requesting each and every memory size is too much work, so
+ * what we'll do instead is walk through the page sizes available, pick
+ * the smallest one and set up page shift to match that one. We will be
+ * wasting some space this way, but it's much nicer than looping around
+ * trying to reserve each and every page size.
+ */
+
+ if (no_contig || force_contig || rte_eal_iova_mode() == RTE_IOVA_VA) {
pg_sz = 0;
+ pg_shift = 0;
align = RTE_CACHE_LINE_SIZE;
So, taking dpaa2 as an example, I ran testpmd. IOVA=VA is the mode, so
pg_sz = 0 is set -
the same as before applying the hotplug patchset, except that earlier this
decision was based purely on the availability of hugepages
(rte_eal_has_hugepages()).
Moving on...
Post by Anatoly Burakov
+ } else if (rte_eal_has_hugepages()) {
+ pg_sz = get_min_page_size();
+ pg_shift = rte_bsf32(pg_sz);
+ align = pg_sz;
} else {
pg_sz = getpagesize();
pg_shift = rte_bsf32(pg_sz);
@@ -585,23 +643,34 @@ rte_mempool_populate_default(struct rte_mempool *mp)
goto fail;
}
- mz = rte_memzone_reserve_aligned(mz_name, size,
- mp->socket_id, mz_flags, align);
- /* not enough memory, retry with the biggest zone we have */
- if (mz == NULL)
- mz = rte_memzone_reserve_aligned(mz_name, 0,
+ if (force_contig) {
+ /*
+ * if contiguous memory for entire mempool memory was
+ * requested, don't try reserving again if we fail.
+ */
+ mz = rte_memzone_reserve_aligned_contig(mz_name, size,
+ mp->socket_id, mz_flags, align);
+ } else {
+ mz = rte_memzone_reserve_aligned(mz_name, size,
mp->socket_id, mz_flags, align);
+ /* not enough memory, retry with the biggest zone we
+ * have
+ */
+ if (mz == NULL)
+ mz = rte_memzone_reserve_aligned(mz_name, 0,
+ mp->socket_id, mz_flags, align);
+ }
if (mz == NULL) {
ret = -rte_errno;
goto fail;
}
- if (mp->flags & MEMPOOL_F_NO_PHYS_CONTIG)
+ if (no_contig)
iova = RTE_BAD_IOVA;
else
iova = mz->iova;
- if (rte_eal_has_hugepages())
+ if (rte_eal_has_hugepages() && force_contig)
So, before the hotplug patches, the call used to enter
rte_mempool_populate_iova. But with 'force_contig' not set (in
app/test-pmd/testpmd.c:521, when calling rte_pktmbuf_pool_create),
rte_mempool_populate_virt is called instead.
Post by Anatoly Burakov
ret = rte_mempool_populate_iova(mp, mz->addr,
iova, mz->len,
rte_mempool_memchunk_mz_free,
--
2.7.4
This is called with pg_sz = 0:
678 else
Post by Anatoly Burakov
# 679 ret = rte_mempool_populate_virt(mp, mz->addr,
680 mz->len, pg_sz,
681 rte_mempool_memchunk_mz_free,
682 (void *)(uintptr_t)mz);

In this function,

512 /* address and len must be page-aligned */
513 if (RTE_PTR_ALIGN_CEIL(addr, pg_sz) != addr)
514 return -EINVAL;

This is where the error is returned.

I don't think RTE_PTR_ALIGN_CEIL is designed to handle pg_sz = 0.

With pg_sz = 0 it is roughly equivalent to:
RTE_PTR_ALIGN_FLOOR(((uintptr_t)addr - 1), pg_sz), which returns NULL
(the mask ~(pg_sz - 1) is 0 when pg_sz is 0).

Basically, this ends up failing rte_mempool_populate_default.

I think the assumption that rte_mempool_populate_virt can handle a page
size of 0 is wrong (there would be issues besides the above
RTE_PTR_ALIGN_CEIL as well, like the for-loop stepping by off += pg_sz).
It needs a valid, non-zero page-size value to work with.
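
A small standalone snippet showing the effect (assuming the standard
RTE_PTR_ALIGN_* macro definitions from rte_common.h):

#include <stdio.h>
#include <stddef.h>
#include <rte_common.h>

int main(void)
{
	char buf[64];
	void *addr = buf;
	size_t pg_sz = 0;

	/* with pg_sz == 0, (pg_sz - 1) wraps to all-ones, its complement
	 * is 0, and masking the address with 0 yields NULL - so the
	 * "RTE_PTR_ALIGN_CEIL(addr, pg_sz) != addr" check always fails */
	void *aligned = RTE_PTR_ALIGN_CEIL(addr, pg_sz);

	printf("addr=%p aligned=%p -> populate_virt would reject: %s\n",
			addr, aligned, aligned != addr ? "yes" : "no");
	return 0;
}
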

So, basically, DPAA2 is stuck with this patch because of the above issue,
if I am comprehending it correctly.

Regards,
Shreyansh
Burakov, Anatoly
2018-03-20 12:17:44 UTC
Permalink
Post by Shreyansh Jain
Hello Anatoly,
On Wed, Mar 7, 2018 at 10:26 PM, Anatoly Burakov
Post by Anatoly Burakov
If a user has specified that the zone should have contiguous memory,
use the new _contig allocation API's instead of normal ones.
Otherwise, account for the fact that unless we're in IOVA_AS_VA
mode, we cannot guarantee that the pages would be physically
contiguous, so we calculate the memzone size and alignments as if
we were getting the smallest page size available.
---
[...]
Post by Anatoly Burakov
static void
mempool_add_elem(struct rte_mempool *mp, void *obj, rte_iova_t iova)
{
@@ -549,6 +570,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
unsigned mz_id, n;
unsigned int mp_flags;
int ret;
+ bool force_contig, no_contig;
/* mempool must not be populated */
if (mp->nb_mem_chunks != 0)
@@ -563,10 +585,46 @@ rte_mempool_populate_default(struct rte_mempool *mp)
/* update mempool capabilities */
mp->flags |= mp_flags;
- if (rte_eal_has_hugepages()) {
- pg_shift = 0; /* not needed, zone is physically contiguous */
+ no_contig = mp->flags & MEMPOOL_F_NO_PHYS_CONTIG;
+ force_contig = mp->flags & MEMPOOL_F_CAPA_PHYS_CONTIG;
+
+ /*
+ * there are several considerations for page size and page shift here.
+ *
+ * if we don't need our mempools to have physically contiguous objects,
+ * then just set page shift and page size to 0, because the user has
+ * indicated that there's no need to care about anything.
I think the above case is not handled properly here.
reason below...
Post by Anatoly Burakov
+ *
+ * if we do need contiguous objects, there is also an option to reserve
+ * the entire mempool memory as one contiguous block of memory, in
+ * which case the page shift and alignment wouldn't matter as well.
+ *
+ * if we require contiguous objects, but not necessarily the entire
+ * mempool reserved space to be contiguous, then there are two options.
+ *
+ * if our IO addresses are virtual, not actual physical (IOVA as VA
+ * case), then no page shift needed - our memory allocation will give us
+ * contiguous physical memory as far as the hardware is concerned, so
+ * act as if we're getting contiguous memory.
+ *
+ * if our IO addresses are physical, we may get memory from bigger
+ * pages, or we might get memory from smaller pages, and how much of it
+ * we require depends on whether we want bigger or smaller pages.
+ * However, requesting each and every memory size is too much work, so
+ * what we'll do instead is walk through the page sizes available, pick
+ * the smallest one and set up page shift to match that one. We will be
+ * wasting some space this way, but it's much nicer than looping around
+ * trying to reserve each and every page size.
+ */
+
+ if (no_contig || force_contig || rte_eal_iova_mode() == RTE_IOVA_VA) {
pg_sz = 0;
+ pg_shift = 0;
align = RTE_CACHE_LINE_SIZE;
So, assuming dpaa2 as example, I ran testpmd. IOVA=VA is the mode.
pg_sz = 0 is set.
same as before applying the hotplug patchset except that earlier this
decision was purely based on availability of hugepages
(rte_eal_has_hugepages()).
Moving on...
Post by Anatoly Burakov
+ } else if (rte_eal_has_hugepages()) {
+ pg_sz = get_min_page_size();
+ pg_shift = rte_bsf32(pg_sz);
+ align = pg_sz;
} else {
pg_sz = getpagesize();
pg_shift = rte_bsf32(pg_sz);
@@ -585,23 +643,34 @@ rte_mempool_populate_default(struct rte_mempool *mp)
goto fail;
}
- mz = rte_memzone_reserve_aligned(mz_name, size,
- mp->socket_id, mz_flags, align);
- /* not enough memory, retry with the biggest zone we have */
- if (mz == NULL)
- mz = rte_memzone_reserve_aligned(mz_name, 0,
+ if (force_contig) {
+ /*
+ * if contiguous memory for entire mempool memory was
+ * requested, don't try reserving again if we fail.
+ */
+ mz = rte_memzone_reserve_aligned_contig(mz_name, size,
+ mp->socket_id, mz_flags, align);
+ } else {
+ mz = rte_memzone_reserve_aligned(mz_name, size,
mp->socket_id, mz_flags, align);
+ /* not enough memory, retry with the biggest zone we
+ * have
+ */
+ if (mz == NULL)
+ mz = rte_memzone_reserve_aligned(mz_name, 0,
+ mp->socket_id, mz_flags, align);
+ }
if (mz == NULL) {
ret = -rte_errno;
goto fail;
}
- if (mp->flags & MEMPOOL_F_NO_PHYS_CONTIG)
+ if (no_contig)
iova = RTE_BAD_IOVA;
else
iova = mz->iova;
- if (rte_eal_has_hugepages())
+ if (rte_eal_has_hugepages() && force_contig)
So, pre-hotplugging patch, call used to enter mempool_populate_iova.
But, with the 'force_contig' not set (in app/test-pmd/testpmd.c:521)
while calling rte_pktmbuf_pool_create, rte_mempool_populate_va is
called instead.
Post by Anatoly Burakov
ret = rte_mempool_populate_iova(mp, mz->addr,
iova, mz->len,
rte_mempool_memchunk_mz_free,
--
2.7.4
678 else
Post by Anatoly Burakov
# 679 ret = rte_mempool_populate_virt(mp, mz->addr,
680 mz->len, pg_sz,
681 rte_mempool_memchunk_mz_free,
682 (void *)(uintptr_t)mz);
In this function,
512 /* address and len must be page-aligned */
513 if (RTE_PTR_ALIGN_CEIL(addr, pg_sz) != addr)
514 return -EINVAL;
This is where error is returned.
I don't think RTE_PTR_ALIGN_CEIL is designed to handle pg_sz = 0.
RTE_PTR_ALIGN_FLOOR(((uintptr_t)addr - 1), pg_sz) which returns NULL
(0 ~ pg_sz).
Basically, this ends up failing rte_mempool_populate_default.
I think the reason is the assumption that when
rte_mempool_populate_virt is called, it can handle 0 page sizes (there
would issues besides the above RTE_PTR_ALIGN_CEIL as well, like a
for-loop looping over off+pg_sz), is wrong. It needs a valid page-size
value to work with (!0).
So, basically, DPAA2 is stuck with this patch because of above issue,
if I am correctly comprehending it as above.
Regards,
Shreyansh
Thanks for testing this. I'll look into fixing it.
--
Thanks,
Anatoly
Burakov, Anatoly
2018-03-23 11:25:20 UTC
Permalink
Post by Shreyansh Jain
Hello Anatoly,
On Wed, Mar 7, 2018 at 10:26 PM, Anatoly Burakov
Post by Anatoly Burakov
If a user has specified that the zone should have contiguous memory,
use the new _contig allocation API's instead of normal ones.
Otherwise, account for the fact that unless we're in IOVA_AS_VA
mode, we cannot guarantee that the pages would be physically
contiguous, so we calculate the memzone size and alignments as if
we were getting the smallest page size available.
---
[...]
Post by Anatoly Burakov
static void
mempool_add_elem(struct rte_mempool *mp, void *obj, rte_iova_t iova)
{
@@ -549,6 +570,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
unsigned mz_id, n;
unsigned int mp_flags;
int ret;
+ bool force_contig, no_contig;
/* mempool must not be populated */
if (mp->nb_mem_chunks != 0)
@@ -563,10 +585,46 @@ rte_mempool_populate_default(struct rte_mempool *mp)
/* update mempool capabilities */
mp->flags |= mp_flags;
- if (rte_eal_has_hugepages()) {
- pg_shift = 0; /* not needed, zone is physically contiguous */
+ no_contig = mp->flags & MEMPOOL_F_NO_PHYS_CONTIG;
+ force_contig = mp->flags & MEMPOOL_F_CAPA_PHYS_CONTIG;
+
+ /*
+ * there are several considerations for page size and page shift here.
+ *
+ * if we don't need our mempools to have physically contiguous objects,
+ * then just set page shift and page size to 0, because the user has
+ * indicated that there's no need to care about anything.
I think the above case is not handled properly here.
reason below...
Post by Anatoly Burakov
+ *
+ * if we do need contiguous objects, there is also an option to reserve
+ * the entire mempool memory as one contiguous block of memory, in
+ * which case the page shift and alignment wouldn't matter as well.
+ *
+ * if we require contiguous objects, but not necessarily the entire
+ * mempool reserved space to be contiguous, then there are two options.
+ *
+ * if our IO addresses are virtual, not actual physical (IOVA as VA
+ * case), then no page shift needed - our memory allocation will give us
+ * contiguous physical memory as far as the hardware is concerned, so
+ * act as if we're getting contiguous memory.
+ *
+ * if our IO addresses are physical, we may get memory from bigger
+ * pages, or we might get memory from smaller pages, and how much of it
+ * we require depends on whether we want bigger or smaller pages.
+ * However, requesting each and every memory size is too much work, so
+ * what we'll do instead is walk through the page sizes available, pick
+ * the smallest one and set up page shift to match that one. We will be
+ * wasting some space this way, but it's much nicer than looping around
+ * trying to reserve each and every page size.
+ */
+
+ if (no_contig || force_contig || rte_eal_iova_mode() == RTE_IOVA_VA) {
pg_sz = 0;
+ pg_shift = 0;
align = RTE_CACHE_LINE_SIZE;
So, assuming dpaa2 as example, I ran testpmd. IOVA=VA is the mode.
pg_sz = 0 is set.
same as before applying the hotplug patchset except that earlier this
decision was purely based on availability of hugepages
(rte_eal_has_hugepages()).
Moving on...
Post by Anatoly Burakov
+ } else if (rte_eal_has_hugepages()) {
+ pg_sz = get_min_page_size();
+ pg_shift = rte_bsf32(pg_sz);
+ align = pg_sz;
} else {
pg_sz = getpagesize();
pg_shift = rte_bsf32(pg_sz);
@@ -585,23 +643,34 @@ rte_mempool_populate_default(struct rte_mempool *mp)
goto fail;
}
- mz = rte_memzone_reserve_aligned(mz_name, size,
- mp->socket_id, mz_flags, align);
- /* not enough memory, retry with the biggest zone we have */
- if (mz == NULL)
- mz = rte_memzone_reserve_aligned(mz_name, 0,
+ if (force_contig) {
+ /*
+ * if contiguous memory for entire mempool memory was
+ * requested, don't try reserving again if we fail.
+ */
+ mz = rte_memzone_reserve_aligned_contig(mz_name, size,
+ mp->socket_id, mz_flags, align);
+ } else {
+ mz = rte_memzone_reserve_aligned(mz_name, size,
mp->socket_id, mz_flags, align);
+ /* not enough memory, retry with the biggest zone we
+ * have
+ */
+ if (mz == NULL)
+ mz = rte_memzone_reserve_aligned(mz_name, 0,
+ mp->socket_id, mz_flags, align);
+ }
if (mz == NULL) {
ret = -rte_errno;
goto fail;
}
- if (mp->flags & MEMPOOL_F_NO_PHYS_CONTIG)
+ if (no_contig)
iova = RTE_BAD_IOVA;
else
iova = mz->iova;
- if (rte_eal_has_hugepages())
+ if (rte_eal_has_hugepages() && force_contig)
So, before the hotplug patches, the call used to enter
rte_mempool_populate_iova(). But with 'force_contig' not set (as in
app/test-pmd/testpmd.c:521, which calls rte_pktmbuf_pool_create()),
rte_mempool_populate_virt() is called instead.
Post by Anatoly Burakov
ret = rte_mempool_populate_iova(mp, mz->addr,
iova, mz->len,
rte_mempool_memchunk_mz_free,
--
2.7.4
678 else
# 679 ret = rte_mempool_populate_virt(mp, mz->addr,
680 mz->len, pg_sz,
681 rte_mempool_memchunk_mz_free,
682 (void *)(uintptr_t)mz);
In this function,
512 /* address and len must be page-aligned */
513 if (RTE_PTR_ALIGN_CEIL(addr, pg_sz) != addr)
514 return -EINVAL;
This is where the error is returned.
I don't think RTE_PTR_ALIGN_CEIL is designed to handle pg_sz = 0: with
pg_sz = 0 it expands to RTE_PTR_ALIGN_FLOOR((uintptr_t)addr - 1, pg_sz),
and since the mask ~(pg_sz - 1) becomes 0, it returns NULL.
Basically, this ends up failing rte_mempool_populate_default.
I think the assumption that rte_mempool_populate_virt can handle a page
size of 0 is wrong; besides the RTE_PTR_ALIGN_CEIL check above, there
would be other issues as well, such as the for-loop iterating by
off + pg_sz. It needs a valid (non-zero) page size to work with.
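To see it concretely, here is a minimal standalone sketch; the macros are
re-declared locally to mimic what rte_common.h does, so treat it as
illustrative rather than the exact EAL code:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* local approximations of the rte_common.h alignment helpers */
#define ALIGN_FLOOR(val, align) \
	((uintptr_t)(val) & ~((uintptr_t)(align) - 1))
#define PTR_ALIGN_CEIL(ptr, align) \
	((void *)ALIGN_FLOOR((uintptr_t)(ptr) + (align) - 1, align))

int main(void)
{
	char buf[16];
	void *addr = buf;
	size_t pg_sz = 0;

	/* align - 1 wraps to UINTPTR_MAX and ~(align - 1) becomes 0,
	 * so the whole expression collapses to NULL for any addr
	 */
	void *aligned = PTR_ALIGN_CEIL(addr, pg_sz);

	printf("addr=%p aligned=%p\n", addr, aligned);
	return 0;
}

Whatever address is passed in, the result is NULL, so the alignment check
can never pass when pg_sz is 0, and populate_virt bails out with -EINVAL.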
So, basically, DPAA2 is stuck with this patch because of the above issue,
if I am comprehending it correctly.
Regards,
Shreyansh
Thanks for finding this issue. A fix is now pushed to GitHub for testing.
--
Thanks,
Anatoly
gowrishankar muthukrishnan
2018-03-21 09:09:35 UTC
Permalink
This patchset introduces dynamic memory allocation for DPDK (aka memory
hotplug). Based upon the RFC submitted in December [1].
Hi Anatoly,

I am able to bring up a PMD with these patches on powerpc. I am
continuing to validate the memory limits that this patch set has (e.g.
pre-allocating an anonymous mapping for the largest memory possible as
per the default mem and memseg values). I'll keep posting my
observations.

Thanks for the patches,
Gowrishankar
- IPC bugfixes patchset [2]
- IPC improvements patchset [3]
- IPC asynchronous request API patch [4]
- Function to return number of sockets [5]
- General outline of memory hotplug changes [6]
- EAL NUMA node count changes [7]
The vast majority of changes are in the EAL and malloc; the external API
disruption is minimal: a new set of APIs is added for contiguous memory
allocation in rte_memzone, and there are a few API additions in rte_memory
due to the switch to memseg lists as opposed to memsegs. Every other API
change is internal to EAL, and all of the memory allocation/freeing is
handled through rte_malloc, with no externally visible API changes.
* Malloc heap adjusted to handle holes in address space
* Single memseg list replaced by multiple memseg lists
* VA space for hugepages is preallocated in advance
* Added alloc/free for pages happening as needed on rte_malloc/rte_free
* Added contiguous memory allocation APIs for rte_memzone
* Integrated Pawel Wodkowski's patch for registering/unregistering memory
with VFIO [8]
* Callbacks for registering memory allocations
* Multiprocess support done via DPDK IPC introduced in 18.02
The biggest difference is that a "memseg" now represents a single page (as
opposed to being a big contiguous block of pages). As a consequence, both
memzones and malloc elements are no longer guaranteed to be physically
contiguous, unless the user asks for it at reserve time. To preserve
whatever functionality was dependent on the previous behavior, a legacy
memory option is also provided; however, it is expected (or perhaps vainly
hoped) to be a temporary solution.
Why multiple memseg lists instead of one? Since a memseg is a single page
now, the list of memsegs will get quite big, and we need to locate pages
somehow when we allocate and free them. We could of course just walk one
flat list backed by a single contiguous chunk of VA space, but this
implementation uses separate lists instead in order to speed up many
operations on memsegs.
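As a rough sketch of the kind of lookup this enables (hypothetical,
simplified types and names, not the actual EAL structures):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* hypothetical, simplified view of one memseg list */
struct memseg_list {
	void *base_va;   /* start of the preallocated VA chunk */
	size_t page_sz;  /* page size backing this list */
	size_t len;      /* total VA length covered by the list */
};

/* return the memseg index for addr, or -1 if addr is not in this list */
static int
find_memseg_idx(const struct memseg_list *msl, const void *addr)
{
	uintptr_t start = (uintptr_t)msl->base_va;
	uintptr_t a = (uintptr_t)addr;

	if (a < start || a >= start + msl->len)
		return -1;
	/* each memseg is exactly one page, so the offset divided by the
	 * page size is the position of the page within the list
	 */
	return (int)((a - start) / msl->page_sz);
}

int main(void)
{
	static char backing[8 * 4096]; /* pretend VA chunk of 8 4K pages */
	struct memseg_list msl = { backing, 4096, sizeof(backing) };

	printf("idx=%d\n", find_memseg_idx(&msl, backing + 5 * 4096 + 100));
	return 0;
}

Splitting into per-page-size, per-socket lists keeps each such VA range
small and homogeneous, so the arithmetic above is all that is needed.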
- FreeBSD does not even compile, let alone run
- No 32-bit support
- There are some minor quality-of-life improvements planned that aren't
ready yet and will be part of v2
- VFIO support is only smoke-tested (but is expected to work), VFIO support
with secondary processes is not tested; work is ongoing to validate VFIO
for all use cases
- Dynamic mapping/unmapping memory with VFIO is not supported in sPAPR
IOMMU mode - help from sPAPR maintainers requested
Nevertheless, this patchset should be testable under 64-bit Linux, and
should work for all use cases bar those mentioned above.
[1] http://dpdk.org/dev/patchwork/bundle/aburakov/Memory_RFC/
[2] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Fixes/
[3] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Improvements/
[4] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Async_Request/
[5] http://dpdk.org/dev/patchwork/bundle/aburakov/Num_Sockets/
[6] http://dpdk.org/dev/patchwork/patch/34002/
[7] http://dpdk.org/dev/patchwork/patch/33853/
[8] http://dpdk.org/dev/patchwork/patch/24484/
eal: move get_virtual_area out of linuxapp eal_memory.c
eal: move all locking to heap
eal: make malloc heap a doubly-linked list
eal: add function to dump malloc heap contents
test: add command to dump malloc heap contents
eal: make malloc_elem_join_adjacent_free public
eal: make malloc free list remove public
eal: make malloc free return resulting malloc element
eal: add rte_fbarray
eal: add "single file segments" command-line option
eal: add "legacy memory" option
eal: read hugepage counts from node-specific sysfs path
eal: replace memseg with memseg lists
eal: add support for mapping hugepages at runtime
eal: add support for unmapping pages at runtime
eal: make use of memory hotplug for init
eal: enable memory hotplug support in rte_malloc
test: fix malloc autotest to support memory hotplug
eal: add API to check if memory is contiguous
eal: add backend support for contiguous allocation
eal: enable reserving physically contiguous memzones
eal: replace memzone array with fbarray
mempool: add support for the new allocation methods
vfio: allow to map other memory regions
eal: map/unmap memory with VFIO when alloc/free pages
eal: prepare memseg lists for multiprocess sync
eal: add multiprocess init with memory hotplug
eal: add support for multiprocess memory hotplug
eal: add support for callbacks on memory hotplug
eal: enable callbacks on malloc/free and mp sync
ethdev: use contiguous allocation for DMA memory
crypto/qat: use contiguous allocation for DMA memory
net/avf: use contiguous allocation for DMA memory
net/bnx2x: use contiguous allocation for DMA memory
net/cxgbe: use contiguous allocation for DMA memory
net/ena: use contiguous allocation for DMA memory
net/enic: use contiguous allocation for DMA memory
net/i40e: use contiguous allocation for DMA memory
net/qede: use contiguous allocation for DMA memory
net/virtio: use contiguous allocation for DMA memory
net/vmxnet3: use contiguous allocation for DMA memory
config/common_base | 15 +-
drivers/bus/pci/linux/pci.c | 29 +-
drivers/crypto/qat/qat_qp.c | 4 +-
drivers/net/avf/avf_ethdev.c | 2 +-
drivers/net/bnx2x/bnx2x.c | 2 +-
drivers/net/bnx2x/bnx2x_rxtx.c | 3 +-
drivers/net/cxgbe/sge.c | 3 +-
drivers/net/ena/base/ena_plat_dpdk.h | 7 +-
drivers/net/ena/ena_ethdev.c | 10 +-
drivers/net/enic/enic_main.c | 4 +-
drivers/net/i40e/i40e_ethdev.c | 2 +-
drivers/net/i40e/i40e_rxtx.c | 2 +-
drivers/net/qede/base/bcm_osal.c | 5 +-
drivers/net/virtio/virtio_ethdev.c | 8 +-
drivers/net/virtio/virtio_user/vhost_kernel.c | 108 ++-
drivers/net/vmxnet3/vmxnet3_ethdev.c | 7 +-
lib/librte_eal/bsdapp/eal/Makefile | 4 +
lib/librte_eal/bsdapp/eal/eal.c | 25 +
lib/librte_eal/bsdapp/eal/eal_hugepage_info.c | 7 +
lib/librte_eal/bsdapp/eal/eal_memalloc.c | 33 +
lib/librte_eal/bsdapp/eal/meson.build | 1 +
lib/librte_eal/common/Makefile | 2 +-
lib/librte_eal/common/eal_common_fbarray.c | 859 +++++++++++++++++
lib/librte_eal/common/eal_common_memalloc.c | 181 ++++
lib/librte_eal/common/eal_common_memory.c | 512 +++++++++-
lib/librte_eal/common/eal_common_memzone.c | 275 ++++--
lib/librte_eal/common/eal_common_options.c | 8 +
lib/librte_eal/common/eal_filesystem.h | 13 +
lib/librte_eal/common/eal_hugepages.h | 7 +
lib/librte_eal/common/eal_internal_cfg.h | 10 +-
lib/librte_eal/common/eal_memalloc.h | 41 +
lib/librte_eal/common/eal_options.h | 4 +
lib/librte_eal/common/eal_private.h | 33 +
lib/librte_eal/common/include/rte_eal_memconfig.h | 29 +-
lib/librte_eal/common/include/rte_fbarray.h | 352 +++++++
lib/librte_eal/common/include/rte_malloc.h | 9 +
lib/librte_eal/common/include/rte_malloc_heap.h | 6 +
lib/librte_eal/common/include/rte_memory.h | 79 +-
lib/librte_eal/common/include/rte_memzone.h | 155 ++-
lib/librte_eal/common/include/rte_vfio.h | 39 +
lib/librte_eal/common/malloc_elem.c | 436 +++++++--
lib/librte_eal/common/malloc_elem.h | 41 +-
lib/librte_eal/common/malloc_heap.c | 694 +++++++++++++-
lib/librte_eal/common/malloc_heap.h | 15 +-
lib/librte_eal/common/malloc_mp.c | 723 ++++++++++++++
lib/librte_eal/common/malloc_mp.h | 86 ++
lib/librte_eal/common/meson.build | 4 +
lib/librte_eal/common/rte_malloc.c | 75 +-
lib/librte_eal/linuxapp/eal/Makefile | 5 +
lib/librte_eal/linuxapp/eal/eal.c | 102 +-
lib/librte_eal/linuxapp/eal/eal_hugepage_info.c | 155 ++-
lib/librte_eal/linuxapp/eal/eal_memalloc.c | 1049 +++++++++++++++++++++
lib/librte_eal/linuxapp/eal/eal_memory.c | 516 ++++++----
lib/librte_eal/linuxapp/eal/eal_vfio.c | 318 +++++--
lib/librte_eal/linuxapp/eal/eal_vfio.h | 11 +
lib/librte_eal/linuxapp/eal/meson.build | 1 +
lib/librte_eal/rte_eal_version.map | 23 +-
lib/librte_ether/rte_ethdev.c | 3 +-
lib/librte_mempool/rte_mempool.c | 87 +-
test/test/commands.c | 3 +
test/test/test_malloc.c | 71 +-
test/test/test_memory.c | 43 +-
test/test/test_memzone.c | 26 +-
63 files changed, 6631 insertions(+), 751 deletions(-)
create mode 100644 lib/librte_eal/bsdapp/eal/eal_memalloc.c
create mode 100644 lib/librte_eal/common/eal_common_fbarray.c
create mode 100644 lib/librte_eal/common/eal_common_memalloc.c
create mode 100644 lib/librte_eal/common/eal_memalloc.h
create mode 100644 lib/librte_eal/common/include/rte_fbarray.h
create mode 100644 lib/librte_eal/common/malloc_mp.c
create mode 100644 lib/librte_eal/common/malloc_mp.h
create mode 100644 lib/librte_eal/linuxapp/eal/eal_memalloc.c
Tan, Jianfeng
2018-03-23 15:44:43 UTC
Permalink
Post by Anatoly Burakov
This enables multiprocess synchronization for memory hotplug
requests at runtime (as opposed to initialization).
The basic workflow is as follows. The primary process always does the
initial mapping and unmapping, and secondary processes always follow the
primary's page map. Only one allocation request can be active at any one
time. When the primary allocates memory, it ensures that all other
processes have allocated the same set of hugepages successfully; otherwise
any allocations made are rolled back, and the space is freed back to the
heap. The heap is locked throughout the process, so no race conditions
can happen.
When the primary frees memory, it frees the heap space, deallocates the
affected pages, and notifies other processes of the deallocations. Since
the heap no longer references that memory chunk, the area basically
becomes invisible to other processes even if they happen to fail to unmap
that specific set of pages, so it is completely safe to ignore the results
of sync requests.
When a secondary allocates memory, it does not do so by itself.
Instead, it sends a request to the primary process to try and allocate
pages of the specified size and on the specified socket, such that a
specified heap allocation request could complete. The primary process
then sends all secondaries (including the requestor) a separate
notification of the allocated pages, and expects all secondary
processes to report success before considering the pages "allocated".
Only after the primary process ensures that all memory has been
successfully allocated in all secondary processes will it respond
positively to the initial request and let the secondary proceed with
the allocation. Since the heap now has memory that can satisfy the
allocation request, and it was locked all this time (so no other
allocations could take place), the secondary process will be able to
allocate memory from the heap.
When a secondary frees memory, it hides the pages to be deallocated from
the heap. Then, it sends a deallocation request to the primary process,
so that it deallocates the pages itself, and then sends a separate sync
request to all other processes (including the requestor) to unmap
the same pages. This way, even if the secondary fails to notify other
processes of this deallocation, that memory will become invisible
to other processes, and will not be allocated from again.
So, to summarize: address space will only become part of the heap
if the primary process can ensure that all other processes have
allocated this memory successfully. If anything goes wrong, the
worst that can happen is that a page will "leak" and will not be
available to either DPDK or the system, as some process will still
hold onto it. It's not an actual leak, as we can account for the
page; it's just that none of the processes will be able to use this
page for anything useful until the primary allocates from it again.
Due to the underlying DPDK IPC implementation being single-threaded,
some asynchronous magic had to be done, as we need to complete
several requests before we can definitively allow a secondary process
to use the allocated memory (namely, it has to be present in all other
secondary processes before it can be used). Additionally, only
one allocation request is allowed to be submitted at a time.
Memory allocation requests are only allowed when there are no
secondary processes currently initializing. To enforce that,
a shared rwlock is used: it is taken for reading on init (so that
several secondaries can initialize concurrently) and for writing when
making allocation requests (so that either secondary init will
have to wait, or the allocation request will have to wait until all
processes have initialized).
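Purely as an illustration of that allocation round-trip, with stub
functions standing in for the real IPC and heap calls (so this is a
sketch, not the malloc_mp.c code):

#include <stdbool.h>
#include <stdio.h>

/* stand-ins for the real IPC/heap primitives -- purely illustrative */
static bool primary_alloc_pages(void)    { return true; }
static void primary_rollback_pages(void) { puts("rolling back"); }
static int  broadcast_sync(void)         { return 2; /* acks received */ }
static int  num_secondaries(void)        { return 2; }

/* the primary's view of one secondary allocation request */
static bool
handle_alloc_request(void)
{
	/* the heap stays locked for the whole exchange, so only one
	 * request can be in flight at a time
	 */
	if (!primary_alloc_pages())
		return false;

	/* every secondary (including the requestor) must map the same
	 * pages before the heap is allowed to hand them out
	 */
	if (broadcast_sync() != num_secondaries()) {
		primary_rollback_pages();
		return false;
	}
	return true; /* the requestor may now allocate from the heap */
}

int main(void)
{
	printf("request %s\n", handle_alloc_request() ? "granted" : "rejected");
	return 0;
}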
---
v2: - fixed deadlocking on init problem
- reverted rte_panic changes (fixed by changes in IPC instead)
This problem is evidently complex to solve without a multithreaded
IPC implementation. An alternative approach would be to process
each individual message in its own thread (or at least spawn a
thread per incoming request); that way, we can send requests
while responding to another request, and this problem becomes
trivial to solve (and in fact it was solved that way initially,
before my aversion to certain other programming languages kicked
in).
Is the added complexity worth saving a couple of thread spin-ups
here and there?
lib/librte_eal/bsdapp/eal/Makefile | 1 +
lib/librte_eal/common/eal_common_memory.c | 16 +-
lib/librte_eal/common/include/rte_eal_memconfig.h | 3 +
lib/librte_eal/common/malloc_heap.c | 255 ++++++--
lib/librte_eal/common/malloc_mp.c | 723 ++++++++++++++++++++++
lib/librte_eal/common/malloc_mp.h | 86 +++
lib/librte_eal/common/meson.build | 1 +
lib/librte_eal/linuxapp/eal/Makefile | 1 +
8 files changed, 1040 insertions(+), 46 deletions(-)
create mode 100644 lib/librte_eal/common/malloc_mp.c
create mode 100644 lib/librte_eal/common/malloc_mp.h
...
Post by Anatoly Burakov
+/* callback for asynchronous sync requests for primary. this will either do a
+ * sendmsg with results, or trigger rollback request.
+ */
+static int
+handle_sync_response(const struct rte_mp_msg *request,
Rename to handle_async_response()?
santosh
2018-03-24 06:01:21 UTC
Permalink
Hi Anatoly,
Post by Anatoly Burakov
Before, we were aggregating multiple pages into one memseg, so the
number of memsegs was small. Now, each page gets its own memseg,
so the list of memsegs is huge. To accommodate the new memseg list
size and to keep the under-the-hood workings sane, the memseg list
is now not just a single list, but multiple lists. To be precise,
each hugepage size available on the system gets one or more memseg
lists, per socket.
In order to support dynamic memory allocation, we reserve all
memory in advance. As in, we do an anonymous mmap() of the entire
maximum size of memory per hugepage size, per socket (which is
limited to either RTE_MAX_MEMSEG_PER_TYPE pages or
RTE_MAX_MEM_PER_TYPE gigabytes worth of memory, whichever is the
smaller one), split over multiple lists (which are limited to
either RTE_MAX_MEMSEG_PER_LIST memsegs or RTE_MAX_MEM_PER_LIST
gigabytes per list, whichever is the smaller one).
So, for each hugepage size, we get (by default) up to 128G worth
of memory, per socket, split into chunks of up to 32G in size.
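For a feel of the numbers, plugging in the caps above (this ignores the
per-list memseg count limit, which may kick in first):

#include <stdio.h>

int main(void)
{
	const unsigned long long list_cap = 32ULL << 30;  /* 32G per list */
	const unsigned long long type_cap = 128ULL << 30; /* 128G per type */
	const unsigned long long pg_2m = 2ULL << 20;
	const unsigned long long pg_1g = 1ULL << 30;

	/* 128G / 32G = 4 lists per hugepage size, per socket */
	printf("lists per type: %llu\n", type_cap / list_cap);
	/* 32G of 2M pages = 16384 memsegs per list */
	printf("2M segs per list: %llu\n", list_cap / pg_2m);
	/* 32G of 1G pages = 32 memsegs per list */
	printf("1G segs per list: %llu\n", list_cap / pg_1g);
	return 0;
}

Under the 32G cap alone, a 2M-page list would hold 16384 memsegs and a
1G-page list 32.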
The address space is claimed at the start, in eal_common_memory.c.
The actual page allocation code is in eal_memalloc.c (Linux-only
for now), and largely consists of copied EAL memory init code.
Pages in the list are also indexed by address. That is, for
non-legacy mode, in order to figure out where a page belongs,
one can simply look at the base address of a memseg list. Similarly,
figuring out the IOVA address of a memzone is a matter of finding the
right memseg list, getting the offset and dividing by the page size to
get the appropriate memseg. For legacy mode, the old behavior of walking
the memseg list remains.
Due to the switch to fbarray and to avoid any intrusive changes,
secondary processes are not supported in this commit. Also, one
particular API call (dump physmem layout) no longer makes sense
and was removed, according to the deprecation notice [1].
In legacy mode, nothing is preallocated, and all memsegs are in
a list like before, but each segment still resides in an appropriate
memseg list.
The rest of the changes are really ripple effects from the memseg
change - heap changes, compile fixes, and rewrites to support
fbarray-backed memseg lists.
[1] http://dpdk.org/dev/patchwork/patch/34002/
---
Thanks for the good work!
A few observations:
# Noticed a performance regression on the thunderx platform for the l3fwd
application; performance drops by 3%. git bisect shows this changeset is
the offending commit. I'm still investigating the reason for the perf dip.
Would like to know: have you noticed any regression on the x86 platform?

# In the next version, please make sure that each individual patch builds
successfully. Right now, some patches are dependent on others, which leads
to build breaks, observed while git-bisecting.
Post by Anatoly Burakov
fa71cdef6963ed795fdd7e7f35085170bb300e39
1037fcd989176c5cc83db6223534205cac469765
befdec10759d30275a17a829919ee45228d91d3c
495e60f4e02af8a344c0f817a60d1ee9b9322df4
[above commits are from your github repo..]

# Nits:
Perhaps you could club all the commits below into one single patch, as the
changes are identical; that way you'd reduce the patch count by a few.
9a1e2a7bd9f6248c680ad3e444b6f173eb92d457 net/vmxnet3: use contiguous allocation for DMA memory
46388b194cd559b5cf7079e01b04bf67a99b64d7 net/virtio: use contiguous allocation for DMA memory
a3d2eb10bd998ba3ae3a3d39adeaff38d2e53a9d net/qede: use contiguous allocation for DMA memory
6f16b23ef1f472db475edf05159dea5ae741dbf8 net/i40e: use contiguous allocation for DMA memory
f9f7576eed35cb6aa50793810cdda43bcc0f4642 net/enic: use contiguous allocation for DMA memory
2af6c33009b8008da7028a351efed2932b1a13d0 net/ena: use contiguous allocation for DMA memory
18003e22bd7087e5e2e03543cb662d554f7bec52 net/cxgbe: use contiguous allocation for DMA memory
59f79182502dcb3634dfa3e7b918195829777460 net/bnx2x: use contiguous allocation for DMA memory
f481a321e41da82ddfa00f5ddbcb42fc29e6ae76 net/avf: use contiguous allocation for DMA memory
5253e9b757c1855a296656d939f5c28e651fea69 crypto/qat: use contiguous allocation for DMA memory
297ab037b4c0d9d725aa6cfdd2c33f7cd9396899 ethdev: use contiguous allocation for DMA memory

Thanks.
