Discussion:
[dpdk-dev] [PATCH v2 01/41] eal: move get_virtual_area out of linuxapp eal_memory.c
Anatoly Burakov
2018-03-07 16:56:29 UTC
Move get_virtual_area out of linuxapp EAL memory and make it
common to EAL, so that other code can reserve virtual areas
as well.
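
For reference, a minimal usage sketch of the new helper (not part of
the patch), based on the eal_get_virtual_area() declaration and the
EAL_VIRTUAL_AREA_* flags added to eal_private.h below; the size and
page size used here are hypothetical:

#include <stdint.h>
#include "eal_private.h" /* eal_get_virtual_area(), EAL_VIRTUAL_AREA_* */

/* Reserve ~128 MB of address space aligned to 2 MB pages, allowing the
 * reservation to shrink if the full size cannot be mapped. Passing NULL
 * as the requested address lets EAL pick one (or use --base-virtaddr).
 */
static void *
reserve_va_sketch(void)
{
    uint64_t size = 128 * 1024 * 1024;
    uint64_t page_sz = 2 * 1024 * 1024;

    return eal_get_virtual_area(NULL, &size, page_sz,
            EAL_VIRTUAL_AREA_ALLOW_SHRINK, 0);
}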

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/eal_common_memory.c | 101 ++++++++++++++++++++++
lib/librte_eal/common/eal_private.h | 33 +++++++
lib/librte_eal/linuxapp/eal/eal_memory.c | 137 ++++++------------------------
3 files changed, 161 insertions(+), 110 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index 852f3bb..042881b 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -2,10 +2,12 @@
* Copyright(c) 2010-2014 Intel Corporation
*/

+#include <errno.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdarg.h>
+#include <string.h>
#include <unistd.h>
#include <inttypes.h>
#include <sys/mman.h>
@@ -14,12 +16,111 @@
#include <rte_memory.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
+#include <rte_errno.h>
#include <rte_log.h>

#include "eal_private.h"
#include "eal_internal_cfg.h"

/*
+ * Try to reserve *size bytes of anonymous virtual memory. On success,
+ * return the mapped address and keep *size unmodified. If shrinking is
+ * allowed, retry with a smaller zone, decreasing *size by page_sz until
+ * it reaches 0, in which case return NULL. Note: unless alignment is
+ * skipped, the returned address is aligned to page_sz.
+ */
+
+static uint64_t baseaddr_offset;
+static uint64_t system_page_sz;
+
+void *
+eal_get_virtual_area(void *requested_addr, uint64_t *size,
+ uint64_t page_sz, int flags, int mmap_flags)
+{
+ bool addr_is_hint, allow_shrink, unmap, no_align;
+ uint64_t map_sz;
+ void *mapped_addr, *aligned_addr;
+
+ if (system_page_sz == 0)
+ system_page_sz = sysconf(_SC_PAGESIZE);
+
+ mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS;
+
+ RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);
+
+ addr_is_hint = (flags & EAL_VIRTUAL_AREA_ADDR_IS_HINT) > 0;
+ allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0;
+ unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0;
+
+ if (requested_addr == NULL && internal_config.base_virtaddr != 0) {
+ requested_addr = (void *) (internal_config.base_virtaddr +
+ baseaddr_offset);
+ requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz);
+ addr_is_hint = true;
+ }
+
+ /* if requested address is not aligned by page size, or if requested
+ * address is NULL, add page size to requested length as we may get an
+ * address that's aligned by system page size, which can be smaller than
+ * our requested page size. additionally, we shouldn't try to align if
+ * system page size is the same as requested page size.
+ */
+ no_align = (requested_addr != NULL &&
+ ((uintptr_t)requested_addr & (page_sz - 1)) == 0) ||
+ page_sz == system_page_sz;
+
+ do {
+ map_sz = no_align ? *size : *size + page_sz;
+
+ mapped_addr = mmap(requested_addr, map_sz, PROT_READ,
+ mmap_flags, -1, 0);
+ if (mapped_addr == MAP_FAILED && allow_shrink)
+ *size -= page_sz;
+ } while (allow_shrink && mapped_addr == MAP_FAILED && *size > 0);
+
+ /* align resulting address - if map failed, we will ignore the value
+ * anyway, so no need to add additional checks.
+ */
+ aligned_addr = no_align ? mapped_addr :
+ RTE_PTR_ALIGN(mapped_addr, page_sz);
+
+ if (*size == 0) {
+ RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n",
+ strerror(errno));
+ rte_errno = errno;
+ return NULL;
+ } else if (mapped_addr == MAP_FAILED) {
+ RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
+ strerror(errno));
+ /* pass errno up the call chain */
+ rte_errno = errno;
+ return NULL;
+ } else if (requested_addr != NULL && !addr_is_hint &&
+ aligned_addr != requested_addr) {
+ RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n",
+ requested_addr, aligned_addr);
+ munmap(mapped_addr, map_sz);
+ rte_errno = EADDRNOTAVAIL;
+ return NULL;
+ } else if (requested_addr != NULL && addr_is_hint &&
+ aligned_addr != requested_addr) {
+ RTE_LOG(WARNING, EAL, "WARNING! Base virtual address hint (%p != %p) not respected!\n",
+ requested_addr, aligned_addr);
+ RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory into secondary processes\n");
+ }
+
+ if (unmap)
+ munmap(mapped_addr, map_sz);
+
+ RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n",
+ aligned_addr, *size);
+
+ baseaddr_offset += *size;
+
+ return aligned_addr;
+}
+
+/*
* Return a pointer to a read-only table of struct rte_physmem_desc
* elements, containing the layout of all addressable physical
* memory. The last element of the table contains a NULL address.
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index 0b28770..96cebb7 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -127,6 +127,39 @@ int rte_eal_alarm_init(void);
int rte_eal_check_module(const char *module_name);

/**
+ * Get virtual area of specified size from the OS.
+ *
+ * This function is private to the EAL.
+ *
+ * @param requested_addr
+ * Address where to request address space.
+ * @param size
+ * Size of requested area.
+ * @param page_sz
+ * Page size on which to align requested virtual area.
+ * @param flags
+ * EAL_VIRTUAL_AREA_* flags.
+ * @param mmap_flags
+ * Extra flags passed directly to mmap().
+ *
+ * @return
+ * Virtual area address if successful.
+ * NULL if unsuccessful.
+ */
+
+#define EAL_VIRTUAL_AREA_ADDR_IS_HINT (1 << 0)
+/**< don't fail if cannot get exact requested address. */
+#define EAL_VIRTUAL_AREA_ALLOW_SHRINK (1 << 1)
+/**< try getting smaller sized (decrement by page size) virtual areas if cannot
+ * get area of requested size.
+ */
+#define EAL_VIRTUAL_AREA_UNMAP (1 << 2)
+/**< immediately unmap reserved virtual area. */
+void *
+eal_get_virtual_area(void *requested_addr, uint64_t *size,
+ uint64_t page_sz, int flags, int mmap_flags);
+
+/**
* Get cpu core_id.
*
* This function is private to the EAL.
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 38853b7..5c11d77 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -28,6 +28,7 @@
#include <numaif.h>
#endif

+#include <rte_errno.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_launch.h>
@@ -57,8 +58,6 @@
* zone as well as a physical contiguous zone.
*/

-static uint64_t baseaddr_offset;
-
static bool phys_addrs_available = true;

#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
@@ -221,82 +220,6 @@ aslr_enabled(void)
}
}

-/*
- * Try to mmap *size bytes in /dev/zero. If it is successful, return the
- * pointer to the mmap'd area and keep *size unmodified. Else, retry
- * with a smaller zone: decrease *size by hugepage_sz until it reaches
- * 0. In this case, return NULL. Note: this function returns an address
- * which is a multiple of hugepage size.
- */
-static void *
-get_virtual_area(size_t *size, size_t hugepage_sz)
-{
- void *addr;
- void *addr_hint;
- int fd;
- long aligned_addr;
-
- if (internal_config.base_virtaddr != 0) {
- int page_size = sysconf(_SC_PAGE_SIZE);
- addr_hint = (void *) (uintptr_t)
- (internal_config.base_virtaddr + baseaddr_offset);
- addr_hint = RTE_PTR_ALIGN_FLOOR(addr_hint, page_size);
- } else {
- addr_hint = NULL;
- }
-
- RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);
-
-
- fd = open("/dev/zero", O_RDONLY);
- if (fd < 0){
- RTE_LOG(ERR, EAL, "Cannot open /dev/zero\n");
- return NULL;
- }
- do {
- addr = mmap(addr_hint, (*size) + hugepage_sz, PROT_READ,
-#ifdef RTE_ARCH_PPC_64
- MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-#else
- MAP_PRIVATE,
-#endif
- fd, 0);
- if (addr == MAP_FAILED) {
- *size -= hugepage_sz;
- } else if (addr_hint != NULL && addr != addr_hint) {
- RTE_LOG(WARNING, EAL, "WARNING! Base virtual address "
- "hint (%p != %p) not respected!\n",
- addr_hint, addr);
- RTE_LOG(WARNING, EAL, " This may cause issues with "
- "mapping memory into secondary processes\n");
- }
- } while (addr == MAP_FAILED && *size > 0);
-
- if (addr == MAP_FAILED) {
- close(fd);
- RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
- strerror(errno));
- return NULL;
- }
-
- munmap(addr, (*size) + hugepage_sz);
- close(fd);
-
- /* align addr to a huge page size boundary */
- aligned_addr = (long)addr;
- aligned_addr += (hugepage_sz - 1);
- aligned_addr &= (~(hugepage_sz - 1));
- addr = (void *)(aligned_addr);
-
- RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n",
- addr, *size);
-
- /* increment offset */
- baseaddr_offset += *size;
-
- return addr;
-}
-
static sigjmp_buf huge_jmpenv;

static void huge_sigbus_handler(int signo __rte_unused)
@@ -445,7 +368,16 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
/* get the biggest virtual memory area up to
* vma_len. If it fails, vma_addr is NULL, so
* let the kernel provide the address. */
- vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz);
+ vma_addr = eal_get_virtual_area(NULL, &vma_len,
+ hpi->hugepage_sz,
+ EAL_VIRTUAL_AREA_ALLOW_SHRINK |
+ EAL_VIRTUAL_AREA_UNMAP,
+#ifdef RTE_ARCH_PPC_64
+ MAP_HUGETLB
+#else
+ 0
+#endif
+ );
if (vma_addr == NULL)
vma_len = hugepage_sz;
}
@@ -1339,7 +1271,7 @@ rte_eal_hugepage_attach(void)
unsigned i, s = 0; /* s used to track the segment number */
unsigned max_seg = RTE_MAX_MEMSEG;
off_t size = 0;
- int fd, fd_zero = -1, fd_hugepage = -1;
+ int fd, fd_hugepage = -1;

if (aslr_enabled() > 0) {
RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization "
@@ -1350,11 +1282,6 @@ rte_eal_hugepage_attach(void)

test_phys_addrs_available();

- fd_zero = open("/dev/zero", O_RDONLY);
- if (fd_zero < 0) {
- RTE_LOG(ERR, EAL, "Could not open /dev/zero\n");
- goto error;
- }
fd_hugepage = open(eal_hugepage_info_path(), O_RDONLY);
if (fd_hugepage < 0) {
RTE_LOG(ERR, EAL, "Could not open %s\n", eal_hugepage_info_path());
@@ -1364,6 +1291,8 @@ rte_eal_hugepage_attach(void)
/* map all segments into memory to make sure we get the addrs */
for (s = 0; s < RTE_MAX_MEMSEG; ++s) {
void *base_addr;
+ uint64_t mmap_sz;
+ int mmap_flags = 0;

/*
* the first memory segment with len==0 is the one that
@@ -1372,35 +1301,26 @@ rte_eal_hugepage_attach(void)
if (mcfg->memseg[s].len == 0)
break;

- /*
- * fdzero is mmapped to get a contiguous block of virtual
- * addresses of the appropriate memseg size.
- * use mmap to get identical addresses as the primary process.
+ /* get identical addresses as the primary process.
*/
- base_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len,
- PROT_READ,
#ifdef RTE_ARCH_PPC_64
- MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-#else
- MAP_PRIVATE,
+ mmap_flags |= MAP_HUGETLB;
#endif
- fd_zero, 0);
- if (base_addr == MAP_FAILED ||
- base_addr != mcfg->memseg[s].addr) {
+ mmap_sz = mcfg->memseg[s].len;
+ base_addr = eal_get_virtual_area(mcfg->memseg[s].addr,
+ &mmap_sz, mcfg->memseg[s].hugepage_sz, 0,
+ mmap_flags);
+ if (base_addr == NULL) {
max_seg = s;
- if (base_addr != MAP_FAILED) {
- /* errno is stale, don't use */
- RTE_LOG(ERR, EAL, "Could not mmap %llu bytes "
- "in /dev/zero at [%p], got [%p] - "
- "please use '--base-virtaddr' option\n",
+ if (rte_errno == EADDRNOTAVAIL) {
+ RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n",
(unsigned long long)mcfg->memseg[s].len,
- mcfg->memseg[s].addr, base_addr);
- munmap(base_addr, mcfg->memseg[s].len);
+ mcfg->memseg[s].addr);
} else {
- RTE_LOG(ERR, EAL, "Could not mmap %llu bytes "
- "in /dev/zero at [%p]: '%s'\n",
+ RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p]: '%s'\n",
(unsigned long long)mcfg->memseg[s].len,
- mcfg->memseg[s].addr, strerror(errno));
+ mcfg->memseg[s].addr,
+ rte_strerror(rte_errno));
}
if (aslr_enabled() > 0) {
RTE_LOG(ERR, EAL, "It is recommended to "
@@ -1465,7 +1385,6 @@ rte_eal_hugepage_attach(void)
}
/* unmap the hugepage config file, since we are done using it */
munmap(hp, size);
- close(fd_zero);
close(fd_hugepage);
return 0;

@@ -1474,8 +1393,6 @@ rte_eal_hugepage_attach(void)
munmap(mcfg->memseg[i].addr, mcfg->memseg[i].len);
if (hp != NULL && hp != MAP_FAILED)
munmap(hp, size);
- if (fd_zero >= 0)
- close(fd_zero);
if (fd_hugepage >= 0)
close(fd_hugepage);
return -1;
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:31 UTC
In preparation for dynamic memory allocation, we need to be able to
handle holes in our malloc heap, so switch the heap to a doubly linked
list and prepare the infrastructure to support it.

Since the heap now knows where its first and last elements are, there
is no longer any need for a dummy element at the end of each heap, so
get rid of that as well. Instead, let the insert/remove/join/split
operations handle end-of-list conditions automatically.
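
As an illustration (not part of the patch), a minimal sketch of walking
every element of a heap in address order using the new first/next
pointers; the caller is assumed to hold heap->lock:

#include <stdio.h>
#include "malloc_elem.h"
#include "malloc_heap.h"

/* Print every element in the heap, free or not, in address order. */
static void
walk_heap_sketch(struct malloc_heap *heap)
{
    struct malloc_elem *elem;

    for (elem = heap->first; elem != NULL; elem = elem->next)
        printf("elem %p, size 0x%zx, %s\n", (void *)elem, elem->size,
                elem->state == ELEM_FREE ? "free" : "not free");
}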

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/include/rte_malloc_heap.h | 6 +
lib/librte_eal/common/malloc_elem.c | 200 +++++++++++++++++++-----
lib/librte_eal/common/malloc_elem.h | 14 +-
lib/librte_eal/common/malloc_heap.c | 8 +-
4 files changed, 179 insertions(+), 49 deletions(-)

diff --git a/lib/librte_eal/common/include/rte_malloc_heap.h b/lib/librte_eal/common/include/rte_malloc_heap.h
index ba99ed9..9ec4b62 100644
--- a/lib/librte_eal/common/include/rte_malloc_heap.h
+++ b/lib/librte_eal/common/include/rte_malloc_heap.h
@@ -13,12 +13,18 @@
/* Number of free lists per heap, grouped by size. */
#define RTE_HEAP_NUM_FREELISTS 13

+/* dummy definition, for pointers */
+struct malloc_elem;
+
/**
* Structure to hold malloc heap
*/
struct malloc_heap {
rte_spinlock_t lock;
LIST_HEAD(, malloc_elem) free_head[RTE_HEAP_NUM_FREELISTS];
+ struct malloc_elem *first;
+ struct malloc_elem *last;
+
unsigned alloc_count;
size_t total_size;
} __rte_cache_aligned;
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index ea041e2..eb41200 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -31,6 +31,7 @@ malloc_elem_init(struct malloc_elem *elem,
elem->heap = heap;
elem->ms = ms;
elem->prev = NULL;
+ elem->next = NULL;
memset(&elem->free_list, 0, sizeof(elem->free_list));
elem->state = ELEM_FREE;
elem->size = size;
@@ -39,15 +40,56 @@ malloc_elem_init(struct malloc_elem *elem,
set_trailer(elem);
}

-/*
- * Initialize a dummy malloc_elem header for the end-of-memseg marker
- */
void
-malloc_elem_mkend(struct malloc_elem *elem, struct malloc_elem *prev)
+malloc_elem_insert(struct malloc_elem *elem)
{
- malloc_elem_init(elem, prev->heap, prev->ms, 0);
- elem->prev = prev;
- elem->state = ELEM_BUSY; /* mark busy so its never merged */
+ struct malloc_elem *prev_elem, *next_elem;
+ struct malloc_heap *heap = elem->heap;
+
+ if (heap->first == NULL && heap->last == NULL) {
+ /* if empty heap */
+ heap->first = elem;
+ heap->last = elem;
+ prev_elem = NULL;
+ next_elem = NULL;
+ } else if (elem < heap->first) {
+ /* if lower than start */
+ prev_elem = NULL;
+ next_elem = heap->first;
+ heap->first = elem;
+ } else if (elem > heap->last) {
+ /* if higher than end */
+ prev_elem = heap->last;
+ next_elem = NULL;
+ heap->last = elem;
+ } else {
+ /* the new memory is somewhere inbetween start and end */
+ uint64_t dist_from_start, dist_from_end;
+
+ dist_from_end = RTE_PTR_DIFF(heap->last, elem);
+ dist_from_start = RTE_PTR_DIFF(elem, heap->first);
+
+ /* check which is closer, and find closest list entries */
+ if (dist_from_start < dist_from_end) {
+ prev_elem = heap->first;
+ while (prev_elem->next < elem)
+ prev_elem = prev_elem->next;
+ next_elem = prev_elem->next;
+ } else {
+ next_elem = heap->last;
+ while (next_elem->prev > elem)
+ next_elem = next_elem->prev;
+ prev_elem = next_elem->prev;
+ }
+ }
+
+ /* insert new element */
+ elem->prev = prev_elem;
+ elem->next = next_elem;
+ if (prev_elem)
+ prev_elem->next = elem;
+ if (next_elem)
+ next_elem->prev = elem;
}

/*
@@ -98,18 +140,58 @@ malloc_elem_can_hold(struct malloc_elem *elem, size_t size, unsigned align,
static void
split_elem(struct malloc_elem *elem, struct malloc_elem *split_pt)
{
- struct malloc_elem *next_elem = RTE_PTR_ADD(elem, elem->size);
+ struct malloc_elem *next_elem = elem->next;
const size_t old_elem_size = (uintptr_t)split_pt - (uintptr_t)elem;
const size_t new_elem_size = elem->size - old_elem_size;

malloc_elem_init(split_pt, elem->heap, elem->ms, new_elem_size);
split_pt->prev = elem;
- next_elem->prev = split_pt;
+ split_pt->next = next_elem;
+ if (next_elem)
+ next_elem->prev = split_pt;
+ else
+ elem->heap->last = split_pt;
+ elem->next = split_pt;
elem->size = old_elem_size;
set_trailer(elem);
}

/*
+ * our malloc heap is a doubly linked list, so doubly remove our element.
+ */
+static void __rte_unused
+remove_elem(struct malloc_elem *elem)
+{
+ struct malloc_elem *next, *prev;
+ next = elem->next;
+ prev = elem->prev;
+
+ if (next)
+ next->prev = prev;
+ else
+ elem->heap->last = prev;
+ if (prev)
+ prev->next = next;
+ else
+ elem->heap->first = next;
+
+ elem->prev = NULL;
+ elem->next = NULL;
+}
+
+static int
+next_elem_is_adjacent(struct malloc_elem *elem)
+{
+ return elem->next == RTE_PTR_ADD(elem, elem->size);
+}
+
+static int
+prev_elem_is_adjacent(struct malloc_elem *elem)
+{
+ return elem == RTE_PTR_ADD(elem->prev, elem->prev->size);
+}
+
+/*
* Given an element size, compute its freelist index.
* We free an element into the freelist containing similarly-sized elements.
* We try to allocate elements starting with the freelist containing
@@ -192,6 +274,9 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align,

split_elem(elem, new_free_elem);
malloc_elem_free_list_insert(new_free_elem);
+
+ if (elem == elem->heap->last)
+ elem->heap->last = new_free_elem;
}

if (old_elem_size < MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
@@ -230,9 +315,62 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align,
static inline void
join_elem(struct malloc_elem *elem1, struct malloc_elem *elem2)
{
- struct malloc_elem *next = RTE_PTR_ADD(elem2, elem2->size);
+ struct malloc_elem *next = elem2->next;
elem1->size += elem2->size;
- next->prev = elem1;
+ if (next)
+ next->prev = elem1;
+ else
+ elem1->heap->last = elem1;
+ elem1->next = next;
+}
+
+static struct malloc_elem *
+elem_join_adjacent_free(struct malloc_elem *elem)
+{
+ /*
+ * check if next element exists, is adjacent and is free, if so join
+ * with it, need to remove from free list.
+ */
+ if (elem->next != NULL && elem->next->state == ELEM_FREE &&
+ next_elem_is_adjacent(elem)) {
+ void *erase;
+
+ /* we will want to erase the trailer and header */
+ erase = RTE_PTR_SUB(elem->next, MALLOC_ELEM_TRAILER_LEN);
+
+ /* remove from free list, join to this one */
+ elem_free_list_remove(elem->next);
+ join_elem(elem, elem->next);
+
+ /* erase header and trailer */
+ memset(erase, 0, MALLOC_ELEM_OVERHEAD);
+ }
+
+ /*
+ * check if prev element exists, is adjacent and is free, if so join
+ * with it, need to remove from free list.
+ */
+ if (elem->prev != NULL && elem->prev->state == ELEM_FREE &&
+ prev_elem_is_adjacent(elem)) {
+ struct malloc_elem *new_elem;
+ void *erase;
+
+ /* we will want to erase trailer and header */
+ erase = RTE_PTR_SUB(elem, MALLOC_ELEM_TRAILER_LEN);
+
+ /* remove from free list, join to this one */
+ elem_free_list_remove(elem->prev);
+
+ new_elem = elem->prev;
+ join_elem(new_elem, elem);
+
+ /* erase header and trailer */
+ memset(erase, 0, MALLOC_ELEM_OVERHEAD);
+
+ elem = new_elem;
+ }
+
+ return elem;
}

/*
@@ -243,32 +381,20 @@ join_elem(struct malloc_elem *elem1, struct malloc_elem *elem2)
int
malloc_elem_free(struct malloc_elem *elem)
{
- size_t sz = elem->size - sizeof(*elem) - MALLOC_ELEM_TRAILER_LEN;
- uint8_t *ptr = (uint8_t *)&elem[1];
- struct malloc_elem *next = RTE_PTR_ADD(elem, elem->size);
- if (next->state == ELEM_FREE){
- /* remove from free list, join to this one */
- elem_free_list_remove(next);
- join_elem(elem, next);
- sz += (sizeof(*elem) + MALLOC_ELEM_TRAILER_LEN);
- }
+ void *ptr;
+ size_t data_len;
+
+ ptr = RTE_PTR_ADD(elem, sizeof(*elem));
+ data_len = elem->size - MALLOC_ELEM_OVERHEAD;
+
+ elem = elem_join_adjacent_free(elem);

- /* check if previous element is free, if so join with it and return,
- * need to re-insert in free list, as that element's size is changing
- */
- if (elem->prev != NULL && elem->prev->state == ELEM_FREE) {
- elem_free_list_remove(elem->prev);
- join_elem(elem->prev, elem);
- sz += (sizeof(*elem) + MALLOC_ELEM_TRAILER_LEN);
- ptr -= (sizeof(*elem) + MALLOC_ELEM_TRAILER_LEN);
- elem = elem->prev;
- }
malloc_elem_free_list_insert(elem);

/* decrease heap's count of allocated elements */
elem->heap->alloc_count--;

- memset(ptr, 0, sz);
+ memset(ptr, 0, data_len);

return 0;
}
@@ -281,21 +407,23 @@ int
malloc_elem_resize(struct malloc_elem *elem, size_t size)
{
const size_t new_size = size + elem->pad + MALLOC_ELEM_OVERHEAD;
+
/* if we request a smaller size, then always return ok */
if (elem->size >= new_size)
return 0;

- struct malloc_elem *next = RTE_PTR_ADD(elem, elem->size);
- if (next ->state != ELEM_FREE)
+ /* check if there is a next element, it's free and adjacent */
+ if (!elem->next || elem->next->state != ELEM_FREE ||
+ !next_elem_is_adjacent(elem))
return -1;
- if (elem->size + next->size < new_size)
+ if (elem->size + elem->next->size < new_size)
return -1;

/* we now know the element fits, so remove from free list,
* join the two
*/
- elem_free_list_remove(next);
- join_elem(elem, next);
+ elem_free_list_remove(elem->next);
+ join_elem(elem, elem->next);

if (elem->size - new_size >= MIN_DATA_SIZE + MALLOC_ELEM_OVERHEAD) {
/* now we have a big block together. Lets cut it down a bit, by splitting */
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index f4c1c7a..238e451 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -18,8 +18,12 @@ enum elem_state {

struct malloc_elem {
struct malloc_heap *heap;
- struct malloc_elem *volatile prev; /* points to prev elem in memseg */
- LIST_ENTRY(malloc_elem) free_list; /* list of free elements in heap */
+ struct malloc_elem *volatile prev;
+ /**< points to prev elem in memseg */
+ struct malloc_elem *volatile next;
+ /**< points to next elem in memseg */
+ LIST_ENTRY(malloc_elem) free_list;
+ /**< list of free elements in heap */
const struct rte_memseg *ms;
volatile enum elem_state state;
uint32_t pad;
@@ -110,12 +114,8 @@ malloc_elem_init(struct malloc_elem *elem,
const struct rte_memseg *ms,
size_t size);

-/*
- * initialise a dummy malloc_elem header for the end-of-memseg marker
- */
void
-malloc_elem_mkend(struct malloc_elem *elem,
- struct malloc_elem *prev_free);
+malloc_elem_insert(struct malloc_elem *elem);

/*
* return true if the current malloc_elem can hold a block of data
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 7d8d70a..9c95166 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -70,15 +70,11 @@ check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
static void
malloc_heap_add_memseg(struct malloc_heap *heap, struct rte_memseg *ms)
{
- /* allocate the memory block headers, one at end, one at start */
struct malloc_elem *start_elem = (struct malloc_elem *)ms->addr;
- struct malloc_elem *end_elem = RTE_PTR_ADD(ms->addr,
- ms->len - MALLOC_ELEM_OVERHEAD);
- end_elem = RTE_PTR_ALIGN_FLOOR(end_elem, RTE_CACHE_LINE_SIZE);
- const size_t elem_size = (uintptr_t)end_elem - (uintptr_t)start_elem;
+ const size_t elem_size = ms->len - MALLOC_ELEM_OVERHEAD;

malloc_elem_init(start_elem, heap, ms, elem_size);
- malloc_elem_mkend(end_elem, start_elem);
+ malloc_elem_insert(start_elem);
malloc_elem_free_list_insert(start_elem);

heap->total_size += elem_size;
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:33 UTC
Signed-off-by: Anatoly Burakov <***@intel.com>
---
test/test/commands.c | 3 +++
1 file changed, 3 insertions(+)

diff --git a/test/test/commands.c b/test/test/commands.c
index cf0b726..6bfdc02 100644
--- a/test/test/commands.c
+++ b/test/test/commands.c
@@ -137,6 +137,8 @@ static void cmd_dump_parsed(void *parsed_result,
rte_log_dump(stdout);
else if (!strcmp(res->dump, "dump_malloc_stats"))
rte_malloc_dump_stats(stdout, NULL);
+ else if (!strcmp(res->dump, "dump_malloc_heaps"))
+ rte_malloc_dump_heaps(stdout);
}

cmdline_parse_token_string_t cmd_dump_dump =
@@ -147,6 +149,7 @@ cmdline_parse_token_string_t cmd_dump_dump =
"dump_ring#"
"dump_mempool#"
"dump_malloc_stats#"
+ "dump_malloc_heaps#"
"dump_devargs#"
"dump_log_types");
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:30 UTC
Down the line, we will need to do everything from the heap, as any
alloc or free may trigger allocation or freeing of OS memory, which
would involve growing or shrinking the heap.

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/malloc_elem.c | 16 ++--------------
lib/librte_eal/common/malloc_heap.c | 38 +++++++++++++++++++++++++++++++++++++
lib/librte_eal/common/malloc_heap.h | 6 ++++++
lib/librte_eal/common/rte_malloc.c | 4 ++--
4 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 0cadc8a..ea041e2 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -243,10 +243,6 @@ join_elem(struct malloc_elem *elem1, struct malloc_elem *elem2)
int
malloc_elem_free(struct malloc_elem *elem)
{
- if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
- return -1;
-
- rte_spinlock_lock(&(elem->heap->lock));
size_t sz = elem->size - sizeof(*elem) - MALLOC_ELEM_TRAILER_LEN;
uint8_t *ptr = (uint8_t *)&elem[1];
struct malloc_elem *next = RTE_PTR_ADD(elem, elem->size);
@@ -274,8 +270,6 @@ malloc_elem_free(struct malloc_elem *elem)

memset(ptr, 0, sz);

- rte_spinlock_unlock(&(elem->heap->lock));
-
return 0;
}

@@ -292,11 +286,10 @@ malloc_elem_resize(struct malloc_elem *elem, size_t size)
return 0;

struct malloc_elem *next = RTE_PTR_ADD(elem, elem->size);
- rte_spinlock_lock(&elem->heap->lock);
if (next ->state != ELEM_FREE)
- goto err_return;
+ return -1;
if (elem->size + next->size < new_size)
- goto err_return;
+ return -1;

/* we now know the element fits, so remove from free list,
* join the two
@@ -311,10 +304,5 @@ malloc_elem_resize(struct malloc_elem *elem, size_t size)
split_elem(elem, split_pt);
malloc_elem_free_list_insert(split_pt);
}
- rte_spinlock_unlock(&elem->heap->lock);
return 0;
-
-err_return:
- rte_spinlock_unlock(&elem->heap->lock);
- return -1;
}
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 7aafc88..7d8d70a 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -145,6 +145,44 @@ malloc_heap_alloc(struct malloc_heap *heap,
return elem == NULL ? NULL : (void *)(&elem[1]);
}

+int
+malloc_heap_free(struct malloc_elem *elem)
+{
+ struct malloc_heap *heap;
+ int ret;
+
+ if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
+ return -1;
+
+ /* elem may be merged with previous element, so keep heap address */
+ heap = elem->heap;
+
+ rte_spinlock_lock(&(heap->lock));
+
+ ret = malloc_elem_free(elem);
+
+ rte_spinlock_unlock(&(heap->lock));
+
+ return ret;
+}
+
+int
+malloc_heap_resize(struct malloc_elem *elem, size_t size)
+{
+ int ret;
+
+ if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
+ return -1;
+
+ rte_spinlock_lock(&(elem->heap->lock));
+
+ ret = malloc_elem_resize(elem, size);
+
+ rte_spinlock_unlock(&(elem->heap->lock));
+
+ return ret;
+}
+
/*
* Function to retrieve data for heap on given socket
*/
diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h
index e0defa7..ab0005c 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -28,6 +28,12 @@ malloc_heap_alloc(struct malloc_heap *heap, const char *type, size_t size,
unsigned flags, size_t align, size_t bound);

int
+malloc_heap_free(struct malloc_elem *elem);
+
+int
+malloc_heap_resize(struct malloc_elem *elem, size_t size);
+
+int
malloc_heap_get_stats(struct malloc_heap *heap,
struct rte_malloc_socket_stats *socket_stats);

diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index e0e0d0b..970813e 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -29,7 +29,7 @@
void rte_free(void *addr)
{
if (addr == NULL) return;
- if (malloc_elem_free(malloc_elem_from_data(addr)) < 0)
+ if (malloc_heap_free(malloc_elem_from_data(addr)) < 0)
rte_panic("Fatal error: Invalid memory\n");
}

@@ -140,7 +140,7 @@ rte_realloc(void *ptr, size_t size, unsigned align)
size = RTE_CACHE_LINE_ROUNDUP(size), align = RTE_CACHE_LINE_ROUNDUP(align);
/* check alignment matches first, and if ok, see if we can resize block */
if (RTE_PTR_ALIGN(ptr,align) == ptr &&
- malloc_elem_resize(elem, size) == 0)
+ malloc_heap_resize(elem, size) == 0)
return ptr;

/* either alignment is off, or we have no room to expand,
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:39 UTC
This adds a "--legacy-mem" command-line switch. It will be used to
go back to the old memory behavior, one where we can't dynamically
allocate/free memory (the downside), but one where the user can
get physically contiguous memory, like before (the upside).

For now, nothing but the legacy behavior exists; the non-legacy
memory init sequence will be added later.

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/bsdapp/eal/eal.c | 3 +++
lib/librte_eal/common/eal_common_options.c | 4 ++++
lib/librte_eal/common/eal_internal_cfg.h | 4 ++++
lib/librte_eal/common/eal_options.h | 2 ++
lib/librte_eal/linuxapp/eal/eal.c | 1 +
lib/librte_eal/linuxapp/eal/eal_memory.c | 24 ++++++++++++++++++++----
6 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 4eafcb5..45e5670 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -531,6 +531,9 @@ rte_eal_init(int argc, char **argv)
return -1;
}

+ /* FreeBSD always uses legacy memory model */
+ internal_config.legacy_mem = true;
+
if (eal_plugins_init() < 0) {
rte_eal_init_alert("Cannot init plugins\n");
rte_errno = EINVAL;
diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index dbc3fb5..3e92551 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -74,6 +74,7 @@ eal_long_options[] = {
{OPT_VFIO_INTR, 1, NULL, OPT_VFIO_INTR_NUM },
{OPT_VMWARE_TSC_MAP, 0, NULL, OPT_VMWARE_TSC_MAP_NUM },
{OPT_SINGLE_FILE_SEGMENTS, 0, NULL, OPT_SINGLE_FILE_SEGMENTS_NUM},
+ {OPT_LEGACY_MEM, 0, NULL, OPT_LEGACY_MEM_NUM },
{0, 0, NULL, 0 }
};

@@ -1165,6 +1166,9 @@ eal_parse_common_option(int opt, const char *optarg,
case OPT_SINGLE_FILE_SEGMENTS_NUM:
conf->single_file_segments = 1;
break;
+ case OPT_LEGACY_MEM_NUM:
+ conf->legacy_mem = 1;
+ break;

/* don't know what to do, leave this to caller */
default:
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index d4c02d6..4a43de6 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -51,6 +51,10 @@ struct internal_config {
/**< true if storing all pages within single files (per-page-size,
* per-node).
*/
+ volatile unsigned legacy_mem;
+ /**< true to enable legacy memory behavior (no dynamic allocation,
+ * contiguous segments).
+ */
volatile int syslog_facility; /**< facility passed to openlog() */
/** default interrupt mode for VFIO */
volatile enum rte_intr_mode vfio_intr_mode;
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index a4b80d5..f9a679d 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -57,6 +57,8 @@ enum {
OPT_VMWARE_TSC_MAP_NUM,
#define OPT_SINGLE_FILE_SEGMENTS "single-file-segments"
OPT_SINGLE_FILE_SEGMENTS_NUM,
+#define OPT_LEGACY_MEM "legacy-mem"
+ OPT_LEGACY_MEM_NUM,
OPT_LONG_MAX_NUM
};

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index c84e6bf..5207713 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -349,6 +349,7 @@ eal_usage(const char *prgname)
" --"OPT_CREATE_UIO_DEV" Create /dev/uioX (usually done by hotplug)\n"
" --"OPT_VFIO_INTR" Interrupt mode for VFIO (legacy|msi|msix)\n"
" --"OPT_SINGLE_FILE_SEGMENTS" Put all hugepage memory in single files\n"
+ " --"OPT_LEGACY_MEM" Legacy memory mode (no dynamic allocation, contiguous segments)\n"
"\n");
/* Allow the application to print its usage message too if hook is set */
if ( rte_application_usage_hook ) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5c11d77..b9bcb75 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -919,8 +919,8 @@ huge_recover_sigbus(void)
* 6. unmap the first mapping
* 7. fill memsegs in configuration with contiguous zones
*/
-int
-rte_eal_hugepage_init(void)
+static int
+eal_legacy_hugepage_init(void)
{
struct rte_mem_config *mcfg;
struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
@@ -1262,8 +1262,8 @@ getFileSize(int fd)
* configuration and finds the hugepages which form that segment, mapping them
* in order to form a contiguous block in the virtual memory space
*/
-int
-rte_eal_hugepage_attach(void)
+static int
+eal_legacy_hugepage_attach(void)
{
const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct hugepage_file *hp = NULL;
@@ -1399,6 +1399,22 @@ rte_eal_hugepage_attach(void)
}

int
+rte_eal_hugepage_init(void)
+{
+ if (internal_config.legacy_mem)
+ return eal_legacy_hugepage_init();
+ return -1;
+}
+
+int
+rte_eal_hugepage_attach(void)
+{
+ if (internal_config.legacy_mem)
+ return eal_legacy_hugepage_attach();
+ return -1;
+}
+
+int
rte_eal_using_phys_addrs(void)
{
return phys_addrs_available;
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:40 UTC
For the non-legacy memory init mode, instead of looking at the generic
sysfs path, look at the per-NUMA-node sysfs paths for hugepage counts.
Note that the per-node paths do not provide information about reserved
pages, so we might not get the best info from them, but this saves us
from the whole mapping/remapping business before we're actually able
to tell which page is on which socket, since we no longer require our
memory to be physically contiguous.

Legacy memory init will not use this.
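
For context, a minimal sketch (not part of the patch) of reading one of
the per-node sysfs files this change relies on; node 0 and the 2048 kB
page size are hypothetical examples:

#include <stdio.h>

/* Path layout used by the patch:
 * /sys/devices/system/node/node<N>/hugepages/hugepages-<sz>kB/free_hugepages
 */
static unsigned long
free_hugepages_on_node0_sketch(void)
{
    const char *path = "/sys/devices/system/node/node0/hugepages/"
            "hugepages-2048kB/free_hugepages";
    unsigned long num_pages = 0;
    FILE *f = fopen(path, "r");

    if (f == NULL)
        return 0; /* no such node or page size */
    if (fscanf(f, "%lu", &num_pages) != 1)
        num_pages = 0;
    fclose(f);
    return num_pages;
}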

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/linuxapp/eal/eal_hugepage_info.c | 79 +++++++++++++++++++++++--
1 file changed, 73 insertions(+), 6 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 8bbf771..706b6d5 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -30,6 +30,7 @@
#include "eal_filesystem.h"

static const char sys_dir_path[] = "/sys/kernel/mm/hugepages";
+static const char sys_pages_numa_dir_path[] = "/sys/devices/system/node";

/* this function is only called from eal_hugepage_info_init which itself
* is only called from a primary process */
@@ -70,6 +71,45 @@ get_num_hugepages(const char *subdir)
return num_pages;
}

+static uint32_t
+get_num_hugepages_on_node(const char *subdir, unsigned int socket)
+{
+ char path[PATH_MAX], socketpath[PATH_MAX];
+ DIR *socketdir;
+ unsigned long num_pages = 0;
+ const char *nr_hp_file = "free_hugepages";
+
+ snprintf(socketpath, sizeof(socketpath), "%s/node%u/hugepages",
+ sys_pages_numa_dir_path, socket);
+
+ socketdir = opendir(socketpath);
+ if (socketdir) {
+ /* Keep calm and carry on */
+ closedir(socketdir);
+ } else {
+ /* Can't find socket dir, so ignore it */
+ return 0;
+ }
+
+ snprintf(path, sizeof(path), "%s/%s/%s",
+ socketpath, subdir, nr_hp_file);
+ if (eal_parse_sysfs_value(path, &num_pages) < 0)
+ return 0;
+
+ if (num_pages == 0)
+ RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n",
+ subdir);
+
+ /*
+ * we want to return a uint32_t and more than this looks suspicious
+ * anyway ...
+ */
+ if (num_pages > UINT32_MAX)
+ num_pages = UINT32_MAX;
+
+ return num_pages;
+}
+
static uint64_t
get_default_hp_size(void)
{
@@ -248,7 +288,7 @@ eal_hugepage_info_init(void)
{
const char dirent_start_text[] = "hugepages-";
const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
- unsigned i, num_sizes = 0;
+ unsigned int i, total_pages, num_sizes = 0;
DIR *dir;
struct dirent *dirent;

@@ -302,9 +342,27 @@ eal_hugepage_info_init(void)
if (clear_hugedir(hpi->hugedir) == -1)
break;

- /* for now, put all pages into socket 0,
- * later they will be sorted */
- hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
+ /*
+ * first, try to put all hugepages into relevant sockets, but
+ * if first attempts fails, fall back to collecting all pages
+ * in one socket and sorting them later
+ */
+ total_pages = 0;
+ /* we also don't want to do this for legacy init */
+ if (!internal_config.legacy_mem)
+ for (i = 0; i < rte_num_sockets(); i++) {
+ unsigned int num_pages =
+ get_num_hugepages_on_node(
+ dirent->d_name, i);
+ hpi->num_pages[i] = num_pages;
+ total_pages += num_pages;
+ }
+ /*
+ * we failed to sort memory from the get go, so fall
+ * back to old way
+ */
+ if (total_pages == 0)
+ hpi->num_pages[0] = get_num_hugepages(dirent->d_name);

#ifndef RTE_ARCH_64
/* for 32-bit systems, limit number of hugepages to
@@ -328,10 +386,19 @@ eal_hugepage_info_init(void)
sizeof(internal_config.hugepage_info[0]), compare_hpi);

/* now we have all info, check we have at least one valid size */
- for (i = 0; i < num_sizes; i++)
+ for (i = 0; i < num_sizes; i++) {
+ /* pages may no longer all be on socket 0, so check all */
+ unsigned int j, num_pages = 0;
+
+ for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
+ struct hugepage_info *hpi =
+ &internal_config.hugepage_info[i];
+ num_pages += hpi->num_pages[j];
+ }
if (internal_config.hugepage_info[i].hugedir != NULL &&
- internal_config.hugepage_info[i].num_pages[0] > 0)
+ num_pages > 0)
return 0;
+ }

/* no valid hugepage mounts available, return error */
return -1;
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:35 UTC
Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/malloc_elem.c | 12 ++++++------
lib/librte_eal/common/malloc_elem.h | 3 +++
2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 2291ee1..008f5a3 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -245,8 +245,8 @@ malloc_elem_free_list_insert(struct malloc_elem *elem)
/*
* Remove the specified element from its heap's free list.
*/
-static void
-elem_free_list_remove(struct malloc_elem *elem)
+void
+malloc_elem_free_list_remove(struct malloc_elem *elem)
{
LIST_REMOVE(elem, free_list);
}
@@ -266,7 +266,7 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align,
const size_t trailer_size = elem->size - old_elem_size - size -
MALLOC_ELEM_OVERHEAD;

- elem_free_list_remove(elem);
+ malloc_elem_free_list_remove(elem);

if (trailer_size > MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
/* split it, too much free space after elem */
@@ -340,7 +340,7 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem)
erase = RTE_PTR_SUB(elem->next, MALLOC_ELEM_TRAILER_LEN);

/* remove from free list, join to this one */
- elem_free_list_remove(elem->next);
+ malloc_elem_free_list_remove(elem->next);
join_elem(elem, elem->next);

/* erase header and trailer */
@@ -360,7 +360,7 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem)
erase = RTE_PTR_SUB(elem, MALLOC_ELEM_TRAILER_LEN);

/* remove from free list, join to this one */
- elem_free_list_remove(elem->prev);
+ malloc_elem_free_list_remove(elem->prev);

new_elem = elem->prev;
join_elem(new_elem, elem);
@@ -423,7 +423,7 @@ malloc_elem_resize(struct malloc_elem *elem, size_t size)
/* we now know the element fits, so remove from free list,
* join the two
*/
- elem_free_list_remove(elem->next);
+ malloc_elem_free_list_remove(elem->next);
join_elem(elem, elem->next);

if (elem->size - new_size >= MIN_DATA_SIZE + MALLOC_ELEM_OVERHEAD) {
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 99921d2..46e2383 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -151,6 +151,9 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem);
int
malloc_elem_resize(struct malloc_elem *elem, size_t size);

+void
+malloc_elem_free_list_remove(struct malloc_elem *elem);
+
/*
* dump contents of malloc elem to a file.
*/
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:36 UTC
Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/malloc_elem.c | 4 ++--
lib/librte_eal/common/malloc_elem.h | 2 +-
lib/librte_eal/common/malloc_heap.c | 4 ++--
3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 008f5a3..c18f050 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -379,7 +379,7 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem)
* blocks either immediately before or immediately after newly freed block
* are also free, the blocks are merged together.
*/
-int
+struct malloc_elem *
malloc_elem_free(struct malloc_elem *elem)
{
void *ptr;
@@ -397,7 +397,7 @@ malloc_elem_free(struct malloc_elem *elem)

memset(ptr, 0, data_len);

- return 0;
+ return elem;
}

/*
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 46e2383..9c1614c 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -138,7 +138,7 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size,
* blocks either immediately before or immediately after newly freed block
* are also free, the blocks are merged together.
*/
-int
+struct malloc_elem *
malloc_elem_free(struct malloc_elem *elem);

struct malloc_elem *
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 44538d7..a2c2e4c 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -145,7 +145,7 @@ int
malloc_heap_free(struct malloc_elem *elem)
{
struct malloc_heap *heap;
- int ret;
+ struct malloc_elem *ret;

if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
return -1;
@@ -159,7 +159,7 @@ malloc_heap_free(struct malloc_elem *elem)

rte_spinlock_unlock(&(heap->lock));

- return ret;
+ return ret != NULL ? 0 : -1;
}

int
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:34 UTC
We need this function to join newly allocated segments with the heap.

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/malloc_elem.c | 6 +++---
lib/librte_eal/common/malloc_elem.h | 3 +++
2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index e02ed88..2291ee1 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -325,8 +325,8 @@ join_elem(struct malloc_elem *elem1, struct malloc_elem *elem2)
elem1->next = next;
}

-static struct malloc_elem *
-elem_join_adjacent_free(struct malloc_elem *elem)
+struct malloc_elem *
+malloc_elem_join_adjacent_free(struct malloc_elem *elem)
{
/*
* check if next element exists, is adjacent and is free, if so join
@@ -388,7 +388,7 @@ malloc_elem_free(struct malloc_elem *elem)
ptr = RTE_PTR_ADD(elem, sizeof(*elem));
data_len = elem->size - MALLOC_ELEM_OVERHEAD;

- elem = elem_join_adjacent_free(elem);
+ elem = malloc_elem_join_adjacent_free(elem);

malloc_elem_free_list_insert(elem);

diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 40e8eb5..99921d2 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -141,6 +141,9 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size,
int
malloc_elem_free(struct malloc_elem *elem);

+struct malloc_elem *
+malloc_elem_join_adjacent_free(struct malloc_elem *elem);
+
/*
* attempt to resize a malloc_elem by expanding into any free space
* immediately after it in memory.
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:32 UTC
Malloc heap is now a doubly linked list, so it's now possible to
iterate over each malloc element regardless of its state.
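
A minimal usage sketch of the new API (not part of the patch); the
test/commands.c patch elsewhere in this thread hooks the same call up
to a "dump_malloc_heaps" command:

#include <stdio.h>
#include <rte_malloc.h>

/* Dump every element (free, busy or pad) of each socket's heap. */
static void
dump_heaps_sketch(void)
{
    rte_malloc_dump_heaps(stdout);
}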

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/include/rte_malloc.h | 9 +++++++++
lib/librte_eal/common/malloc_elem.c | 24 ++++++++++++++++++++++++
lib/librte_eal/common/malloc_elem.h | 6 ++++++
lib/librte_eal/common/malloc_heap.c | 22 ++++++++++++++++++++++
lib/librte_eal/common/malloc_heap.h | 3 +++
lib/librte_eal/common/rte_malloc.c | 16 ++++++++++++++++
lib/librte_eal/rte_eal_version.map | 1 +
7 files changed, 81 insertions(+)

diff --git a/lib/librte_eal/common/include/rte_malloc.h b/lib/librte_eal/common/include/rte_malloc.h
index f02a8ba..a3fc83e 100644
--- a/lib/librte_eal/common/include/rte_malloc.h
+++ b/lib/librte_eal/common/include/rte_malloc.h
@@ -278,6 +278,15 @@ void
rte_malloc_dump_stats(FILE *f, const char *type);

/**
+ * Dump contents of all malloc heaps to a file.
+ *
+ * @param f
+ * A pointer to a file for output
+ */
+void
+rte_malloc_dump_heaps(FILE *f);
+
+/**
* Set the maximum amount of allocated memory for this type.
*
* This is not yet implemented
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index eb41200..e02ed88 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2010-2014 Intel Corporation
*/
+#include <inttypes.h>
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
@@ -434,3 +435,26 @@ malloc_elem_resize(struct malloc_elem *elem, size_t size)
}
return 0;
}
+
+static inline const char *
+elem_state_to_str(enum elem_state state)
+{
+ switch (state) {
+ case ELEM_PAD:
+ return "PAD";
+ case ELEM_BUSY:
+ return "BUSY";
+ case ELEM_FREE:
+ return "FREE";
+ }
+ return "ERROR";
+}
+
+void
+malloc_elem_dump(const struct malloc_elem *elem, FILE *f)
+{
+ fprintf(f, "Malloc element at %p (%s)\n", elem,
+ elem_state_to_str(elem->state));
+ fprintf(f, " len: 0x%zx pad: 0x%" PRIx32 "\n", elem->size, elem->pad);
+ fprintf(f, " prev: %p next: %p\n", elem->prev, elem->next);
+}
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 238e451..40e8eb5 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -149,6 +149,12 @@ int
malloc_elem_resize(struct malloc_elem *elem, size_t size);

/*
+ * dump contents of malloc elem to a file.
+ */
+void
+malloc_elem_dump(const struct malloc_elem *elem, FILE *f);
+
+/*
* Given an element size, compute its freelist index.
*/
size_t
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 9c95166..44538d7 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -217,6 +217,28 @@ malloc_heap_get_stats(struct malloc_heap *heap,
return 0;
}

+/*
+ * Function to retrieve data for heap on given socket
+ */
+void
+malloc_heap_dump(struct malloc_heap *heap, FILE *f)
+{
+ struct malloc_elem *elem;
+
+ rte_spinlock_lock(&heap->lock);
+
+ fprintf(f, "Heap size: 0x%zx\n", heap->total_size);
+ fprintf(f, "Heap alloc count: %u\n", heap->alloc_count);
+
+ elem = heap->first;
+ while (elem) {
+ malloc_elem_dump(elem, f);
+ elem = elem->next;
+ }
+
+ rte_spinlock_unlock(&heap->lock);
+}
+
int
rte_eal_malloc_heap_init(void)
{
diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h
index ab0005c..bb28422 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -37,6 +37,9 @@ int
malloc_heap_get_stats(struct malloc_heap *heap,
struct rte_malloc_socket_stats *socket_stats);

+void
+malloc_heap_dump(struct malloc_heap *heap, FILE *f);
+
int
rte_eal_malloc_heap_init(void);

diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index 970813e..80fb6cc 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -182,6 +182,22 @@ rte_malloc_get_socket_stats(int socket,
}

/*
+ * Function to dump contents of all heaps
+ */
+void
+rte_malloc_dump_heaps(FILE *f)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ unsigned int socket;
+
+ for (socket = 0; socket < rte_num_sockets(); socket++) {
+ fprintf(f, "Heap on socket %i:\n", socket);
+ malloc_heap_dump(&mcfg->malloc_heaps[socket], f);
+ }
+
+}
+
+/*
* Print stats on memory type. If type is NULL, info on all types is printed
*/
void
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 52f5940..18b8bf5 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -215,6 +215,7 @@ DPDK_18.05 {
global:

rte_num_sockets;
+ rte_malloc_dump_heaps;

} DPDK_18.02;
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:38 UTC
For now, this option does nothing, but it will be useful in
dynamic memory allocation down the line. Currently, DPDK stores
all pages as separate files in hugetlbfs. This option will allow
storing all pages in one file (one file per socket, per page size).

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/eal_common_options.c | 4 ++++
lib/librte_eal/common/eal_internal_cfg.h | 4 ++++
lib/librte_eal/common/eal_options.h | 2 ++
lib/librte_eal/linuxapp/eal/eal.c | 1 +
4 files changed, 11 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 0be80cb..dbc3fb5 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -73,6 +73,7 @@ eal_long_options[] = {
{OPT_VDEV, 1, NULL, OPT_VDEV_NUM },
{OPT_VFIO_INTR, 1, NULL, OPT_VFIO_INTR_NUM },
{OPT_VMWARE_TSC_MAP, 0, NULL, OPT_VMWARE_TSC_MAP_NUM },
+ {OPT_SINGLE_FILE_SEGMENTS, 0, NULL, OPT_SINGLE_FILE_SEGMENTS_NUM},
{0, 0, NULL, 0 }
};

@@ -1161,6 +1162,9 @@ eal_parse_common_option(int opt, const char *optarg,

core_parsed = LCORE_OPT_MAP;
break;
+ case OPT_SINGLE_FILE_SEGMENTS_NUM:
+ conf->single_file_segments = 1;
+ break;

/* don't know what to do, leave this to caller */
default:
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index a0082d1..d4c02d6 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -47,6 +47,10 @@ struct internal_config {
volatile unsigned force_sockets;
volatile uint64_t socket_mem[RTE_MAX_NUMA_NODES]; /**< amount of memory per socket */
uintptr_t base_virtaddr; /**< base address to try and reserve memory from */
+ volatile unsigned single_file_segments;
+ /**< true if storing all pages within single files (per-page-size,
+ * per-node).
+ */
volatile int syslog_facility; /**< facility passed to openlog() */
/** default interrupt mode for VFIO */
volatile enum rte_intr_mode vfio_intr_mode;
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index e86c711..a4b80d5 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -55,6 +55,8 @@ enum {
OPT_VFIO_INTR_NUM,
#define OPT_VMWARE_TSC_MAP "vmware-tsc-map"
OPT_VMWARE_TSC_MAP_NUM,
+#define OPT_SINGLE_FILE_SEGMENTS "single-file-segments"
+ OPT_SINGLE_FILE_SEGMENTS_NUM,
OPT_LONG_MAX_NUM
};

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 2ecd07b..c84e6bf 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -348,6 +348,7 @@ eal_usage(const char *prgname)
" --"OPT_BASE_VIRTADDR" Base virtual address\n"
" --"OPT_CREATE_UIO_DEV" Create /dev/uioX (usually done by hotplug)\n"
" --"OPT_VFIO_INTR" Interrupt mode for VFIO (legacy|msi|msix)\n"
+ " --"OPT_SINGLE_FILE_SEGMENTS" Put all hugepage memory in single files\n"
"\n");
/* Allow the application to print its usage message too if hook is set */
if ( rte_application_usage_hook ) {
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:46 UTC
The test was expecting memory to already be allocated on all sockets,
and thus was failing because calling rte_malloc could trigger a memory
hotplug event and allocate memory where there was none before.

Fix it to instead report availability of memory on specific sockets by
attempting to allocate a page and seeing if that succeeds. Technically,
this can still cause a failure, as memory might not be available at the
time of the check but become available by the time the test is run, but
that is a corner case not worth considering.

Signed-off-by: Anatoly Burakov <***@intel.com>
---
test/test/test_malloc.c | 52 +++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 44 insertions(+), 8 deletions(-)

diff --git a/test/test/test_malloc.c b/test/test/test_malloc.c
index 8484fb6..2aaf1b8 100644
--- a/test/test/test_malloc.c
+++ b/test/test/test_malloc.c
@@ -22,6 +22,8 @@
#include <rte_random.h>
#include <rte_string_fns.h>

+#include "../../lib/librte_eal/common/eal_memalloc.h"
+
#include "test.h"

#define N 10000
@@ -708,22 +710,56 @@ test_malloc_bad_params(void)

/* Check if memory is avilable on a specific socket */
static int
-is_mem_on_socket(int32_t socket)
+is_mem_on_socket(unsigned int socket)
{
+ struct rte_malloc_socket_stats stats;
const struct rte_mem_config *mcfg =
rte_eal_get_configuration()->mem_config;
- unsigned i;
+ uint64_t prev_pgsz;
+ unsigned int i;
+
+ /* we cannot know if there's memory on a specific socket, since it might
+ * be available, but not yet allocated. so, in addition to checking
+ * already mapped memory, we will attempt to allocate a page from that
+ * socket and see if it works.
+ */
+ if (socket >= rte_num_sockets())
+ return 0;

+ rte_malloc_get_socket_stats(socket, &stats);
+
+ /* if heap has memory allocated, stop */
+ if (stats.heap_totalsz_bytes > 0)
+ return 1;
+
+ /* to allocate a page, we will have to know its size, so go through all
+ * supported page sizes and try with each one.
+ */
+ prev_pgsz = 0;
for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
- const struct rte_memseg_list *msl =
- &mcfg->memsegs[i];
- const struct rte_fbarray *arr = &msl->memseg_arr;
+ const struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ uint64_t page_sz;

- if (msl->socket_id != socket)
+ /* skip unused memseg lists */
+ if (msl->memseg_arr.len == 0)
continue;
+ page_sz = msl->hugepage_sz;

- if (arr->count)
- return 1;
+ /* skip page sizes we've tried already */
+ if (prev_pgsz == page_sz)
+ continue;
+
+ prev_pgsz = page_sz;
+
+ struct rte_memseg *ms = eal_memalloc_alloc_page(page_sz,
+ socket);
+
+ if (ms == NULL)
+ continue;
+
+ eal_memalloc_free_page(ms);
+
+ return 1;
}
return 0;
}
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:37 UTC
rte_fbarray is a simple indexed array stored in shared memory
via mapping files into memory. Rationale for its existence is the
following: since we are going to map memory page-by-page, there
could be quite a lot of memory segments to keep track of (for
smaller page sizes, page count can easily reach thousands). We
can't really make page lists truly dynamic and infinitely expandable,
because that involves reallocating memory (which is a big no-no in
multiprocess). What we can do instead is have a maximum capacity as
something really, really large, and decide at allocation time how
big the array is going to be. We map the entire file into memory,
which makes it possible to use fbarray as shared memory, provided
the structure itself is allocated in shared memory. Per-fbarray
locking is also used to avoid index data races (but not contents
data races - that is up to the user application to synchronize).

In addition, understanding that we will frequently need to scan this
array for free space, and that iterating over the array linearly can
become slow, rte_fbarray provides facilities to index the array's
usage. The following use cases are covered:
- find next free/used slot (useful either for adding new elements
to fbarray, or walking the list)
- find starting index for next N free/used slots (useful for when
we want to allocate chunk of VA-contiguous memory composed of
several pages)
- find how many contiguous free/used slots there are, starting
from specified index (useful for when we want to figure out
how many pages we have until next hole in allocated memory, to
speed up some bulk operations where we would otherwise have to
walk the array and add pages one by one)

This is accomplished by storing a usage mask in-memory, right
after the data section of the array, and using some bit-level
magic to figure out the info we need.
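
As a rough sketch of that bit-level magic (not the actual
implementation), finding the first free slot within a single 64-bit
mask word is just an inversion followed by a count-trailing-zeroes:

#include <stdint.h>

/* Find the first free slot within one 64-bit mask word, or -1 if the
 * word is fully used; a set bit means "slot used". This mirrors the
 * per-word lookup, ignoring the start-offset and last-word fixups.
 */
static int
first_free_in_word(uint64_t used_bits, int word_idx)
{
	uint64_t free_bits = ~used_bits;

	if (free_bits == 0)
		return -1;
	return word_idx * 64 + __builtin_ctzll(free_bits);
}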

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
The initial version of this had resizing capability, however it was
removed due to the fact that in a multiprocess scenario, each
fbarray would have its own view of mapped memory, which might not
correspond with others' due to some other process performing a
resize that the current process didn't know about.

It was therefore decided that, to avoid the cost of synchronization
on each and every operation (to make sure the array wasn't resized),
the resizing feature should be dropped.

lib/librte_eal/bsdapp/eal/Makefile | 1 +
lib/librte_eal/common/Makefile | 2 +-
lib/librte_eal/common/eal_common_fbarray.c | 859 ++++++++++++++++++++++++++++
lib/librte_eal/common/eal_filesystem.h | 13 +
lib/librte_eal/common/include/rte_fbarray.h | 352 ++++++++++++
lib/librte_eal/common/meson.build | 2 +
lib/librte_eal/linuxapp/eal/Makefile | 1 +
lib/librte_eal/rte_eal_version.map | 17 +
8 files changed, 1246 insertions(+), 1 deletion(-)
create mode 100644 lib/librte_eal/common/eal_common_fbarray.c
create mode 100644 lib/librte_eal/common/include/rte_fbarray.h

diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile
index ed1d17b..1b43d77 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -53,6 +53,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_dev.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_options.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_thread.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_proc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_fbarray.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_malloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_elem.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_heap.c
diff --git a/lib/librte_eal/common/Makefile b/lib/librte_eal/common/Makefile
index ea824a3..48f870f 100644
--- a/lib/librte_eal/common/Makefile
+++ b/lib/librte_eal/common/Makefile
@@ -16,7 +16,7 @@ INC += rte_pci_dev_feature_defs.h rte_pci_dev_features.h
INC += rte_malloc.h rte_keepalive.h rte_time.h
INC += rte_service.h rte_service_component.h
INC += rte_bitmap.h rte_vfio.h rte_hypervisor.h rte_test.h
-INC += rte_reciprocal.h
+INC += rte_reciprocal.h rte_fbarray.h

GENERIC_INC := rte_atomic.h rte_byteorder.h rte_cycles.h rte_prefetch.h
GENERIC_INC += rte_spinlock.h rte_memcpy.h rte_cpuflags.h rte_rwlock.h
diff --git a/lib/librte_eal/common/eal_common_fbarray.c b/lib/librte_eal/common/eal_common_fbarray.c
new file mode 100644
index 0000000..76d86c3
--- /dev/null
+++ b/lib/librte_eal/common/eal_common_fbarray.c
@@ -0,0 +1,859 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <stdint.h>
+#include <errno.h>
+#include <sys/file.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_errno.h>
+#include <rte_spinlock.h>
+#include <rte_tailq.h>
+
+#include "eal_filesystem.h"
+#include "eal_private.h"
+
+#include "rte_fbarray.h"
+
+#define MASK_SHIFT 6ULL
+#define MASK_ALIGN (1 << MASK_SHIFT)
+#define MASK_LEN_TO_IDX(x) ((x) >> MASK_SHIFT)
+#define MASK_LEN_TO_MOD(x) ((x) - RTE_ALIGN_FLOOR(x, MASK_ALIGN))
+#define MASK_GET_IDX(idx, mod) ((idx << MASK_SHIFT) + mod)
+
+/*
+ * This is a mask that is always stored at the end of array, to provide fast
+ * way of finding free/used spots without looping through each element.
+ */
+
+struct used_mask {
+ int n_masks;
+ uint64_t data[];
+};
+
+static size_t
+calc_mask_size(int len)
+{
+ /* mask must be multiple of MASK_ALIGN, even though length of array
+ * itself may not be aligned on that boundary.
+ */
+ len = RTE_ALIGN_CEIL(len, MASK_ALIGN);
+ return sizeof(struct used_mask) +
+ sizeof(uint64_t) * MASK_LEN_TO_IDX(len);
+}
+
+static size_t
+calc_data_size(size_t page_sz, int elt_sz, int len)
+{
+ size_t data_sz = elt_sz * len;
+ size_t msk_sz = calc_mask_size(len);
+ return RTE_ALIGN_CEIL(data_sz + msk_sz, page_sz);
+}
+
+static struct used_mask *
+get_used_mask(void *data, int elt_sz, int len)
+{
+ return (struct used_mask *) RTE_PTR_ADD(data, elt_sz * len);
+}
+
+static int
+resize_and_map(int fd, void *addr, size_t len)
+{
+ char path[PATH_MAX];
+ void *map_addr;
+
+ if (ftruncate(fd, len)) {
+ RTE_LOG(ERR, EAL, "Cannot truncate %s\n", path);
+ /* pass errno up the chain */
+ rte_errno = errno;
+ return -1;
+ }
+
+ map_addr = mmap(addr, len, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0);
+ if (map_addr != addr) {
+ RTE_LOG(ERR, EAL, "mmap() failed: %s\n", strerror(errno));
+ /* pass errno up the chain */
+ rte_errno = errno;
+ return -1;
+ }
+ return 0;
+}
+
+static int
+find_next_n(const struct rte_fbarray *arr, int start, int n, bool used)
+{
+ const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz,
+ arr->len);
+ int msk_idx, lookahead_idx, first, first_mod;
+ int last, last_mod, last_msk;
+ uint64_t ignore_msk;
+
+ /*
+ * mask only has granularity of MASK_ALIGN, but start may not be aligned
+ * on that boundary, so construct a special mask to exclude anything we
+ * don't want to see to avoid confusing ctz.
+ */
+ first = MASK_LEN_TO_IDX(start);
+ first_mod = MASK_LEN_TO_MOD(start);
+ ignore_msk = ~((1ULL << first_mod) - 1);
+
+ /* array length may not be aligned, so calculate ignore mask for last
+ * mask index.
+ */
+ last = MASK_LEN_TO_IDX(arr->len);
+ last_mod = MASK_LEN_TO_MOD(arr->len);
+ last_msk = ~(-(1ULL) << last_mod);
+
+ for (msk_idx = first; msk_idx < msk->n_masks; msk_idx++) {
+ uint64_t cur_msk, lookahead_msk;
+ int run_start, clz, left;
+ bool found = false;
+ /*
+ * The process of getting n consecutive bits for arbitrary n is
+ * a bit involved, but here it is in a nutshell:
+ *
+ * 1. let n be the number of consecutive bits we're looking for
+ * 2. check if n can fit in one mask, and if so, do n-1
+ * rshift-ands to see if there is an appropriate run inside
+ * our current mask
+ * 2a. if we found a run, bail out early
+ * 2b. if we didn't find a run, proceed
+ * 3. invert the mask and count leading zeroes (that is, count
+ * how many consecutive set bits we had starting from the
+ * end of current mask) as k
+ * 3a. if k is 0, continue to next mask
+ * 3b. if k is not 0, we have a potential run
+ * 4. to satisfy our requirements, next mask must have n-k
+ * consecutive set bits right at the start, so we will do
+ * (n-k-1) rshift-ands and check if first bit is set.
+ *
+ * Step 4 will need to be repeated if (n-k) > MASK_ALIGN until
+ * we either run out of masks, lose the run, or find what we
+ * were looking for.
+ */
+ cur_msk = msk->data[msk_idx];
+ left = n;
+
+ /* if we're looking for free spaces, invert the mask */
+ if (!used)
+ cur_msk = ~cur_msk;
+
+ /* combine current ignore mask with last index ignore mask */
+ if (msk_idx == last)
+ ignore_msk |= last_msk;
+
+ /* if we have an ignore mask, ignore once */
+ if (ignore_msk) {
+ cur_msk &= ignore_msk;
+ ignore_msk = 0;
+ }
+
+ /* if n can fit in within a single mask, do a search */
+ if (n <= MASK_ALIGN) {
+ uint64_t tmp_msk = cur_msk;
+ int s_idx;
+ for (s_idx = 0; s_idx < n - 1; s_idx++)
+ tmp_msk &= tmp_msk >> 1ULL;
+ /* we found what we were looking for */
+ if (tmp_msk != 0) {
+ run_start = __builtin_ctzll(tmp_msk);
+ return MASK_GET_IDX(msk_idx, run_start);
+ }
+ }
+
+ /*
+ * we didn't find our run within the mask, or n > MASK_ALIGN,
+ * so we're going for plan B.
+ */
+
+ /* count leading zeroes on inverted mask */
+ clz = __builtin_clzll(~cur_msk);
+
+ /* if there aren't any runs at the end either, just continue */
+ if (clz == 0)
+ continue;
+
+ /* we have a partial run at the end, so try looking ahead */
+ run_start = MASK_ALIGN - clz;
+ left -= clz;
+
+ for (lookahead_idx = msk_idx + 1; lookahead_idx < msk->n_masks;
+ lookahead_idx++) {
+ int s_idx, need;
+ lookahead_msk = msk->data[lookahead_idx];
+
+ /* if we're looking for free space, invert the mask */
+ if (!used)
+ lookahead_msk = ~lookahead_msk;
+
+ /* figure out how many consecutive bits we need here */
+ need = RTE_MIN(left, MASK_ALIGN);
+
+ for (s_idx = 0; s_idx < need - 1; s_idx++)
+ lookahead_msk &= lookahead_msk >> 1ULL;
+
+ /* if first bit is not set, we've lost the run */
+ if ((lookahead_msk & 1) == 0) {
+ /*
+ * we've scanned this far, so we know there are
+ * no runs in the space we've lookahead-scanned
+ * as well, so skip that on next iteration.
+ */
+ ignore_msk = ~((1ULL << need) - 1);
+ msk_idx = lookahead_idx;
+ break;
+ }
+
+ left -= need;
+
+ /* check if we've found what we were looking for */
+ if (left == 0) {
+ found = true;
+ break;
+ }
+ }
+
+ /* we didn't find anything, so continue */
+ if (!found)
+ continue;
+
+ return MASK_GET_IDX(msk_idx, run_start);
+ }
+ /* we didn't find anything */
+ rte_errno = used ? -ENOENT : -ENOSPC;
+ return -1;
+}
+
+static int
+find_next(const struct rte_fbarray *arr, int start, bool used)
+{
+ const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz,
+ arr->len);
+ int idx, first, first_mod;
+ int last, last_mod, last_msk;
+ uint64_t ignore_msk;
+
+ /*
+ * mask only has granularity of MASK_ALIGN, but start may not be aligned
+ * on that boundary, so construct a special mask to exclude anything we
+ * don't want to see to avoid confusing ctz.
+ */
+ first = MASK_LEN_TO_IDX(start);
+ first_mod = MASK_LEN_TO_MOD(start);
+ ignore_msk = ~((1ULL << first_mod) - 1ULL);
+
+ /* array length may not be aligned, so calculate ignore mask for last
+ * mask index.
+ */
+ last = MASK_LEN_TO_IDX(arr->len);
+ last_mod = MASK_LEN_TO_MOD(arr->len);
+ last_msk = ~(-(1ULL) << last_mod);
+
+ for (idx = first; idx < msk->n_masks; idx++) {
+ uint64_t cur = msk->data[idx];
+ int found;
+
+ /* if we're looking for free entries, invert mask */
+ if (!used)
+ cur = ~cur;
+
+ if (idx == last)
+ cur &= last_msk;
+
+ /* ignore everything before start on first iteration */
+ if (idx == first)
+ cur &= ignore_msk;
+
+ /* check if we have any entries */
+ if (cur == 0)
+ continue;
+
+ /*
+ * find first set bit - that will correspond to whatever it is
+ * that we're looking for.
+ */
+ found = __builtin_ctzll(cur);
+ return MASK_GET_IDX(idx, found);
+ }
+ /* we didn't find anything */
+ rte_errno = used ? -ENOENT : -ENOSPC;
+ return -1;
+}
+
+static int
+find_contig(const struct rte_fbarray *arr, int start, bool used)
+{
+ const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz,
+ arr->len);
+ int idx, first, first_mod;
+ int last, last_mod, last_msk;
+ int need_len, result = 0;
+
+ /* array length may not be aligned, so calculate ignore mask for last
+ * mask index.
+ */
+ last = MASK_LEN_TO_IDX(arr->len);
+ last_mod = MASK_LEN_TO_MOD(arr->len);
+ last_msk = ~(-(1ULL) << last_mod);
+
+ first = MASK_LEN_TO_IDX(start);
+ first_mod = MASK_LEN_TO_MOD(start);
+ for (idx = first; idx < msk->n_masks; idx++, result += need_len) {
+ uint64_t cur = msk->data[idx];
+ int run_len;
+
+ need_len = MASK_ALIGN;
+
+ /* if we're looking for free entries, invert mask */
+ if (!used)
+ cur = ~cur;
+
+ /* if this is last mask, ignore everything after last bit */
+ if (idx == last)
+ cur &= last_msk;
+
+ /* ignore everything before start on first iteration */
+ if (idx == first) {
+ cur >>= first_mod;
+ /* at the start, we don't need the full mask len */
+ need_len -= first_mod;
+ }
+
+ /* we will be looking for zeroes, so invert the mask */
+ cur = ~cur;
+
+ /* if mask is zero, we have a complete run */
+ if (cur == 0)
+ continue;
+
+ /*
+ * see if current run ends before mask end.
+ */
+ run_len = __builtin_ctzll(cur);
+
+ /* add however many zeroes we've had in the last run and quit */
+ if (run_len < need_len) {
+ result += run_len;
+ break;
+ }
+ }
+ return result;
+}
+
+static int
+set_used(struct rte_fbarray *arr, int idx, bool used)
+{
+ struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, arr->len);
+ uint64_t msk_bit = 1ULL << MASK_LEN_TO_MOD(idx);
+ int msk_idx = MASK_LEN_TO_IDX(idx);
+ bool already_used;
+ int ret = -1;
+
+ if (arr == NULL || idx < 0 || idx >= arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ ret = 0;
+
+ /* prevent array from changing under us */
+ rte_rwlock_write_lock(&arr->rwlock);
+
+ already_used = (msk->data[msk_idx] & msk_bit) != 0;
+
+ /* nothing to be done */
+ if (used == already_used)
+ goto out;
+
+ if (used) {
+ msk->data[msk_idx] |= msk_bit;
+ arr->count++;
+ } else {
+ msk->data[msk_idx] &= ~msk_bit;
+ arr->count--;
+ }
+out:
+ rte_rwlock_write_unlock(&arr->rwlock);
+
+ return ret;
+}
+
+static int
+fully_validate(const char *name, unsigned int elt_sz, unsigned int len)
+{
+ if (name == NULL || elt_sz == 0 || len == 0) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ if (strnlen(name, RTE_FBARRAY_NAME_LEN) == RTE_FBARRAY_NAME_LEN) {
+ rte_errno = ENAMETOOLONG;
+ return -1;
+ }
+ return 0;
+}
+
+int
+rte_fbarray_init(struct rte_fbarray *arr, const char *name, int len, int elt_sz)
+{
+ size_t mmap_len, page_sz;
+ char path[PATH_MAX];
+ struct used_mask *msk;
+ void *data = NULL;
+ int fd = -1;
+
+ if (arr == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ if (fully_validate(name, elt_sz, len))
+ return -1;
+
+ page_sz = sysconf(_SC_PAGESIZE);
+
+ /* calculate our memory limits */
+ mmap_len = calc_data_size(page_sz, elt_sz, len);
+
+ data = eal_get_virtual_area(NULL, &mmap_len, page_sz, 0, 0);
+ if (data == NULL)
+ goto fail;
+
+ eal_get_fbarray_path(path, sizeof(path), name);
+
+ /*
+ * Each fbarray is unique to process namespace, i.e. the filename
+ * depends on process prefix. Try to take out a lock and see if we
+ * succeed. If we don't, someone else is using it already.
+ */
+ fd = open(path, O_CREAT | O_RDWR);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): couldn't open %s: %s\n", __func__,
+ path, strerror(errno));
+ rte_errno = errno;
+ goto fail;
+ } else if (flock(fd, LOCK_EX | LOCK_NB)) {
+ RTE_LOG(DEBUG, EAL, "%s(): couldn't lock %s: %s\n", __func__,
+ path, strerror(errno));
+ rte_errno = EBUSY;
+ goto fail;
+ }
+
+ /* take out a non-exclusive lock, so that other processes could still
+ * attach to it, but no other process could reinitialize it.
+ */
+ if (flock(fd, LOCK_SH | LOCK_NB)) {
+ rte_errno = errno;
+ goto fail;
+ }
+
+ if (resize_and_map(fd, data, mmap_len))
+ goto fail;
+
+ /* we've mmap'ed the file, we can now close the fd */
+ close(fd);
+
+ /* initialize the data */
+ memset(data, 0, mmap_len);
+
+ /* populate data structure */
+ snprintf(arr->name, sizeof(arr->name), "%s", name);
+ arr->data = data;
+ arr->len = len;
+ arr->elt_sz = elt_sz;
+ arr->count = 0;
+
+ msk = get_used_mask(data, elt_sz, len);
+ msk->n_masks = MASK_LEN_TO_IDX(len);
+
+ rte_rwlock_init(&arr->rwlock);
+
+ return 0;
+fail:
+ if (data)
+ munmap(data, mmap_len);
+ if (fd >= 0)
+ close(fd);
+ return -1;
+}
+
+int
+rte_fbarray_attach(struct rte_fbarray *arr)
+{
+ uint64_t mmap_len, page_sz;
+ char path[PATH_MAX];
+ void *data = NULL;
+ int fd = -1;
+
+ if (arr == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /*
+ * we don't need to synchronize attach as two values we need (element
+ * size and array length) are constant for the duration of life of
+ * the array, so the parts we care about will not race.
+ */
+
+ if (fully_validate(arr->name, arr->elt_sz, arr->len))
+ return -1;
+
+ page_sz = sysconf(_SC_PAGESIZE);
+
+ mmap_len = calc_data_size(page_sz, arr->elt_sz, arr->len);
+
+ data = eal_get_virtual_area(arr->data, &mmap_len, page_sz, 0, 0);
+ if (data == NULL)
+ goto fail;
+
+ eal_get_fbarray_path(path, sizeof(path), arr->name);
+
+ fd = open(path, O_RDWR);
+ if (fd < 0) {
+ rte_errno = errno;
+ goto fail;
+ }
+
+ /* lock the file, to let others know we're using it */
+ if (flock(fd, LOCK_SH | LOCK_NB)) {
+ rte_errno = errno;
+ goto fail;
+ }
+
+ if (resize_and_map(fd, data, mmap_len))
+ goto fail;
+
+ close(fd);
+
+ /* we're done */
+
+ return 0;
+fail:
+ if (data)
+ munmap(data, mmap_len);
+ if (fd >= 0)
+ close(fd);
+ return -1;
+}
+
+int
+rte_fbarray_detach(struct rte_fbarray *arr)
+{
+ if (arr == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /*
+ * we don't need to synchronize detach as two values we need (element
+ * size and total capacity) are constant for the duration of life of
+ * the array, so the parts we care about will not race. if the user is
+ * detaching while doing something else in the same process, we can't
+ * really do anything about it, things will blow up either way.
+ */
+
+ size_t page_sz = sysconf(_SC_PAGESIZE);
+
+ /* this may already be unmapped (e.g. repeated call from previously
+ * failed destroy(), but this is on user, we can't (easily) know if this
+ * is still mapped.
+ */
+ munmap(arr->data, calc_data_size(page_sz, arr->elt_sz, arr->len));
+
+ return 0;
+}
+
+int
+rte_fbarray_destroy(struct rte_fbarray *arr)
+{
+ int fd, ret;
+ char path[PATH_MAX];
+
+ ret = rte_fbarray_detach(arr);
+ if (ret)
+ return ret;
+
+ /* try deleting the file */
+ eal_get_fbarray_path(path, sizeof(path), arr->name);
+
+ fd = open(path, O_RDONLY);
+ if (flock(fd, LOCK_EX | LOCK_NB)) {
+ RTE_LOG(DEBUG, EAL, "Cannot destroy fbarray - another process is using it\n");
+ rte_errno = EBUSY;
+ ret = -1;
+ } else {
+ ret = 0;
+ unlink(path);
+ memset(arr, 0, sizeof(*arr));
+ }
+ close(fd);
+
+ return ret;
+}
+
+void *
+rte_fbarray_get(const struct rte_fbarray *arr, int idx)
+{
+ void *ret = NULL;
+ if (arr == NULL || idx < 0) {
+ rte_errno = EINVAL;
+ return NULL;
+ }
+
+ if (idx >= arr->len) {
+ rte_errno = EINVAL;
+ return NULL;
+ }
+
+ ret = RTE_PTR_ADD(arr->data, idx * arr->elt_sz);
+
+ return ret;
+}
+
+int
+rte_fbarray_set_used(struct rte_fbarray *arr, int idx)
+{
+ return set_used(arr, idx, true);
+}
+
+int
+rte_fbarray_set_free(struct rte_fbarray *arr, int idx)
+{
+ return set_used(arr, idx, false);
+}
+
+int
+rte_fbarray_is_used(struct rte_fbarray *arr, int idx)
+{
+ struct used_mask *msk;
+ int msk_idx;
+ uint64_t msk_bit;
+ int ret = -1;
+
+ if (arr == NULL || idx < 0 || idx >= arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ msk = get_used_mask(arr->data, arr->elt_sz, arr->len);
+ msk_idx = MASK_LEN_TO_IDX(idx);
+ msk_bit = 1ULL << MASK_LEN_TO_MOD(idx);
+
+ ret = (msk->data[msk_idx] & msk_bit) != 0;
+
+ rte_rwlock_read_unlock(&arr->rwlock);
+
+ return ret;
+}
+
+int
+rte_fbarray_find_next_free(struct rte_fbarray *arr, int start)
+{
+ int ret = -1;
+
+ if (arr == NULL || start < 0 || start >= arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ if (arr->len == arr->count) {
+ rte_errno = ENOSPC;
+ goto out;
+ }
+
+ ret = find_next(arr, start, false);
+out:
+ rte_rwlock_read_unlock(&arr->rwlock);
+ return ret;
+}
+
+int
+rte_fbarray_find_next_used(struct rte_fbarray *arr, int start)
+{
+ int ret = -1;
+
+ if (arr == NULL || start < 0 || start >= arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ if (arr->count == 0) {
+ rte_errno = ENOENT;
+ goto out;
+ }
+
+ ret = find_next(arr, start, true);
+out:
+ rte_rwlock_read_unlock(&arr->rwlock);
+ return ret;
+}
+
+int
+rte_fbarray_find_next_n_free(struct rte_fbarray *arr, int start, int n)
+{
+ int ret = -1;
+
+ if (arr == NULL || start < 0 || start >= arr->len ||
+ n < 0 || n > arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ if (arr->len == arr->count || arr->len - arr->count < n) {
+ rte_errno = ENOSPC;
+ goto out;
+ }
+
+ ret = find_next_n(arr, start, n, false);
+out:
+ rte_rwlock_read_unlock(&arr->rwlock);
+ return ret;
+}
+
+int
+rte_fbarray_find_next_n_used(struct rte_fbarray *arr, int start, int n)
+{
+ int ret = -1;
+
+ if (arr == NULL || start < 0 || start >= arr->len ||
+ n < 0 || n > arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ if (arr->count < n) {
+ rte_errno = ENOENT;
+ goto out;
+ }
+
+ ret = find_next_n(arr, start, n, true);
+out:
+ rte_rwlock_read_unlock(&arr->rwlock);
+ return ret;
+}
+
+int
+rte_fbarray_find_contig_free(struct rte_fbarray *arr, int start)
+{
+ int ret = -1;
+
+ if (arr == NULL || start < 0 || start >= arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ if (arr->len == arr->count) {
+ rte_errno = ENOSPC;
+ goto out;
+ }
+
+ if (arr->count == 0) {
+ ret = arr->len - start;
+ goto out;
+ }
+
+ ret = find_contig(arr, start, false);
+out:
+ rte_rwlock_read_unlock(&arr->rwlock);
+ return ret;
+}
+
+int
+rte_fbarray_find_contig_used(struct rte_fbarray *arr, int start)
+{
+ int ret = -1;
+
+ if (arr == NULL || start < 0 || start >= arr->len) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ ret = find_contig(arr, start, true);
+
+ rte_rwlock_read_unlock(&arr->rwlock);
+ return ret;
+}
+
+int
+rte_fbarray_find_idx(const struct rte_fbarray *arr, const void *elt)
+{
+ void *end;
+ int ret = -1;
+
+ /*
+ * no need to synchronize as it doesn't matter if underlying data
+ * changes - we're doing pointer arithmetic here.
+ */
+
+ if (arr == NULL || elt == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ end = RTE_PTR_ADD(arr->data, arr->elt_sz * arr->len);
+ if (elt < arr->data || elt >= end) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ ret = RTE_PTR_DIFF(elt, arr->data) / arr->elt_sz;
+
+ return ret;
+}
+
+void
+rte_fbarray_dump_metadata(struct rte_fbarray *arr, FILE *f)
+{
+ struct used_mask *msk;
+ int i;
+
+ if (arr == NULL || f == NULL) {
+ rte_errno = EINVAL;
+ return;
+ }
+
+ if (fully_validate(arr->name, arr->elt_sz, arr->len)) {
+ fprintf(f, "Invalid file-backed array\n");
+ goto out;
+ }
+
+ /* prevent array from changing under us */
+ rte_rwlock_read_lock(&arr->rwlock);
+
+ fprintf(f, "File-backed array: %s\n", arr->name);
+ fprintf(f, "size: %i occupied: %i elt_sz: %i\n",
+ arr->len, arr->count, arr->elt_sz);
+
+ msk = get_used_mask(arr->data, arr->elt_sz, arr->len);
+
+ for (i = 0; i < msk->n_masks; i++)
+ fprintf(f, "msk idx %i: 0x%016" PRIx64 "\n", i, msk->data[i]);
+out:
+ rte_rwlock_read_unlock(&arr->rwlock);
+}
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index 4708dd5..1c6048b 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -13,6 +13,7 @@

/** Path of rte config file. */
#define RUNTIME_CONFIG_FMT "%s/.%s_config"
+#define FBARRAY_FMT "%s/%s_%s"

#include <stdint.h>
#include <limits.h>
@@ -55,6 +56,18 @@ eal_mp_socket_path(void)
return buffer;
}

+static inline const char *
+eal_get_fbarray_path(char *buffer, size_t buflen, const char *name) {
+ const char *directory = "/tmp";
+ const char *home_dir = getenv("HOME");
+
+ if (getuid() != 0 && home_dir != NULL)
+ directory = home_dir;
+ snprintf(buffer, buflen - 1, FBARRAY_FMT, directory,
+ internal_config.hugefile_prefix, name);
+ return buffer;
+}
+
/** Path of hugepage info file. */
#define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info"

diff --git a/lib/librte_eal/common/include/rte_fbarray.h b/lib/librte_eal/common/include/rte_fbarray.h
new file mode 100644
index 0000000..4e1d207
--- /dev/null
+++ b/lib/librte_eal/common/include/rte_fbarray.h
@@ -0,0 +1,352 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#ifndef RTE_FBARRAY_H
+#define RTE_FBARRAY_H
+
+/**
+ * @file
+ *
+ * File-backed shared indexed array for DPDK.
+ *
+ * Basic workflow is expected to be the following:
+ * 1) Allocate array either using ``rte_fbarray_init()`` or
+ * ``rte_fbarray_attach()`` (depending on whether it's shared between
+ * multiple DPDK processes)
+ * 2) find free spots using ``rte_fbarray_find_next_free()``
+ * 3) get pointer to data in the free spot using ``rte_fbarray_get()``, and
+ * copy data into the pointer (element size is fixed)
+ * 4) mark entry as used using ``rte_fbarray_set_used()``
+ *
+ * Calls to ``rte_fbarray_init()`` and ``rte_fbarray_destroy()`` will have
+ * consequences for all processes, while calls to ``rte_fbarray_attach()`` and
+ * ``rte_fbarray_detach()`` will only have consequences within a single process.
+ * Therefore, it is safe to call ``rte_fbarray_attach()`` or
+ * ``rte_fbarray_detach()`` while another process is using ``rte_fbarray``,
+ * provided no other thread within the same process will try to use
+ * ``rte_fbarray`` before attaching or after detaching. It is not safe to call
+ * ``rte_fbarray_init()`` or ``rte_fbarray_destroy()`` while another thread or
+ * another process is using ``rte_fbarray``.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#include <rte_rwlock.h>
+
+#define RTE_FBARRAY_NAME_LEN 64
+
+struct rte_fbarray {
+ char name[RTE_FBARRAY_NAME_LEN]; /**< name associated with an array */
+ int count; /**< number of entries stored */
+ int len; /**< current length of the array */
+ int elt_sz; /**< size of each element */
+ void *data; /**< data pointer */
+ rte_rwlock_t rwlock; /**< multiprocess lock */
+};
+
+/**
+ * Set up ``rte_fbarray`` structure and allocate underlying resources.
+ *
+ * Call this function to correctly set up ``rte_fbarray`` and allocate
+ * underlying files that will be backing the data in the current process. Note
+ * that in order to use and share ``rte_fbarray`` between multiple processes,
+ * data pointed to by ``arr`` pointer must itself be allocated in shared memory.
+ *
+ * @param arr
+ * Valid pointer to allocated ``rte_fbarray`` structure.
+ *
+ * @param name
+ * Unique name to be assigned to this array.
+ *
+ * @param len
+ * Number of elements initially available in the array.
+ *
+ * @param elt_sz
+ * Size of each element.
+ *
+ * @return
+ * - 0 on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_init(struct rte_fbarray *arr, const char *name, int len,
+ int elt_sz);
+
+
+/**
+ * Attach to a file backing an already allocated and correctly set up
+ * ``rte_fbarray`` structure.
+ *
+ * Call this function to attach to file that will be backing the data in the
+ * current process. The structure must have been previously correctly set up
+ * with a call to ``rte_fbarray_init()``. Calls to ``rte_fbarray_attach()`` are
+ * usually meant to be performed in a multiprocessing scenario, with data
+ * pointed to by ``arr`` pointer allocated in shared memory.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up rte_fbarray structure.
+ *
+ * @return
+ * - 0 on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_attach(struct rte_fbarray *arr);
+
+
+/**
+ * Deallocate resources for an already allocated and correctly set up
+ * ``rte_fbarray`` structure, and remove the underlying file.
+ *
+ * Call this function to deallocate all resources associated with an
+ * ``rte_fbarray`` structure within the current process. This will also
+ * zero-fill data pointed to by ``arr`` pointer and remove the underlying file
+ * backing the data, so it is expected that by the time this function is called,
+ * all other processes have detached from this ``rte_fbarray``.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @return
+ * - 0 on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_destroy(struct rte_fbarray *arr);
+
+
+/**
+ * Deallocate resources for an already allocated and correctly set up
+ * ``rte_fbarray`` structure.
+ *
+ * Call this function to deallocate all resources associated with an
+ * ``rte_fbarray`` structure within current process.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @return
+ * - 0 on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_detach(struct rte_fbarray *arr);
+
+
+/**
+ * Get pointer to element residing at specified index.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param idx
+ * Index of an element to get a pointer to.
+ *
+ * @return
+ * - non-NULL pointer on success.
+ * - NULL on failure, with ``rte_errno`` indicating reason for failure.
+ */
+void *
+rte_fbarray_get(const struct rte_fbarray *arr, int idx);
+
+
+/**
+ * Find index of a specified element within the array.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param elt
+ * Pointer to element to find index to.
+ *
+ * @return
+ * - non-negative integer on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_find_idx(const struct rte_fbarray *arr, const void *elt);
+
+
+/**
+ * Mark specified element as used.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param idx
+ * Element index to mark as used.
+ *
+ * @return
+ * - 0 on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_set_used(struct rte_fbarray *arr, int idx);
+
+
+/**
+ * Mark specified element as free.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param idx
+ * Element index to mark as free.
+ *
+ * @return
+ * - 0 on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_set_free(struct rte_fbarray *arr, int idx);
+
+
+/**
+ * Check whether element at specified index is marked as used.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param idx
+ * Element index to check as used.
+ *
+ * @return
+ * - 1 if element is used.
+ * - 0 if element is unused.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_is_used(struct rte_fbarray *arr, int idx);
+
+
+/**
+ * Find index of next free element, starting at specified index.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param start
+ * Element index to start search from.
+ *
+ * @return
+ * - non-negative integer on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_find_next_free(struct rte_fbarray *arr, int start);
+
+
+/**
+ * Find index of next used element, starting at specified index.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param start
+ * Element index to start search from.
+ *
+ * @return
+ * - non-negative integer on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_find_next_used(struct rte_fbarray *arr, int start);
+
+
+/**
+ * Find index of next chunk of ``n`` free elements, starting at specified index.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param start
+ * Element index to start search from.
+ *
+ * @param n
+ * Number of free elements to look for.
+ *
+ * @return
+ * - non-negative integer on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_find_next_n_free(struct rte_fbarray *arr, int start, int n);
+
+
+/**
+ * Find index of next chunk of ``n`` used elements, starting at specified index.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param start
+ * Element index to start search from.
+ *
+ * @param n
+ * Number of used elements to look for.
+ *
+ * @return
+ * - non-negative integer on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_find_next_n_used(struct rte_fbarray *arr, int start, int n);
+
+
+/**
+ * Find how many more free entries there are, starting at specified index.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param start
+ * Element index to start search from.
+ *
+ * @return
+ * - non-negative integer on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_find_contig_free(struct rte_fbarray *arr, int start);
+
+
+/**
+ * Find how many more used entries there are, starting at specified index.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param start
+ * Element index to start search from.
+ *
+ * @return
+ * - non-negative integer on success.
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure.
+ */
+int
+rte_fbarray_find_contig_used(struct rte_fbarray *arr, int start);
+
+
+/**
+ * Dump ``rte_fbarray`` metadata.
+ *
+ * @param arr
+ * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure.
+ *
+ * @param f
+ * File object to dump information into.
+ */
+void
+rte_fbarray_dump_metadata(struct rte_fbarray *arr, FILE *f);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // RTE_FBARRAY_H
diff --git a/lib/librte_eal/common/meson.build b/lib/librte_eal/common/meson.build
index 82b8910..7d02191 100644
--- a/lib/librte_eal/common/meson.build
+++ b/lib/librte_eal/common/meson.build
@@ -11,6 +11,7 @@ common_sources = files(
'eal_common_devargs.c',
'eal_common_dev.c',
'eal_common_errno.c',
+ 'eal_common_fbarray.c',
'eal_common_hexdump.c',
'eal_common_launch.c',
'eal_common_lcore.c',
@@ -51,6 +52,7 @@ common_headers = files(
'include/rte_eal_memconfig.h',
'include/rte_eal_interrupts.h',
'include/rte_errno.h',
+ 'include/rte_fbarray.h',
'include/rte_hexdump.h',
'include/rte_interrupts.h',
'include/rte_keepalive.h',
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index b9c7727..c407a43 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -61,6 +61,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_dev.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_options.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_thread.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_proc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_fbarray.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 18b8bf5..a938a2f 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -216,6 +216,23 @@ DPDK_18.05 {

rte_num_sockets;
rte_malloc_dump_heaps;
+ rte_fbarray_init;
+ rte_fbarray_destroy;
+ rte_fbarray_attach;
+ rte_fbarray_detach;
+ rte_fbarray_resize;
+ rte_fbarray_get;
+ rte_fbarray_find_idx;
+ rte_fbarray_set_free;
+ rte_fbarray_set_used;
+ rte_fbarray_is_used;
+ rte_fbarray_find_next_free;
+ rte_fbarray_find_next_used;
+ rte_fbarray_find_next_n_free;
+ rte_fbarray_find_next_n_used;
+ rte_fbarray_find_contig_free;
+ rte_fbarray_find_contig_used;
+ rte_fbarray_dump_metadata;

} DPDK_18.02;
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:44 UTC
Permalink
Add a new (non-legacy) memory init path for EAL. It uses the
new memory hotplug facilities, although it's only being run
at startup.

If no -m or --socket-mem switches were specified, the new init
will not allocate anything, whereas if those switches were passed,
appropriate amounts of pages will be requested, just like for
legacy init.

Since rte_malloc support for dynamic allocation comes in later
patches, running DPDK without --socket-mem or -m switches will
fail in this patch.

Also, allocated pages will be physically discontiguous (or rather,
they're not guaranteed to be physically contiguous - they may still
be, by accident) unless IOVA_AS_VA mode is used.

Since the memory hotplug subsystem relies on partial file locking,
replace flock() locks with fcntl() locks.
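
For reference, a minimal sketch of the fcntl()-style locking used here;
unlike flock(), it can lock just a byte range of a file, which is what
per-page locking needs (the helper name is illustrative only):

#include <fcntl.h>

/* Take a shared, non-blocking lock on the first page_sz bytes of fd.
 * Returns 0 on success, -1 if another process holds a conflicting lock.
 */
static int
lock_first_page(int fd, off_t page_sz)
{
	struct flock lck = {0};

	lck.l_type = F_RDLCK;	/* shared (read) lock */
	lck.l_whence = SEEK_SET;
	lck.l_start = 0;	/* lock only this region... */
	lck.l_len = page_sz;	/* ...not the whole file */

	return fcntl(fd, F_SETLK, &lck);	/* F_SETLK never blocks */
}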

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
This commit shows "the world as it could have been". All of this other
monstrous amount of code in eal_memory.c is there because of the legacy
init option. Do we *really* want to keep it around, and make DPDK
init and the memory system suffer from a split personality?

lib/librte_eal/linuxapp/eal/eal_hugepage_info.c | 25 ++++++++-
lib/librte_eal/linuxapp/eal/eal_memory.c | 74 +++++++++++++++++++++++--
2 files changed, 92 insertions(+), 7 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 706b6d5..7e2475f 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -15,6 +15,7 @@
#include <unistd.h>
#include <errno.h>
#include <sys/queue.h>
+#include <sys/stat.h>

#include <rte_memory.h>
#include <rte_eal.h>
@@ -200,6 +201,18 @@ get_hugepage_dir(uint64_t hugepage_sz)
}

/*
+ * uses fstat to report the size of a file on disk
+ */
+static off_t
+getFileSize(int fd)
+{
+ struct stat st;
+ if (fstat(fd, &st) < 0)
+ return 0;
+ return st.st_size;
+}
+
+/*
* Clear the hugepage directory of whatever hugepage files
* there are. Checks if the file is locked (i.e.
* if it's in use by another DPDK process).
@@ -229,6 +242,8 @@ clear_hugedir(const char * hugedir)
}

while(dirent != NULL){
+ struct flock lck = {0};
+
/* skip files that don't match the hugepage pattern */
if (fnmatch(filter, dirent->d_name, 0) > 0) {
dirent = readdir(dir);
@@ -245,11 +260,17 @@ clear_hugedir(const char * hugedir)
}

/* non-blocking lock */
- lck_result = flock(fd, LOCK_EX | LOCK_NB);
+ lck.l_type = F_RDLCK;
+ lck.l_whence = SEEK_SET;
+ lck.l_start = 0;
+ lck.l_len = getFileSize(fd);
+
+ lck_result = fcntl(fd, F_SETLK, &lck);

/* if lock succeeds, unlock and remove the file */
if (lck_result != -1) {
- flock(fd, LOCK_UN);
+ lck.l_type = F_UNLCK;
+ fcntl(fd, F_SETLK, &lck);
unlinkat(dir_fd, dirent->d_name, 0);
}
close (fd);
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 9512da9..e0b4988 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -40,6 +40,7 @@
#include <rte_string_fns.h>

#include "eal_private.h"
+#include "eal_memalloc.h"
#include "eal_internal_cfg.h"
#include "eal_filesystem.h"
#include "eal_hugepages.h"
@@ -260,6 +261,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
void *virtaddr;
void *vma_addr = NULL;
size_t vma_len = 0;
+ struct flock lck = {0};
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
int node_id = -1;
int essential_prev = 0;
@@ -434,8 +436,12 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
}


- /* set shared flock on the file. */
- if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
+ /* set shared lock on the file. */
+ lck.l_type = F_RDLCK;
+ lck.l_whence = SEEK_SET;
+ lck.l_start = 0;
+ lck.l_len = hugepage_sz;
+ if (fcntl(fd, F_SETLK, &lck) == -1) {
RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
__func__, strerror(errno));
close(fd);
@@ -1300,6 +1306,62 @@ eal_legacy_hugepage_init(void)
return -1;
}

+static int
+eal_hugepage_init(void)
+{
+ struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
+ uint64_t memory[RTE_MAX_NUMA_NODES];
+ int hp_sz_idx, socket_id;
+
+ test_phys_addrs_available();
+
+ memset(used_hp, 0, sizeof(used_hp));
+
+ for (hp_sz_idx = 0;
+ hp_sz_idx < (int) internal_config.num_hugepage_sizes;
+ hp_sz_idx++) {
+ /* also initialize used_hp hugepage sizes in used_hp */
+ struct hugepage_info *hpi;
+ hpi = &internal_config.hugepage_info[hp_sz_idx];
+ used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz;
+ }
+
+ /* make a copy of socket_mem, needed for balanced allocation. */
+ for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++)
+ memory[hp_sz_idx] = internal_config.socket_mem[hp_sz_idx];
+
+ /* calculate final number of pages */
+ if (calc_num_pages_per_socket(memory,
+ internal_config.hugepage_info, used_hp,
+ internal_config.num_hugepage_sizes) < 0)
+ return -1;
+
+ for (hp_sz_idx = 0;
+ hp_sz_idx < (int) internal_config.num_hugepage_sizes;
+ hp_sz_idx++) {
+ for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES;
+ socket_id++) {
+ struct hugepage_info *hpi = &used_hp[hp_sz_idx];
+ unsigned int num_pages = hpi->num_pages[socket_id];
+ int num_pages_alloc;
+
+ if (num_pages == 0)
+ continue;
+
+ RTE_LOG(DEBUG, EAL, "Allocating %u pages of size %luM on socket %i\n",
+ num_pages, hpi->hugepage_sz >> 20, socket_id);
+
+ num_pages_alloc = eal_memalloc_alloc_page_bulk(NULL,
+ num_pages,
+ hpi->hugepage_sz, socket_id,
+ true);
+ if (num_pages_alloc < 0)
+ return -1;
+ }
+ }
+ return 0;
+}
+
/*
* uses fstat to report the size of a file on disk
*/
@@ -1510,9 +1572,9 @@ eal_legacy_hugepage_attach(void)
int
rte_eal_hugepage_init(void)
{
- if (internal_config.legacy_mem)
- return eal_legacy_hugepage_init();
- return -1;
+ return internal_config.legacy_mem ?
+ eal_legacy_hugepage_init() :
+ eal_hugepage_init();
}

int
@@ -1520,6 +1582,8 @@ rte_eal_hugepage_attach(void)
{
if (internal_config.legacy_mem)
return eal_legacy_hugepage_attach();
+ else
+ RTE_LOG(ERR, EAL, "Secondary processes aren't supported yet\n");
return -1;
}
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:48 UTC
Permalink
No major changes - just add contiguity checks in a few key places,
and a new 'contig' parameter to pass around.
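
In practice, the new parameter is simply threaded down from the memzone
and heap entry points. A sketch of an internal call that asks for
IOVA-contiguous memory (size and alignment picked for illustration):

/* request 2M of cache-line-aligned, physically contiguous memory from
 * any socket; the trailing 'true' is the new contig flag
 */
void *p = malloc_heap_alloc(NULL, 2 * 1024 * 1024, SOCKET_ID_ANY,
		0, RTE_CACHE_LINE_SIZE, 0, true);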

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/eal_common_memzone.c | 20 +++---
lib/librte_eal/common/malloc_elem.c | 101 ++++++++++++++++++++++-------
lib/librte_eal/common/malloc_elem.h | 4 +-
lib/librte_eal/common/malloc_heap.c | 57 ++++++++++------
lib/librte_eal/common/malloc_heap.h | 4 +-
lib/librte_eal/common/rte_malloc.c | 6 +-
6 files changed, 134 insertions(+), 58 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index 718dee8..75c7dd9 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -98,7 +98,8 @@ find_heap_max_free_elem(int *s, unsigned align)

static const struct rte_memzone *
memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
- int socket_id, unsigned flags, unsigned align, unsigned bound)
+ int socket_id, unsigned int flags, unsigned int align,
+ unsigned int bound, bool contig)
{
struct rte_memzone *mz;
struct rte_mem_config *mcfg;
@@ -182,7 +183,7 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,

/* allocate memory on heap */
void *mz_addr = malloc_heap_alloc(NULL, requested_len, socket_id, flags,
- align, bound);
+ align, bound, contig);

if (mz_addr == NULL) {
rte_errno = ENOMEM;
@@ -215,9 +216,9 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
}

static const struct rte_memzone *
-rte_memzone_reserve_thread_safe(const char *name, size_t len,
- int socket_id, unsigned flags, unsigned align,
- unsigned bound)
+rte_memzone_reserve_thread_safe(const char *name, size_t len, int socket_id,
+ unsigned int flags, unsigned int align, unsigned int bound,
+ bool contig)
{
struct rte_mem_config *mcfg;
const struct rte_memzone *mz = NULL;
@@ -228,7 +229,7 @@ rte_memzone_reserve_thread_safe(const char *name, size_t len,
rte_rwlock_write_lock(&mcfg->mlock);

mz = memzone_reserve_aligned_thread_unsafe(
- name, len, socket_id, flags, align, bound);
+ name, len, socket_id, flags, align, bound, contig);

rte_rwlock_write_unlock(&mcfg->mlock);

@@ -245,7 +246,7 @@ rte_memzone_reserve_bounded(const char *name, size_t len, int socket_id,
unsigned flags, unsigned align, unsigned bound)
{
return rte_memzone_reserve_thread_safe(name, len, socket_id, flags,
- align, bound);
+ align, bound, false);
}

/*
@@ -257,7 +258,7 @@ rte_memzone_reserve_aligned(const char *name, size_t len, int socket_id,
unsigned flags, unsigned align)
{
return rte_memzone_reserve_thread_safe(name, len, socket_id, flags,
- align, 0);
+ align, 0, false);
}

/*
@@ -269,7 +270,8 @@ rte_memzone_reserve(const char *name, size_t len, int socket_id,
unsigned flags)
{
return rte_memzone_reserve_thread_safe(name, len, socket_id,
- flags, RTE_CACHE_LINE_SIZE, 0);
+ flags, RTE_CACHE_LINE_SIZE, 0,
+ false);
}

int
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index eabad66..d2dba35 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -17,6 +17,7 @@
#include <rte_common.h>
#include <rte_spinlock.h>

+#include "eal_memalloc.h"
#include "malloc_elem.h"
#include "malloc_heap.h"

@@ -94,33 +95,88 @@ malloc_elem_insert(struct malloc_elem *elem)
}

/*
+ * Attempt to find enough physically contiguous memory in this block to store
+ * our data. Assume that element has at least enough space to fit in the data,
+ * so we just check the page addresses.
+ */
+static bool
+elem_check_phys_contig(struct rte_memseg_list *msl, void *start, size_t size)
+{
+ uint64_t page_sz;
+ void *aligned_start, *end, *aligned_end;
+ size_t aligned_len;
+
+ /* figure out how many pages we need to fit in current data */
+ page_sz = msl->hugepage_sz;
+ aligned_start = RTE_PTR_ALIGN_FLOOR(start, page_sz);
+ end = RTE_PTR_ADD(start, size);
+ aligned_end = RTE_PTR_ALIGN_CEIL(end, page_sz);
+
+ aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start);
+
+ return eal_memalloc_is_contig(msl, aligned_start, aligned_len);
+}
+
+/*
* calculate the starting point of where data of the requested size
* and alignment would fit in the current element. If the data doesn't
* fit, return NULL.
*/
static void *
elem_start_pt(struct malloc_elem *elem, size_t size, unsigned align,
- size_t bound)
+ size_t bound, bool contig)
{
- const size_t bmask = ~(bound - 1);
- uintptr_t end_pt = (uintptr_t)elem +
- elem->size - MALLOC_ELEM_TRAILER_LEN;
- uintptr_t new_data_start = RTE_ALIGN_FLOOR((end_pt - size), align);
- uintptr_t new_elem_start;
-
- /* check boundary */
- if ((new_data_start & bmask) != ((end_pt - 1) & bmask)) {
- end_pt = RTE_ALIGN_FLOOR(end_pt, bound);
- new_data_start = RTE_ALIGN_FLOOR((end_pt - size), align);
- end_pt = new_data_start + size;
- if (((end_pt - 1) & bmask) != (new_data_start & bmask))
- return NULL;
- }
+ size_t elem_size = elem->size;

- new_elem_start = new_data_start - MALLOC_ELEM_HEADER_LEN;
+ /*
+ * we're allocating from the end, so adjust the size of element by page
+ * size each time
+ */
+ while (elem_size >= size) {
+ const size_t bmask = ~(bound - 1);
+ uintptr_t end_pt = (uintptr_t)elem +
+ elem_size - MALLOC_ELEM_TRAILER_LEN;
+ uintptr_t new_data_start = RTE_ALIGN_FLOOR((end_pt - size),
+ align);
+ uintptr_t new_elem_start;
+
+ /* check boundary */
+ if ((new_data_start & bmask) != ((end_pt - 1) & bmask)) {
+ end_pt = RTE_ALIGN_FLOOR(end_pt, bound);
+ new_data_start = RTE_ALIGN_FLOOR((end_pt - size),
+ align);
+ end_pt = new_data_start + size;
+
+ if (((end_pt - 1) & bmask) != (new_data_start & bmask))
+ return NULL;
+ }

- /* if the new start point is before the exist start, it won't fit */
- return (new_elem_start < (uintptr_t)elem) ? NULL : (void *)new_elem_start;
+ new_elem_start = new_data_start - MALLOC_ELEM_HEADER_LEN;
+
+ /* if the new start point is before the exist start,
+ * it won't fit
+ */
+ if (new_elem_start < (uintptr_t)elem)
+ return NULL;
+
+ if (contig) {
+ size_t new_data_size = end_pt - new_data_start;
+
+ /*
+ * if physical contiguousness was requested and we
+ * couldn't fit all data into one physically contiguous
+ * block, try again with lower addresses.
+ */
+ if (!elem_check_phys_contig(elem->msl,
+ (void *) new_data_start,
+ new_data_size)) {
+ elem_size -= align;
+ continue;
+ }
+ }
+ return (void *) new_elem_start;
+ }
+ return NULL;
}

/*
@@ -129,9 +185,9 @@ elem_start_pt(struct malloc_elem *elem, size_t size, unsigned align,
*/
int
malloc_elem_can_hold(struct malloc_elem *elem, size_t size, unsigned align,
- size_t bound)
+ size_t bound, bool contig)
{
- return elem_start_pt(elem, size, align, bound) != NULL;
+ return elem_start_pt(elem, size, align, bound, contig) != NULL;
}

/*
@@ -259,9 +315,10 @@ malloc_elem_free_list_remove(struct malloc_elem *elem)
*/
struct malloc_elem *
malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align,
- size_t bound)
+ size_t bound, bool contig)
{
- struct malloc_elem *new_elem = elem_start_pt(elem, size, align, bound);
+ struct malloc_elem *new_elem = elem_start_pt(elem, size, align, bound,
+ contig);
const size_t old_elem_size = (uintptr_t)new_elem - (uintptr_t)elem;
const size_t trailer_size = elem->size - old_elem_size - size -
MALLOC_ELEM_OVERHEAD;
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 6d979d2..798472e 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -123,7 +123,7 @@ malloc_elem_insert(struct malloc_elem *elem);
*/
int
malloc_elem_can_hold(struct malloc_elem *elem, size_t size,
- unsigned align, size_t bound);
+ unsigned int align, size_t bound, bool contig);

/*
* reserve a block of data in an existing malloc_elem. If the malloc_elem
@@ -131,7 +131,7 @@ malloc_elem_can_hold(struct malloc_elem *elem, size_t size,
*/
struct malloc_elem *
malloc_elem_alloc(struct malloc_elem *elem, size_t size,
- unsigned align, size_t bound);
+ unsigned int align, size_t bound, bool contig);

/*
* free a malloc_elem block by adding it to the free list. If the
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 87dc9ad..984e027 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -94,7 +94,7 @@ malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl,
*/
static struct malloc_elem *
find_suitable_element(struct malloc_heap *heap, size_t size,
- unsigned flags, size_t align, size_t bound)
+ unsigned int flags, size_t align, size_t bound, bool contig)
{
size_t idx;
struct malloc_elem *elem, *alt_elem = NULL;
@@ -103,7 +103,8 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
idx < RTE_HEAP_NUM_FREELISTS; idx++) {
for (elem = LIST_FIRST(&heap->free_head[idx]);
!!elem; elem = LIST_NEXT(elem, free_list)) {
- if (malloc_elem_can_hold(elem, size, align, bound)) {
+ if (malloc_elem_can_hold(elem, size, align, bound,
+ contig)) {
if (check_hugepage_sz(flags,
elem->msl->hugepage_sz))
return elem;
@@ -127,16 +128,16 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
*/
static void *
heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,
- unsigned int flags, size_t align, size_t bound)
+ unsigned int flags, size_t align, size_t bound, bool contig)
{
struct malloc_elem *elem;

size = RTE_CACHE_LINE_ROUNDUP(size);
align = RTE_CACHE_LINE_ROUNDUP(align);

- elem = find_suitable_element(heap, size, flags, align, bound);
+ elem = find_suitable_element(heap, size, flags, align, bound, contig);
if (elem != NULL) {
- elem = malloc_elem_alloc(elem, size, align, bound);
+ elem = malloc_elem_alloc(elem, size, align, bound, contig);

/* increase heap's count of allocated elements */
heap->alloc_count++;
@@ -147,14 +148,15 @@ heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,

static int
try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
- int socket, unsigned int flags, size_t align, size_t bound)
+ int socket, unsigned int flags, size_t align, size_t bound,
+ bool contig)
{
+ size_t map_len, data_start_offset;
struct rte_memseg_list *msl;
struct rte_memseg **ms;
struct malloc_elem *elem;
- size_t map_len;
int i, n_pages, allocd_pages;
- void *ret, *map_addr;
+ void *ret, *map_addr, *data_start;

align = RTE_MAX(align, MALLOC_ELEM_HEADER_LEN);
map_len = RTE_ALIGN_CEIL(align + elt_size + MALLOC_ELEM_TRAILER_LEN,
@@ -175,11 +177,22 @@ try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
map_addr = ms[0]->addr;
msl = rte_mem_virt2memseg_list(map_addr);

+ /* check if we wanted contiguous memory but didn't get it */
+ data_start_offset = RTE_ALIGN(MALLOC_ELEM_HEADER_LEN, align);
+ data_start = RTE_PTR_ADD(ms[0]->addr, data_start_offset);
+ if (contig && !eal_memalloc_is_contig(msl, data_start,
+ n_pages * msl->hugepage_sz)) {
+ RTE_LOG(DEBUG, EAL, "%s(): couldn't allocate physically contiguous space\n",
+ __func__);
+ goto free_pages;
+ }
+
/* add newly minted memsegs to malloc heap */
elem = malloc_heap_add_memory(heap, msl, map_addr, map_len);

/* try once more, as now we have allocated new memory */
- ret = find_suitable_element(heap, elt_size, flags, align, bound);
+ ret = find_suitable_element(heap, elt_size, flags, align, bound,
+ contig);

if (ret == NULL)
goto free_elem;
@@ -196,6 +209,7 @@ try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
malloc_elem_hide_region(elem, map_addr, map_len);
heap->total_size -= map_len;

+free_pages:
for (i = 0; i < n_pages; i++)
eal_memalloc_free_page(ms[i]);
free_ms:
@@ -223,7 +237,7 @@ compare_pagesz(const void *a, const void *b)

static int
alloc_mem_on_socket(size_t size, int socket, unsigned int flags, size_t align,
- size_t bound)
+ size_t bound, bool contig)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
@@ -304,14 +318,14 @@ alloc_mem_on_socket(size_t size, int socket, unsigned int flags, size_t align,
* sizes first, before resorting to best effort allocation.
*/
if (!try_expand_heap(heap, pg_sz, size, socket, size_flags,
- align, bound))
+ align, bound, contig))
return 0;
}
if (n_other_pg_sz == 0)
return -1;

/* now, check if we can reserve anything with size hint */
- ret = find_suitable_element(heap, size, flags, align, bound);
+ ret = find_suitable_element(heap, size, flags, align, bound, contig);
if (ret != NULL)
return 0;

@@ -323,7 +337,7 @@ alloc_mem_on_socket(size_t size, int socket, unsigned int flags, size_t align,
uint64_t pg_sz = other_pg_sz[i];

if (!try_expand_heap(heap, pg_sz, size, socket, flags,
- align, bound))
+ align, bound, contig))
return 0;
}
return -1;
@@ -332,7 +346,7 @@ alloc_mem_on_socket(size_t size, int socket, unsigned int flags, size_t align,
/* this will try lower page sizes first */
static void *
heap_alloc_on_socket(const char *type, size_t size, int socket,
- unsigned int flags, size_t align, size_t bound)
+ unsigned int flags, size_t align, size_t bound, bool contig)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
@@ -345,7 +359,7 @@ heap_alloc_on_socket(const char *type, size_t size, int socket,

/* for legacy mode, try once and with all flags */
if (internal_config.legacy_mem) {
- ret = heap_alloc(heap, type, size, flags, align, bound);
+ ret = heap_alloc(heap, type, size, flags, align, bound, contig);
goto alloc_unlock;
}

@@ -354,12 +368,12 @@ heap_alloc_on_socket(const char *type, size_t size, int socket,
* we may still be able to allocate memory from appropriate page sizes,
* we just need to request more memory first.
*/
- ret = heap_alloc(heap, type, size, size_flags, align, bound);
+ ret = heap_alloc(heap, type, size, size_flags, align, bound, contig);
if (ret != NULL)
goto alloc_unlock;

- if (!alloc_mem_on_socket(size, socket, flags, align, bound)) {
- ret = heap_alloc(heap, type, size, flags, align, bound);
+ if (!alloc_mem_on_socket(size, socket, flags, align, bound, contig)) {
+ ret = heap_alloc(heap, type, size, flags, align, bound, contig);

/* this should have succeeded */
if (ret == NULL)
@@ -372,7 +386,7 @@ heap_alloc_on_socket(const char *type, size_t size, int socket,

void *
malloc_heap_alloc(const char *type, size_t size, int socket_arg,
- unsigned int flags, size_t align, size_t bound)
+ unsigned int flags, size_t align, size_t bound, bool contig)
{
int socket, i;
void *ret;
@@ -393,7 +407,8 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg,
if (socket >= RTE_MAX_NUMA_NODES)
return NULL;

- ret = heap_alloc_on_socket(type, size, socket, flags, align, bound);
+ ret = heap_alloc_on_socket(type, size, socket, flags, align, bound,
+ contig);
if (ret != NULL || socket_arg != SOCKET_ID_ANY)
return ret;

@@ -402,7 +417,7 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg,
if (i == socket)
continue;
ret = heap_alloc_on_socket(type, size, i, flags,
- align, bound);
+ align, bound, contig);
if (ret != NULL)
return ret;
}
diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h
index 292d578..03b8014 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -5,6 +5,8 @@
#ifndef MALLOC_HEAP_H_
#define MALLOC_HEAP_H_

+#include <stdbool.h>
+
#include <rte_malloc.h>
#include <rte_malloc_heap.h>

@@ -25,7 +27,7 @@ malloc_get_numa_socket(void)

void *
malloc_heap_alloc(const char *type, size_t size, int socket, unsigned int flags,
- size_t align, size_t bound);
+ size_t align, size_t bound, bool contig);

int
malloc_heap_free(struct malloc_elem *elem);
diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index b0fe11c..5cd92d1 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -37,7 +37,8 @@ void rte_free(void *addr)
* Allocate memory on specified heap.
*/
void *
-rte_malloc_socket(const char *type, size_t size, unsigned align, int socket_arg)
+rte_malloc_socket(const char *type, size_t size, unsigned int align,
+ int socket_arg)
{
/* return NULL if size is 0 or alignment is not power-of-2 */
if (size == 0 || (align && !rte_is_power_of_2(align)))
@@ -50,8 +51,7 @@ rte_malloc_socket(const char *type, size_t size, unsigned align, int socket_arg)
if (socket_arg >= RTE_MAX_NUMA_NODES)
return NULL;

- return malloc_heap_alloc(type, size, socket_arg, 0,
- align == 0 ? 1 : align, 0);
+ return malloc_heap_alloc(type, size, socket_arg, 0, align, 0, false);
}

/*
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:41 UTC
Permalink
Before, we were aggregating multiple pages into one memseg, so the
number of memsegs was small. Now, each page gets its own memseg,
so the list of memsegs is huge. To accommodate the new memseg list
size and to keep the under-the-hood workings sane, the memseg list
is now not just a single list, but multiple lists. To be precise,
each hugepage size available on the system gets one or more memseg
lists, per socket.
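For example, with the fbarray naming scheme introduced below, the
first list of 2M pages on socket 0 is backed by an fbarray named
"memseg-2048k-0-0" (page size in kB, socket ID, list index).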

In order to support dynamic memory allocation, we reserve all
memory in advance. That is, we do an anonymous mmap() of the entire
maximum size of memory per hugepage size, per socket (limited to
either RTE_MAX_MEMSEG_PER_TYPE pages or RTE_MAX_MEM_PER_TYPE
gigabytes worth of memory, whichever is smaller), split over
multiple lists (each limited to either RTE_MAX_MEMSEG_PER_LIST
memsegs or RTE_MAX_MEM_PER_LIST gigabytes, whichever is smaller).

So, for each hugepage size, we get (by default) up to 128G worth
of memory, per socket, split into chunks of up to 32G in size.
The address space is claimed at the start, in eal_common_memory.c.
The actual page allocation code is in eal_memalloc.c (Linux-only
for now), and largely consists of copied EAL memory init code.
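As a worked example with the default values added to
config/common_base below: for 1G pages, each list is capped at
min(8192 * 1G, 32G) = 32G and each type at min(32768 * 1G, 128G) =
128G, i.e. four 32G lists per socket; for 2M pages, it is
min(8192 * 2M, 32G) = 16G per list and min(32768 * 2M, 128G) = 64G
per type, i.e. four 16G lists per socket.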

Pages in each list are also indexed by address. That is, in
non-legacy mode, in order to figure out where a page belongs, one
can simply check its address against the base address and length
of each memseg list. Similarly, figuring out the IOVA address of a
memzone is a matter of finding the right memseg list, taking the
offset from its base address and dividing by the page size to get
the appropriate memseg. For legacy mode, the old behavior of
walking the memseg list remains.
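
As a minimal sketch of that non-legacy lookup (assuming the
rte_memseg_list layout added in this patch; the helper name is
illustrative - the real code is virt2memseg() in the diff below):

#include <stdint.h>
#include <rte_eal_memconfig.h>
#include <rte_fbarray.h>

/* illustrative only: map a virtual address to its memseg in one list */
static struct rte_memseg *
msl_lookup(const struct rte_memseg_list *msl, const void *addr)
{
	size_t list_len = msl->hugepage_sz * msl->memseg_arr.len;
	uintptr_t off;

	/* the address must fall within this list's reserved VA range */
	if (addr < msl->base_va ||
			(uintptr_t)addr >= (uintptr_t)msl->base_va + list_len)
		return NULL;

	/* offset from the list base, divided by page size, is the index */
	off = (uintptr_t)addr - (uintptr_t)msl->base_va;
	return rte_fbarray_get(&msl->memseg_arr, off / msl->hugepage_sz);
}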

Due to the switch to fbarray and to avoid any intrusive changes,
secondary processes are not supported in this commit. Also, one
particular API call (rte_eal_get_physmem_layout) no longer makes
sense and was removed, as per the deprecation notice [1].

In legacy mode, nothing is preallocated and memsegs are populated
at init as before, but each segment now resides in the memseg list
appropriate for its socket and page size.

The rest of the changes are really ripple effects from the memseg
change - heap changes, compile fixes, and rewrites to support
fbarray-backed memseg lists.

[1] http://dpdk.org/dev/patchwork/patch/34002/

Signed-off-by: Anatoly Burakov <***@intel.com>
---
config/common_base | 15 +-
drivers/bus/pci/linux/pci.c | 29 +-
drivers/net/virtio/virtio_user/vhost_kernel.c | 108 +++++---
lib/librte_eal/common/eal_common_memory.c | 322 +++++++++++++++++++---
lib/librte_eal/common/eal_common_memzone.c | 12 +-
lib/librte_eal/common/eal_hugepages.h | 2 +
lib/librte_eal/common/eal_internal_cfg.h | 2 +-
lib/librte_eal/common/include/rte_eal_memconfig.h | 22 +-
lib/librte_eal/common/include/rte_memory.h | 33 ++-
lib/librte_eal/common/include/rte_memzone.h | 1 -
lib/librte_eal/common/malloc_elem.c | 8 +-
lib/librte_eal/common/malloc_elem.h | 6 +-
lib/librte_eal/common/malloc_heap.c | 92 +++++--
lib/librte_eal/common/rte_malloc.c | 22 +-
lib/librte_eal/linuxapp/eal/eal.c | 21 +-
lib/librte_eal/linuxapp/eal/eal_memory.c | 297 +++++++++++++-------
lib/librte_eal/linuxapp/eal/eal_vfio.c | 164 +++++++----
lib/librte_eal/rte_eal_version.map | 3 +-
test/test/test_malloc.c | 29 +-
test/test/test_memory.c | 43 ++-
test/test/test_memzone.c | 17 +-
21 files changed, 917 insertions(+), 331 deletions(-)

diff --git a/config/common_base b/config/common_base
index ad03cf4..e9c1d93 100644
--- a/config/common_base
+++ b/config/common_base
@@ -61,7 +61,20 @@ CONFIG_RTE_CACHE_LINE_SIZE=64
CONFIG_RTE_LIBRTE_EAL=y
CONFIG_RTE_MAX_LCORE=128
CONFIG_RTE_MAX_NUMA_NODES=8
-CONFIG_RTE_MAX_MEMSEG=256
+CONFIG_RTE_MAX_MEMSEG_LISTS=32
+# each memseg list will be limited to either RTE_MAX_MEMSEG_PER_LIST pages
+# or RTE_MAX_MEM_PER_LIST gigabytes worth of memory, whichever is the smallest
+CONFIG_RTE_MAX_MEMSEG_PER_LIST=8192
+CONFIG_RTE_MAX_MEM_PER_LIST=32
+# a "type" is a combination of page size and NUMA node. total number of memseg
+# lists per type will be limited to either RTE_MAX_MEMSEG_PER_TYPE pages (split
+# over multiple lists of RTE_MAX_MEMSEG_PER_LIST pages), or RTE_MAX_MEM_PER_TYPE
+# gigabytes of memory (split over multiple lists of RTE_MAX_MEM_PER_LIST),
+# whichever is the smallest
+CONFIG_RTE_MAX_MEMSEG_PER_TYPE=32768
+CONFIG_RTE_MAX_MEM_PER_TYPE=128
+# legacy mem mode only
+CONFIG_RTE_MAX_LEGACY_MEMSEG=256
CONFIG_RTE_MAX_MEMZONE=2560
CONFIG_RTE_MAX_TAILQ=32
CONFIG_RTE_ENABLE_ASSERT=n
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index abde641..ec05d7c 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -119,19 +119,30 @@ rte_pci_unmap_device(struct rte_pci_device *dev)
void *
pci_find_max_end_va(void)
{
- const struct rte_memseg *seg = rte_eal_get_physmem_layout();
- const struct rte_memseg *last = seg;
- unsigned i = 0;
+ void *cur_end, *max_end = NULL;
+ int i = 0;

- for (i = 0; i < RTE_MAX_MEMSEG; i++, seg++) {
- if (seg->addr == NULL)
- break;
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ struct rte_fbarray *arr = &msl->memseg_arr;

- if (seg->addr > last->addr)
- last = seg;
+ if (arr->count == 0)
+ continue;

+ /*
+ * we need to handle legacy mem case, so don't rely on page size
+ * to calculate max VA end
+ */
+ while ((i = rte_fbarray_find_next_used(arr, i)) >= 0) {
+ struct rte_memseg *ms = rte_fbarray_get(arr, i);
+ cur_end = RTE_PTR_ADD(ms->addr, ms->len);
+ if (cur_end > max_end)
+ max_end = cur_end;
+ }
}
- return RTE_PTR_ADD(last->addr, last->len);
+ return max_end;
}

/* parse one line of the "resource" sysfs file (note that the 'line'
diff --git a/drivers/net/virtio/virtio_user/vhost_kernel.c b/drivers/net/virtio/virtio_user/vhost_kernel.c
index 8d0a1ab..23c5e1c 100644
--- a/drivers/net/virtio/virtio_user/vhost_kernel.c
+++ b/drivers/net/virtio/virtio_user/vhost_kernel.c
@@ -70,6 +70,42 @@ static uint64_t vhost_req_user_to_kernel[] = {
[VHOST_USER_SET_MEM_TABLE] = VHOST_SET_MEM_TABLE,
};

+/* returns number of segments processed */
+static int
+add_memory_region(struct vhost_memory_region *mr, struct rte_fbarray *arr,
+ int reg_start_idx, int max)
+{
+ const struct rte_memseg *ms;
+ void *start_addr, *expected_addr;
+ uint64_t len;
+ int idx;
+
+ idx = reg_start_idx;
+ len = 0;
+ start_addr = NULL;
+ expected_addr = NULL;
+
+ /* we could've relied on page size, but we have to support legacy mem */
+ while (idx < max) {
+ ms = rte_fbarray_get(arr, idx);
+ if (expected_addr == NULL) {
+ start_addr = ms->addr;
+ expected_addr = RTE_PTR_ADD(ms->addr, ms->len);
+ } else if (ms->addr != expected_addr) {
+ break;
+ }
+ len += ms->len;
+ idx++;
+ }
+
+ mr->guest_phys_addr = (uint64_t)(uintptr_t)start_addr;
+ mr->userspace_addr = (uint64_t)(uintptr_t)start_addr;
+ mr->memory_size = len;
+ mr->mmap_offset = 0;
+
+ return idx;
+}
+
/* By default, vhost kernel module allows 64 regions, but DPDK allows
* 256 segments. As a relief, below function merges those virtually
* adjacent memsegs into one region.
@@ -77,8 +113,7 @@ static uint64_t vhost_req_user_to_kernel[] = {
static struct vhost_memory_kernel *
prepare_vhost_memory_kernel(void)
{
- uint32_t i, j, k = 0;
- struct rte_memseg *seg;
+ uint32_t list_idx, region_nr = 0;
struct vhost_memory_region *mr;
struct vhost_memory_kernel *vm;

@@ -88,52 +123,41 @@ prepare_vhost_memory_kernel(void)
if (!vm)
return NULL;

- for (i = 0; i < RTE_MAX_MEMSEG; ++i) {
- seg = &rte_eal_get_configuration()->mem_config->memseg[i];
- if (!seg->addr)
- break;
-
- int new_region = 1;
+ for (list_idx = 0; list_idx < RTE_MAX_MEMSEG_LISTS; ++list_idx) {
+ struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl = &mcfg->memsegs[list_idx];
+ struct rte_fbarray *arr = &msl->memseg_arr;
+ int reg_start_idx, search_idx;

- for (j = 0; j < k; ++j) {
- mr = &vm->regions[j];
-
- if (mr->userspace_addr + mr->memory_size ==
- (uint64_t)(uintptr_t)seg->addr) {
- mr->memory_size += seg->len;
- new_region = 0;
- break;
- }
-
- if ((uint64_t)(uintptr_t)seg->addr + seg->len ==
- mr->userspace_addr) {
- mr->guest_phys_addr =
- (uint64_t)(uintptr_t)seg->addr;
- mr->userspace_addr =
- (uint64_t)(uintptr_t)seg->addr;
- mr->memory_size += seg->len;
- new_region = 0;
- break;
- }
- }
-
- if (new_region == 0)
+ /* skip empty segment lists */
+ if (arr->count == 0)
continue;

- mr = &vm->regions[k++];
- /* use vaddr here! */
- mr->guest_phys_addr = (uint64_t)(uintptr_t)seg->addr;
- mr->userspace_addr = (uint64_t)(uintptr_t)seg->addr;
- mr->memory_size = seg->len;
- mr->mmap_offset = 0;
-
- if (k >= max_regions) {
- free(vm);
- return NULL;
+ search_idx = 0;
+ while ((reg_start_idx = rte_fbarray_find_next_used(arr,
+ search_idx)) >= 0) {
+ int reg_n_pages;
+ if (region_nr >= max_regions) {
+ free(vm);
+ return NULL;
+ }
+ mr = &vm->regions[region_nr++];
+
+ /*
+ * we know memseg starts at search_idx, check how many
+ * segments there are
+ */
+ reg_n_pages = rte_fbarray_find_contig_used(arr,
+ search_idx);
+
+ /* look at at most reg_n_pages of memsegs */
+ search_idx = add_memory_region(mr, arr, reg_start_idx,
+ search_idx + reg_n_pages);
}
}

- vm->nregions = k;
+ vm->nregions = region_nr;
vm->padding = 0;
return vm;
}
diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index 042881b..457e239 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -13,6 +13,7 @@
#include <sys/mman.h>
#include <sys/queue.h>

+#include <rte_fbarray.h>
#include <rte_memory.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
@@ -30,6 +31,8 @@
* which is a multiple of hugepage size.
*/

+#define MEMSEG_LIST_FMT "memseg-%luk-%i-%i"
+
static uint64_t baseaddr_offset;
static uint64_t system_page_sz;

@@ -120,15 +123,245 @@ eal_get_virtual_area(void *requested_addr, uint64_t *size,
return aligned_addr;
}

-/*
- * Return a pointer to a read-only table of struct rte_physmem_desc
- * elements, containing the layout of all addressable physical
- * memory. The last element of the table contains a NULL address.
- */
-const struct rte_memseg *
-rte_eal_get_physmem_layout(void)
+static uint64_t
+get_mem_amount(uint64_t page_sz)
+{
+ uint64_t area_sz, max_pages;
+
+ max_pages = internal_config.legacy_mem || internal_config.no_hugetlbfs ?
+ RTE_MAX_LEGACY_MEMSEG : RTE_MAX_MEMSEG_PER_LIST;
+
+ /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_PER_LIST GB */
+ area_sz = RTE_MIN(page_sz * max_pages,
+ (uint64_t) RTE_MAX_MEM_PER_LIST << 30);
+ /* make sure the list isn't smaller than the page size */
+ area_sz = RTE_MAX(area_sz, page_sz);
+
+ return rte_align64pow2(area_sz);
+}
+
+static int
+alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
+ int socket_id, int type_msl_idx)
+{
+ char name[RTE_FBARRAY_NAME_LEN];
+ int max_pages;
+ uint64_t mem_amount;
+ void *addr;
+
+ if (!internal_config.legacy_mem) {
+ mem_amount = get_mem_amount(page_sz);
+ max_pages = mem_amount / page_sz;
+
+ addr = eal_get_virtual_area(NULL, &mem_amount, page_sz, 0, 0);
+ if (addr == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
+ return -1;
+ }
+ } else {
+ addr = NULL;
+		/* number of memsegs in each list; these will not be single-page
+		 * segments, so RTE_MAX_LEGACY_MEMSEG is like the old default.
+ */
+ max_pages = RTE_MAX_LEGACY_MEMSEG;
+ }
+
+ snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
+ type_msl_idx);
+ if (rte_fbarray_init(&msl->memseg_arr, name, max_pages,
+ sizeof(struct rte_memseg))) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
+ rte_strerror(rte_errno));
+ return -1;
+ }
+
+ msl->hugepage_sz = page_sz;
+ msl->socket_id = socket_id;
+ msl->base_va = addr;
+
+ RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n",
+ page_sz >> 10, socket_id);
+
+ return 0;
+}
+
+static int
+memseg_init(void)
{
- return rte_eal_get_configuration()->mem_config->memseg;
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int socket_id, hpi_idx, msl_idx = 0;
+ struct rte_memseg_list *msl;
+
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+ RTE_LOG(ERR, EAL, "Secondary process not supported\n");
+ return -1;
+ }
+
+ /* create memseg lists */
+ for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;
+ hpi_idx++) {
+ struct hugepage_info *hpi;
+ uint64_t hugepage_sz;
+
+ hpi = &internal_config.hugepage_info[hpi_idx];
+ hugepage_sz = hpi->hugepage_sz;
+
+ for (socket_id = 0; socket_id < (int) rte_num_sockets();
+ socket_id++) {
+ uint64_t max_mem, total_mem = 0;
+ int type_msl_idx, max_segs, total_segs = 0;
+
+ max_mem = (uint64_t)RTE_MAX_MEM_PER_TYPE << 30;
+ /* no-huge behaves the same as legacy */
+ max_segs = internal_config.legacy_mem ||
+ internal_config.no_hugetlbfs ?
+ RTE_MAX_LEGACY_MEMSEG :
+ RTE_MAX_MEMSEG_PER_TYPE;
+
+ type_msl_idx = 0;
+ while (total_mem < max_mem && total_segs < max_segs) {
+ if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL,
+ "No more space in memseg lists, please increase CONFIG_RTE_MAX_MEMSEG_LISTS\n");
+ return -1;
+ }
+
+ msl = &mcfg->memsegs[msl_idx++];
+
+ if (alloc_memseg_list(msl, hugepage_sz,
+ socket_id, type_msl_idx))
+ return -1;
+
+ total_segs += msl->memseg_arr.len;
+ total_mem = total_segs * msl->hugepage_sz;
+ type_msl_idx++;
+ }
+ }
+ }
+ return 0;
+}
+
+static struct rte_memseg *
+virt2memseg(const void *addr, const struct rte_memseg_list *msl)
+{
+ const struct rte_fbarray *arr;
+ int ms_idx;
+
+ /* a memseg list was specified, check if it's the right one */
+ void *start, *end;
+ start = msl->base_va;
+ end = RTE_PTR_ADD(start, msl->hugepage_sz *
+ msl->memseg_arr.len);
+
+ if (addr < start || addr >= end)
+ return NULL;
+
+ /* now, calculate index */
+ arr = &msl->memseg_arr;
+ ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->hugepage_sz;
+ return rte_fbarray_get(arr, ms_idx);
+}
+
+static struct rte_memseg_list *
+virt2memseg_list(const void *addr)
+{
+ struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl;
+ int msl_idx;
+
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ void *start, *end;
+ msl = &mcfg->memsegs[msl_idx];
+
+ start = msl->base_va;
+ end = RTE_PTR_ADD(start, msl->hugepage_sz *
+ msl->memseg_arr.len);
+ if (addr >= start && addr < end)
+ break;
+ }
+ /* if we didn't find our memseg list */
+ if (msl_idx == RTE_MAX_MEMSEG_LISTS)
+ return NULL;
+ return msl;
+}
+
+static struct rte_memseg_list *
+virt2memseg_list_legacy(const void *addr)
+{
+ struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl;
+ struct rte_fbarray *arr;
+ int msl_idx, ms_idx;
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ msl = &mcfg->memsegs[msl_idx];
+ arr = &msl->memseg_arr;
+
+ ms_idx = 0;
+ while ((ms_idx =
+ rte_fbarray_find_next_used(arr, ms_idx)) >= 0) {
+ const struct rte_memseg *ms;
+ void *start, *end;
+ ms = rte_fbarray_get(arr, ms_idx);
+ start = ms->addr;
+ end = RTE_PTR_ADD(start, ms->len);
+ if (addr >= start && addr < end)
+ return msl;
+ ms_idx++;
+ }
+ }
+ return NULL;
+}
+
+static struct rte_memseg *
+virt2memseg_legacy(const void *addr)
+{
+ struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl;
+ struct rte_fbarray *arr;
+ int msl_idx, ms_idx;
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ msl = &mcfg->memsegs[msl_idx];
+ arr = &msl->memseg_arr;
+
+ ms_idx = 0;
+ while ((ms_idx =
+ rte_fbarray_find_next_used(arr, ms_idx)) >= 0) {
+ struct rte_memseg *ms;
+ void *start, *end;
+ ms = rte_fbarray_get(arr, ms_idx);
+ start = ms->addr;
+ end = RTE_PTR_ADD(start, ms->len);
+ if (addr >= start && addr < end)
+ return ms;
+ ms_idx++;
+ }
+ }
+ return NULL;
+}
+
+struct rte_memseg_list *
+rte_mem_virt2memseg_list(const void *addr)
+{
+ /* for legacy memory, we just walk the list, like in the old days. */
+ if (internal_config.legacy_mem)
+ return virt2memseg_list_legacy(addr);
+ else
+ return virt2memseg_list(addr);
+}
+
+struct rte_memseg *
+rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl)
+{
+ /* for legacy memory, we just walk the list, like in the old days. */
+ if (internal_config.legacy_mem)
+ /* ignore msl value */
+ return virt2memseg_legacy(addr);
+
+ return virt2memseg(addr, msl != NULL ? msl :
+ rte_mem_virt2memseg_list(addr));
}


@@ -136,18 +369,32 @@ rte_eal_get_physmem_layout(void)
uint64_t
rte_eal_get_physmem_size(void)
{
- const struct rte_mem_config *mcfg;
+ struct rte_mem_config *mcfg;
unsigned i = 0;
uint64_t total_len = 0;

/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;

- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if (mcfg->memseg[i].addr == NULL)
- break;
-
- total_len += mcfg->memseg[i].len;
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+
+ if (msl->memseg_arr.count == 0)
+ continue;
+
+ /* for legacy mem mode, walk the memsegs */
+ if (internal_config.legacy_mem) {
+ struct rte_fbarray *arr = &msl->memseg_arr;
+ int ms_idx = 0;
+
+ while ((ms_idx = rte_fbarray_find_next_used(arr,
+ ms_idx) >= 0)) {
+ const struct rte_memseg *ms =
+ rte_fbarray_get(arr, ms_idx);
+ total_len += ms->len;
+ }
+ } else
+ total_len += msl->hugepage_sz * msl->memseg_arr.count;
}

return total_len;
@@ -157,27 +404,35 @@ rte_eal_get_physmem_size(void)
void
rte_dump_physmem_layout(FILE *f)
{
- const struct rte_mem_config *mcfg;
+ struct rte_mem_config *mcfg;
unsigned i = 0;

/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;

- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if (mcfg->memseg[i].addr == NULL)
- break;
-
- fprintf(f, "Segment %u: IOVA:0x%"PRIx64", len:%zu, "
- "virt:%p, socket_id:%"PRId32", "
- "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
- "nrank:%"PRIx32"\n", i,
- mcfg->memseg[i].iova,
- mcfg->memseg[i].len,
- mcfg->memseg[i].addr,
- mcfg->memseg[i].socket_id,
- mcfg->memseg[i].hugepage_sz,
- mcfg->memseg[i].nchannel,
- mcfg->memseg[i].nrank);
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ struct rte_fbarray *arr = &msl->memseg_arr;
+ int m_idx = 0;
+
+ if (arr->count == 0)
+ continue;
+
+ while ((m_idx = rte_fbarray_find_next_used(arr, m_idx)) >= 0) {
+ struct rte_memseg *ms = rte_fbarray_get(arr, m_idx);
+ fprintf(f, "Page %u-%u: iova:0x%"PRIx64", len:%zu, "
+ "virt:%p, socket_id:%"PRId32", "
+ "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
+ "nrank:%"PRIx32"\n", i, m_idx,
+ ms->iova,
+ ms->len,
+ ms->addr,
+ ms->socket_id,
+ ms->hugepage_sz,
+ ms->nchannel,
+ ms->nrank);
+ m_idx++;
+ }
}
}

@@ -222,9 +477,14 @@ rte_mem_lock_page(const void *virt)
int
rte_eal_memory_init(void)
{
+ int retval;
RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n");

- const int retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
+ retval = memseg_init();
+ if (retval < 0)
+ return -1;
+
+ retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
rte_eal_hugepage_init() :
rte_eal_hugepage_attach();
if (retval < 0)
diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index 1ab3ade..ed36174 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -226,10 +226,9 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
mz->iova = rte_malloc_virt2iova(mz_addr);
mz->addr = mz_addr;
mz->len = (requested_len == 0 ? elem->size : requested_len);
- mz->hugepage_sz = elem->ms->hugepage_sz;
- mz->socket_id = elem->ms->socket_id;
+ mz->hugepage_sz = elem->msl->hugepage_sz;
+ mz->socket_id = elem->msl->socket_id;
mz->flags = 0;
- mz->memseg_id = elem->ms - rte_eal_get_configuration()->mem_config->memseg;

return mz;
}
@@ -382,7 +381,6 @@ int
rte_eal_memzone_init(void)
{
struct rte_mem_config *mcfg;
- const struct rte_memseg *memseg;

/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
@@ -391,12 +389,6 @@ rte_eal_memzone_init(void)
if (rte_eal_process_type() == RTE_PROC_SECONDARY)
return 0;

- memseg = rte_eal_get_physmem_layout();
- if (memseg == NULL) {
- RTE_LOG(ERR, EAL, "%s(): Cannot get physical layout\n", __func__);
- return -1;
- }
-
rte_rwlock_write_lock(&mcfg->mlock);

/* delete all zones */
diff --git a/lib/librte_eal/common/eal_hugepages.h b/lib/librte_eal/common/eal_hugepages.h
index 1d519bb..f963ae5 100644
--- a/lib/librte_eal/common/eal_hugepages.h
+++ b/lib/librte_eal/common/eal_hugepages.h
@@ -23,6 +23,8 @@ struct hugepage_file {
int socket_id; /**< NUMA socket ID */
int file_id; /**< the '%d' in HUGEFILE_FMT */
int memseg_id; /**< the memory segment to which page belongs */
+ int memseg_list_id;
+ /**< the memory segment list to which page belongs */
char filepath[MAX_HUGEPAGE_PATH]; /**< path to backing file on filesystem */
};

diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 4a43de6..601ba90 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -23,7 +23,7 @@ struct hugepage_info {
uint64_t hugepage_sz; /**< size of a huge page */
const char *hugedir; /**< dir where hugetlbfs is mounted */
uint32_t num_pages[RTE_MAX_NUMA_NODES];
- /**< number of hugepages of that size on each socket */
+ /**< number of hugepages of that size on each socket */
int lock_descriptor; /**< file descriptor for hugepage dir */
};

diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
index 29fa0b6..31fc8e7 100644
--- a/lib/librte_eal/common/include/rte_eal_memconfig.h
+++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
@@ -12,12 +12,30 @@
#include <rte_malloc_heap.h>
#include <rte_rwlock.h>
#include <rte_pause.h>
+#include <rte_fbarray.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
+ * memseg list is a special case as we need to store a bunch of other data
+ * together with the array itself.
+ */
+struct rte_memseg_list {
+ RTE_STD_C11
+ union {
+ void *base_va;
+ /**< Base virtual address for this memseg list. */
+ uint64_t addr_64;
+ /**< Makes sure addr is always 64-bits */
+ };
+ int socket_id; /**< Socket ID for all memsegs in this list. */
+ uint64_t hugepage_sz; /**< page size for all memsegs in this list. */
+ struct rte_fbarray memseg_arr;
+};
+
+/**
* the structure for the memory configuration for the RTE.
* Used by the rte_config structure. It is separated out, as for multi-process
* support, the memory details should be shared across instances
@@ -43,9 +61,11 @@ struct rte_mem_config {
uint32_t memzone_cnt; /**< Number of allocated memzones */

/* memory segments and zones */
- struct rte_memseg memseg[RTE_MAX_MEMSEG]; /**< Physmem descriptors. */
struct rte_memzone memzone[RTE_MAX_MEMZONE]; /**< Memzone descriptors. */

+ struct rte_memseg_list memsegs[RTE_MAX_MEMSEG_LISTS];
+ /**< list of dynamic arrays holding memsegs */
+
struct rte_tailq_head tailq_head[RTE_MAX_TAILQ]; /**< Tailqs for objects */

/* Heaps of Malloc per socket */
diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index 302f865..674d4cb 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -22,6 +22,9 @@ extern "C" {
#include <rte_common.h>
#include <rte_config.h>

+/* forward declaration for pointers */
+struct rte_memseg_list;
+
__extension__
enum rte_page_sizes {
RTE_PGSIZE_4K = 1ULL << 12,
@@ -130,21 +133,27 @@ phys_addr_t rte_mem_virt2phy(const void *virt);
rte_iova_t rte_mem_virt2iova(const void *virt);

/**
- * Get the layout of the available physical memory.
+ * Get memseg corresponding to virtual memory address.
*
- * It can be useful for an application to have the full physical
- * memory layout to decide the size of a memory zone to reserve. This
- * table is stored in rte_config (see rte_eal_get_configuration()).
+ * @param virt
+ * The virtual address.
+ * @param msl
+ * Memseg list in which to look for memsegs (can be NULL).
+ * @return
+ * Memseg to which this virtual address belongs to.
+ */
+struct rte_memseg *rte_mem_virt2memseg(const void *virt,
+ const struct rte_memseg_list *msl);
+
+/**
+ * Get memseg list corresponding to virtual memory address.
*
+ * @param virt
+ * The virtual address.
* @return
- * - On success, return a pointer to a read-only table of struct
- * rte_physmem_desc elements, containing the layout of all
- * addressable physical memory. The last element of the table
- * contains a NULL address.
- * - On error, return NULL. This should not happen since it is a fatal
- * error that will probably cause the entire system to panic.
- */
-const struct rte_memseg *rte_eal_get_physmem_layout(void);
+ * Memseg list to which this virtual address belongs to.
+ */
+struct rte_memseg_list *rte_mem_virt2memseg_list(const void *virt);

/**
* Dump the physical memory layout to a file.
diff --git a/lib/librte_eal/common/include/rte_memzone.h b/lib/librte_eal/common/include/rte_memzone.h
index 2bfb273..a69f068 100644
--- a/lib/librte_eal/common/include/rte_memzone.h
+++ b/lib/librte_eal/common/include/rte_memzone.h
@@ -66,7 +66,6 @@ struct rte_memzone {
int32_t socket_id; /**< NUMA socket ID. */

uint32_t flags; /**< Characteristics of this memzone. */
- uint32_t memseg_id; /**< Memseg it belongs. */
} __attribute__((__packed__));

/**
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index c18f050..701bffd 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -26,11 +26,11 @@
* Initialize a general malloc_elem header structure
*/
void
-malloc_elem_init(struct malloc_elem *elem,
- struct malloc_heap *heap, const struct rte_memseg *ms, size_t size)
+malloc_elem_init(struct malloc_elem *elem, struct malloc_heap *heap,
+ struct rte_memseg_list *msl, size_t size)
{
elem->heap = heap;
- elem->ms = ms;
+ elem->msl = msl;
elem->prev = NULL;
elem->next = NULL;
memset(&elem->free_list, 0, sizeof(elem->free_list));
@@ -145,7 +145,7 @@ split_elem(struct malloc_elem *elem, struct malloc_elem *split_pt)
const size_t old_elem_size = (uintptr_t)split_pt - (uintptr_t)elem;
const size_t new_elem_size = elem->size - old_elem_size;

- malloc_elem_init(split_pt, elem->heap, elem->ms, new_elem_size);
+ malloc_elem_init(split_pt, elem->heap, elem->msl, new_elem_size);
split_pt->prev = elem;
split_pt->next = next_elem;
if (next_elem)
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 9c1614c..388c16f 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -5,7 +5,7 @@
#ifndef MALLOC_ELEM_H_
#define MALLOC_ELEM_H_

-#include <rte_memory.h>
+#include <rte_eal_memconfig.h>

/* dummy definition of struct so we can use pointers to it in malloc_elem struct */
struct malloc_heap;
@@ -24,7 +24,7 @@ struct malloc_elem {
/**< points to next elem in memseg */
LIST_ENTRY(malloc_elem) free_list;
/**< list of free elements in heap */
- const struct rte_memseg *ms;
+ struct rte_memseg_list *msl;
volatile enum elem_state state;
uint32_t pad;
size_t size;
@@ -111,7 +111,7 @@ malloc_elem_from_data(const void *data)
void
malloc_elem_init(struct malloc_elem *elem,
struct malloc_heap *heap,
- const struct rte_memseg *ms,
+ struct rte_memseg_list *msl,
size_t size);

void
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index a2c2e4c..058ad75 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -21,6 +21,7 @@
#include <rte_memcpy.h>
#include <rte_atomic.h>

+#include "eal_internal_cfg.h"
#include "malloc_elem.h"
#include "malloc_heap.h"

@@ -62,22 +63,25 @@ check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
}

/*
- * Expand the heap with a memseg.
- * This reserves the zone and sets a dummy malloc_elem header at the end
- * to prevent overflow. The rest of the zone is added to free list as a single
- * large free block
+ * Expand the heap with a memory area.
*/
-static void
-malloc_heap_add_memseg(struct malloc_heap *heap, struct rte_memseg *ms)
+static struct malloc_elem *
+malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl,
+ void *start, size_t len)
{
- struct malloc_elem *start_elem = (struct malloc_elem *)ms->addr;
- const size_t elem_size = ms->len - MALLOC_ELEM_OVERHEAD;
+ struct malloc_elem *elem = start;
+
+ malloc_elem_init(elem, heap, msl, len);
+
+ malloc_elem_insert(elem);
+
+ elem = malloc_elem_join_adjacent_free(elem);

- malloc_elem_init(start_elem, heap, ms, elem_size);
- malloc_elem_insert(start_elem);
- malloc_elem_free_list_insert(start_elem);
+ malloc_elem_free_list_insert(elem);

- heap->total_size += elem_size;
+ heap->total_size += len;
+
+ return elem;
}

/*
@@ -98,7 +102,8 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
for (elem = LIST_FIRST(&heap->free_head[idx]);
!!elem; elem = LIST_NEXT(elem, free_list)) {
if (malloc_elem_can_hold(elem, size, align, bound)) {
- if (check_hugepage_sz(flags, elem->ms->hugepage_sz))
+ if (check_hugepage_sz(flags,
+ elem->msl->hugepage_sz))
return elem;
if (alt_elem == NULL)
alt_elem = elem;
@@ -243,16 +248,65 @@ int
rte_eal_malloc_heap_init(void)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- unsigned ms_cnt;
- struct rte_memseg *ms;
+ int msl_idx;
+ struct rte_memseg_list *msl;

if (mcfg == NULL)
return -1;

- for (ms = &mcfg->memseg[0], ms_cnt = 0;
- (ms_cnt < RTE_MAX_MEMSEG) && (ms->len > 0);
- ms_cnt++, ms++) {
- malloc_heap_add_memseg(&mcfg->malloc_heaps[ms->socket_id], ms);
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ int start;
+ struct rte_fbarray *arr;
+ struct malloc_heap *heap;
+
+ msl = &mcfg->memsegs[msl_idx];
+ arr = &msl->memseg_arr;
+ heap = &mcfg->malloc_heaps[msl->socket_id];
+
+ if (arr->count == 0)
+ continue;
+
+ /* for legacy mode, just walk the list */
+ if (internal_config.legacy_mem) {
+ int ms_idx = 0;
+ while ((ms_idx = rte_fbarray_find_next_used(arr,
+ ms_idx)) >= 0) {
+ struct rte_memseg *ms =
+ rte_fbarray_get(arr, ms_idx);
+ malloc_heap_add_memory(heap, msl,
+ ms->addr, ms->len);
+ ms_idx++;
+ RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
+ msl->socket_id, ms->len >> 20ULL);
+ }
+ continue;
+ }
+
+ /* find first segment */
+ start = rte_fbarray_find_next_used(arr, 0);
+
+ while (start >= 0) {
+ int contig_segs;
+ struct rte_memseg *start_seg;
+ size_t len, hugepage_sz = msl->hugepage_sz;
+
+ /* find how many pages we can lump in together */
+ contig_segs = rte_fbarray_find_contig_used(arr, start);
+ start_seg = rte_fbarray_get(arr, start);
+ len = contig_segs * hugepage_sz;
+
+ /*
+ * we've found (hopefully) a bunch of contiguous
+ * segments, so add them to the heap.
+ */
+ malloc_heap_add_memory(heap, msl, start_seg->addr, len);
+
+ RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
+ msl->socket_id, len >> 20ULL);
+
+ start = rte_fbarray_find_next_used(arr,
+ start + contig_segs);
+ }
}

return 0;
diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index 80fb6cc..bd7e757 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -238,17 +238,21 @@ rte_malloc_set_limit(__rte_unused const char *type,
rte_iova_t
rte_malloc_virt2iova(const void *addr)
{
- rte_iova_t iova;
- const struct malloc_elem *elem = malloc_elem_from_data(addr);
+ const struct rte_memseg *ms;
+ struct malloc_elem *elem = malloc_elem_from_data(addr);
+
if (elem == NULL)
return RTE_BAD_IOVA;
- if (elem->ms->iova == RTE_BAD_IOVA)
- return RTE_BAD_IOVA;

if (rte_eal_iova_mode() == RTE_IOVA_VA)
- iova = (uintptr_t)addr;
- else
- iova = elem->ms->iova +
- RTE_PTR_DIFF(addr, elem->ms->addr);
- return iova;
+ return (uintptr_t) addr;
+
+ ms = rte_mem_virt2memseg(addr, elem->msl);
+ if (ms == NULL)
+ return RTE_BAD_IOVA;
+
+ if (ms->iova == RTE_BAD_IOVA)
+ return RTE_BAD_IOVA;
+
+ return ms->iova + RTE_PTR_DIFF(addr, ms->addr);
}
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 5207713..7851a7d 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -74,8 +74,8 @@ static int mem_cfg_fd = -1;
static struct flock wr_lock = {
.l_type = F_WRLCK,
.l_whence = SEEK_SET,
- .l_start = offsetof(struct rte_mem_config, memseg),
- .l_len = sizeof(early_mem_config.memseg),
+ .l_start = offsetof(struct rte_mem_config, memsegs),
+ .l_len = sizeof(early_mem_config.memsegs),
};

/* Address of global and public configuration */
@@ -643,17 +643,20 @@ eal_parse_args(int argc, char **argv)
static void
eal_check_mem_on_local_socket(void)
{
- const struct rte_memseg *ms;
+ const struct rte_memseg_list *msl;
int i, socket_id;

socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);

- ms = rte_eal_get_physmem_layout();
-
- for (i = 0; i < RTE_MAX_MEMSEG; i++)
- if (ms[i].socket_id == socket_id &&
- ms[i].len > 0)
- return;
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ msl = &rte_eal_get_configuration()->mem_config->memsegs[i];
+ if (msl->socket_id != socket_id)
+ continue;
+ /* for legacy memory, check if there's anything allocated */
+ if (internal_config.legacy_mem && msl->memseg_arr.count == 0)
+ continue;
+ return;
+ }

RTE_LOG(WARNING, EAL, "WARNING: Master core has no "
"memory on local socket!\n");
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index b9bcb75..9512da9 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -908,6 +908,28 @@ huge_recover_sigbus(void)
}
}

+/* in legacy mode, each combination of socket and pagesize directly maps to a
+ * single memseg list.
+ */
+static struct rte_memseg_list *
+get_memseg_list(int socket, uint64_t page_sz)
+{
+ struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl;
+ int msl_idx;
+
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ msl = &mcfg->memsegs[msl_idx];
+ if (msl->hugepage_sz != page_sz)
+ continue;
+ if (msl->socket_id != socket)
+ continue;
+ return msl;
+ }
+ return NULL;
+}
+
/*
* Prepare physical memory mapping: fill configuration structure with
* these infos, return 0 on success.
@@ -925,11 +947,14 @@ eal_legacy_hugepage_init(void)
struct rte_mem_config *mcfg;
struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
+ struct rte_fbarray *arr;
+ struct rte_memseg *ms;

uint64_t memory[RTE_MAX_NUMA_NODES];

unsigned hp_offset;
int i, j, new_memseg;
+ int ms_idx, msl_idx;
int nr_hugefiles, nr_hugepages = 0;
void *addr;

@@ -942,6 +967,12 @@ eal_legacy_hugepage_init(void)

/* hugetlbfs can be disabled */
if (internal_config.no_hugetlbfs) {
+ /* nohuge mode is legacy mode */
+ internal_config.legacy_mem = 1;
+
+ arr = &mcfg->memsegs[0].memseg_arr;
+ ms = rte_fbarray_get(arr, 0);
+
addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
if (addr == MAP_FAILED) {
@@ -949,14 +980,15 @@ eal_legacy_hugepage_init(void)
strerror(errno));
return -1;
}
+ rte_fbarray_set_used(arr, 0);
if (rte_eal_iova_mode() == RTE_IOVA_VA)
- mcfg->memseg[0].iova = (uintptr_t)addr;
+ ms->iova = (uintptr_t)addr;
else
- mcfg->memseg[0].iova = RTE_BAD_IOVA;
- mcfg->memseg[0].addr = addr;
- mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
- mcfg->memseg[0].len = internal_config.memory;
- mcfg->memseg[0].socket_id = 0;
+ ms->iova = RTE_BAD_IOVA;
+ ms->addr = addr;
+ ms->hugepage_sz = RTE_PGSIZE_4K;
+ ms->len = internal_config.memory;
+ ms->socket_id = 0;
return 0;
}

@@ -1197,27 +1229,51 @@ eal_legacy_hugepage_init(void)
#endif

if (new_memseg) {
- j += 1;
- if (j == RTE_MAX_MEMSEG)
+ struct rte_memseg_list *msl;
+ int socket;
+ uint64_t page_sz;
+
+ socket = hugepage[i].socket_id;
+ page_sz = hugepage[i].size;
+
+ if (page_sz == 0)
+ continue;
+
+ /* figure out where to put this memseg */
+ msl = get_memseg_list(socket, page_sz);
+ if (!msl)
+ rte_panic("Unknown socket or page sz: %i %lx\n",
+ socket, page_sz);
+ msl_idx = msl - &mcfg->memsegs[0];
+ arr = &msl->memseg_arr;
+
+ ms_idx = rte_fbarray_find_next_free(arr, arr->count);
+ if (ms_idx < 0) {
+ RTE_LOG(ERR, EAL, "No space in memseg list\n");
break;
+ }
+ ms = rte_fbarray_get(arr, ms_idx);
+
+ ms->iova = hugepage[i].physaddr;
+ ms->addr = hugepage[i].final_va;
+ ms->len = page_sz;
+ ms->socket_id = socket;
+ ms->hugepage_sz = page_sz;

- mcfg->memseg[j].iova = hugepage[i].physaddr;
- mcfg->memseg[j].addr = hugepage[i].final_va;
- mcfg->memseg[j].len = hugepage[i].size;
- mcfg->memseg[j].socket_id = hugepage[i].socket_id;
- mcfg->memseg[j].hugepage_sz = hugepage[i].size;
+ rte_fbarray_set_used(arr, ms_idx);
}
/* continuation of previous memseg */
else {
#ifdef RTE_ARCH_PPC_64
/* Use the phy and virt address of the last page as segment
* address for IBM Power architecture */
- mcfg->memseg[j].iova = hugepage[i].physaddr;
- mcfg->memseg[j].addr = hugepage[i].final_va;
+ ms->iova = hugepage[i].physaddr;
+ ms->addr = hugepage[i].final_va;
#endif
- mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz;
+ ms->len += ms->hugepage_sz;
}
- hugepage[i].memseg_id = j;
+ hugepage[i].memseg_id = ms_idx;
+ hugepage[i].memseg_list_id = msl_idx;
}

if (i < nr_hugefiles) {
@@ -1227,7 +1283,7 @@ eal_legacy_hugepage_init(void)
"Please either increase it or request less amount "
"of memory.\n",
i, nr_hugefiles, RTE_STR(CONFIG_RTE_MAX_MEMSEG),
- RTE_MAX_MEMSEG);
+ RTE_MAX_MEMSEG_PER_LIST);
goto fail;
}

@@ -1265,11 +1321,12 @@ getFileSize(int fd)
static int
eal_legacy_hugepage_attach(void)
{
- const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct hugepage_file *hp = NULL;
- unsigned num_hp = 0;
- unsigned i, s = 0; /* s used to track the segment number */
- unsigned max_seg = RTE_MAX_MEMSEG;
+ unsigned int num_hp = 0;
+ unsigned int i;
+ int ms_idx, msl_idx;
+ unsigned int cur_seg, max_seg;
off_t size = 0;
int fd, fd_hugepage = -1;

@@ -1289,46 +1346,57 @@ eal_legacy_hugepage_attach(void)
}

/* map all segments into memory to make sure we get the addrs */
- for (s = 0; s < RTE_MAX_MEMSEG; ++s) {
- void *base_addr;
+ max_seg = 0;
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[msl_idx];
+ struct rte_fbarray *arr = &msl->memseg_arr;
uint64_t mmap_sz;
int mmap_flags = 0;

- /*
- * the first memory segment with len==0 is the one that
- * follows the last valid segment.
- */
- if (mcfg->memseg[s].len == 0)
- break;
+ ms_idx = rte_fbarray_find_next_used(arr, 0);
+ while (ms_idx >= 0) {
+ struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
+ void *base_addr;

- /* get identical addresses as the primary process.
- */
+ ms = rte_fbarray_get(arr, ms_idx);
+
+ /*
+ * the first memory segment with len==0 is the one that
+ * follows the last valid segment.
+ */
+ if (ms->len == 0)
+ break;
+
+ /* get identical addresses as the primary process.
+ */
#ifdef RTE_ARCH_PPC_64
- mmap_flags |= MAP_HUGETLB;
+ mmap_flags |= MAP_HUGETLB;
#endif
- mmap_sz = mcfg->memseg[s].len;
- base_addr = eal_get_virtual_area(mcfg->memseg[s].addr,
- &mmap_sz, mcfg->memseg[s].hugepage_sz, 0,
- mmap_flags);
- if (base_addr == NULL) {
- max_seg = s;
- if (rte_errno == EADDRNOTAVAIL) {
- RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n",
- (unsigned long long)mcfg->memseg[s].len,
- mcfg->memseg[s].addr);
- } else {
- RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p]: '%s'\n",
- (unsigned long long)mcfg->memseg[s].len,
- mcfg->memseg[s].addr,
- rte_strerror(rte_errno));
- }
- if (aslr_enabled() > 0) {
- RTE_LOG(ERR, EAL, "It is recommended to "
- "disable ASLR in the kernel "
- "and retry running both primary "
- "and secondary processes\n");
+ mmap_sz = ms->len;
+ base_addr = eal_get_virtual_area(ms->addr, &mmap_sz,
+ ms->hugepage_sz, 0, mmap_flags);
+ if (base_addr == NULL) {
+ if (rte_errno == EADDRNOTAVAIL) {
+ RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n",
+ (unsigned long long)ms->len,
+ ms->addr);
+ } else {
+ RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p]: '%s'\n",
+ (unsigned long long)ms->len,
+ ms->addr, strerror(errno));
+ }
+ if (aslr_enabled() > 0) {
+ RTE_LOG(ERR, EAL, "It is recommended to "
+ "disable ASLR in the kernel "
+ "and retry running both primary "
+ "and secondary processes\n");
+ }
+ goto error;
}
- goto error;
+ max_seg++;
+ ms_idx++;
+
+ ms_idx = rte_fbarray_find_next_used(arr, ms_idx);
}
}

@@ -1342,46 +1410,67 @@ eal_legacy_hugepage_attach(void)
num_hp = size / sizeof(struct hugepage_file);
RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp);

- s = 0;
- while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0){
- void *addr, *base_addr;
- uintptr_t offset = 0;
- size_t mapping_size;
- /*
- * free previously mapped memory so we can map the
- * hugepages into the space
- */
- base_addr = mcfg->memseg[s].addr;
- munmap(base_addr, mcfg->memseg[s].len);
-
- /* find the hugepages for this segment and map them
- * we don't need to worry about order, as the server sorted the
- * entries before it did the second mmap of them */
- for (i = 0; i < num_hp && offset < mcfg->memseg[s].len; i++){
- if (hp[i].memseg_id == (int)s){
- fd = open(hp[i].filepath, O_RDWR);
- if (fd < 0) {
- RTE_LOG(ERR, EAL, "Could not open %s\n",
- hp[i].filepath);
- goto error;
- }
- mapping_size = hp[i].size;
- addr = mmap(RTE_PTR_ADD(base_addr, offset),
- mapping_size, PROT_READ | PROT_WRITE,
- MAP_SHARED, fd, 0);
- close(fd); /* close file both on success and on failure */
- if (addr == MAP_FAILED ||
- addr != RTE_PTR_ADD(base_addr, offset)) {
- RTE_LOG(ERR, EAL, "Could not mmap %s\n",
- hp[i].filepath);
- goto error;
+	/* remap the hugepage files into the VA space we reserved above */
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[msl_idx];
+ struct rte_fbarray *arr = &msl->memseg_arr;
+
+ ms_idx = rte_fbarray_find_next_used(arr, 0);
+ while (ms_idx >= 0) {
+ struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
+ void *addr, *base_addr;
+ uintptr_t offset = 0;
+ size_t mapping_size;
+
+ ms = rte_fbarray_get(arr, ms_idx);
+ /*
+ * free previously mapped memory so we can map the
+ * hugepages into the space
+ */
+ base_addr = ms->addr;
+ munmap(base_addr, ms->len);
+
+ /*
+ * find the hugepages for this segment and map them
+ * we don't need to worry about order, as the server
+ * sorted the entries before it did the second mmap of
+ * them
+ */
+ for (i = 0; i < num_hp && offset < ms->len; i++) {
+ if (hp[i].memseg_id == ms_idx &&
+ hp[i].memseg_list_id ==
+ msl_idx) {
+ fd = open(hp[i].filepath, O_RDWR);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "Could not open %s\n",
+ hp[i].filepath);
+ goto error;
+ }
+ mapping_size = hp[i].size;
+ addr = mmap(RTE_PTR_ADD(base_addr,
+ offset),
+ mapping_size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+ /*
+ * close file both on success and on
+ * failure
+ */
+ close(fd);
+ if (addr == MAP_FAILED ||
+ addr != RTE_PTR_ADD(
+ base_addr, offset)) {
+ RTE_LOG(ERR, EAL, "Could not mmap %s\n",
+ hp[i].filepath);
+ goto error;
+ }
+ offset += mapping_size;
}
- offset+=mapping_size;
}
+ RTE_LOG(DEBUG, EAL, "Mapped segment of size 0x%llx\n",
+ (unsigned long long)ms->len);
+ ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
}
- RTE_LOG(DEBUG, EAL, "Mapped segment %u of size 0x%llx\n", s,
- (unsigned long long)mcfg->memseg[s].len);
- s++;
}
/* unmap the hugepage config file, since we are done using it */
munmap(hp, size);
@@ -1389,8 +1478,28 @@ eal_legacy_hugepage_attach(void)
return 0;

error:
- for (i = 0; i < max_seg && mcfg->memseg[i].len > 0; i++)
- munmap(mcfg->memseg[i].addr, mcfg->memseg[i].len);
+	/* unmap all the segments we mapped before hitting the error */
+ cur_seg = 0;
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[msl_idx];
+ struct rte_fbarray *arr = &msl->memseg_arr;
+
+ if (cur_seg >= max_seg)
+ break;
+
+ ms_idx = rte_fbarray_find_next_used(arr, 0);
+ while (ms_idx >= 0) {
+ struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
+
+ if (cur_seg >= max_seg)
+ break;
+			ms = rte_fbarray_get(arr, ms_idx);
+ munmap(ms->addr, ms->len);
+
+ cur_seg++;
+			ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
+ }
+ }
if (hp != NULL && hp != MAP_FAILED)
munmap(hp, size);
if (fd_hugepage >= 0)
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index e44ae4d..5192763 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -667,33 +667,53 @@ vfio_get_group_no(const char *sysfs_base,
static int
vfio_type1_dma_map(int vfio_container_fd)
{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
int i, ret;

/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
struct vfio_iommu_type1_dma_map dma_map;
+ struct rte_memseg_list *msl;
+ struct rte_fbarray *arr;
+ int ms_idx, next_idx;

- if (ms[i].addr == NULL)
- break;
+ msl = &rte_eal_get_configuration()->mem_config->memsegs[i];
+ arr = &msl->memseg_arr;

- memset(&dma_map, 0, sizeof(dma_map));
- dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
- dma_map.vaddr = ms[i].addr_64;
- dma_map.size = ms[i].len;
- if (rte_eal_iova_mode() == RTE_IOVA_VA)
- dma_map.iova = dma_map.vaddr;
- else
- dma_map.iova = ms[i].iova;
- dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+ /* skip empty memseg lists */
+ if (arr->count == 0)
+ continue;

- ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+ next_idx = 0;

- if (ret) {
- RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
- "error %i (%s)\n", errno,
- strerror(errno));
- return -1;
+ while ((ms_idx = rte_fbarray_find_next_used(arr,
+				next_idx)) >= 0) {
+ uint64_t addr, len, hw_addr;
+ const struct rte_memseg *ms;
+ next_idx = ms_idx + 1;
+
+ ms = rte_fbarray_get(arr, ms_idx);
+
+ addr = ms->addr_64;
+ len = ms->hugepage_sz;
+ hw_addr = ms->iova;
+
+ memset(&dma_map, 0, sizeof(dma_map));
+ dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+ dma_map.vaddr = addr;
+ dma_map.size = len;
+ dma_map.iova = hw_addr;
+ dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+ VFIO_DMA_MAP_FLAG_WRITE;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA,
+ &dma_map);
+
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
+ "error %i (%s)\n", errno,
+ strerror(errno));
+ return -1;
+ }
}
}

@@ -703,8 +723,8 @@ vfio_type1_dma_map(int vfio_container_fd)
static int
vfio_spapr_dma_map(int vfio_container_fd)
{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
int i, ret;
+ uint64_t hugepage_sz = 0;

struct vfio_iommu_spapr_register_memory reg = {
.argsz = sizeof(reg),
@@ -738,17 +758,31 @@ vfio_spapr_dma_map(int vfio_container_fd)
}

/* create DMA window from 0 to max(phys_addr + len) */
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if (ms[i].addr == NULL)
- break;
-
- create.window_size = RTE_MAX(create.window_size,
- ms[i].iova + ms[i].len);
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ struct rte_fbarray *arr = &msl->memseg_arr;
+ int idx, next_idx;
+
+ if (msl->base_va == NULL)
+ continue;
+ if (msl->memseg_arr.count == 0)
+ continue;
+
+ next_idx = 0;
+ while ((idx = rte_fbarray_find_next_used(arr, next_idx)) >= 0) {
+ const struct rte_memseg *ms = rte_fbarray_get(arr, idx);
+ hugepage_sz = RTE_MAX(hugepage_sz, ms->hugepage_sz);
+ create.window_size = RTE_MAX(create.window_size,
+					ms->iova + ms->len);
+ next_idx = idx + 1;
+ }
}

/* sPAPR requires window size to be a power of 2 */
create.window_size = rte_align64pow2(create.window_size);
- create.page_shift = __builtin_ctzll(ms->hugepage_sz);
+ create.page_shift = __builtin_ctzll(hugepage_sz);
create.levels = 1;

ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
@@ -764,41 +798,61 @@ vfio_spapr_dma_map(int vfio_container_fd)
}

/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
struct vfio_iommu_type1_dma_map dma_map;
+ struct rte_memseg_list *msl;
+ struct rte_fbarray *arr;
+ int ms_idx, next_idx;

- if (ms[i].addr == NULL)
- break;
+ msl = &rte_eal_get_configuration()->mem_config->memsegs[i];
+ arr = &msl->memseg_arr;

- reg.vaddr = (uintptr_t) ms[i].addr;
- reg.size = ms[i].len;
- ret = ioctl(vfio_container_fd,
- VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, "
- "error %i (%s)\n", errno, strerror(errno));
- return -1;
- }
+ /* skip empty memseg lists */
+ if (arr->count == 0)
+ continue;

- memset(&dma_map, 0, sizeof(dma_map));
- dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
- dma_map.vaddr = ms[i].addr_64;
- dma_map.size = ms[i].len;
- if (rte_eal_iova_mode() == RTE_IOVA_VA)
- dma_map.iova = dma_map.vaddr;
- else
- dma_map.iova = ms[i].iova;
- dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
- VFIO_DMA_MAP_FLAG_WRITE;
+ next_idx = 0;

- ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+ while ((ms_idx = rte_fbarray_find_next_used(arr,
+				next_idx)) >= 0) {
+ uint64_t addr, len, hw_addr;
+ const struct rte_memseg *ms;
+ next_idx = ms_idx + 1;

- if (ret) {
- RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
- "error %i (%s)\n", errno, strerror(errno));
- return -1;
- }
+ ms = rte_fbarray_get(arr, ms_idx);
+
+ addr = ms->addr_64;
+ len = ms->hugepage_sz;
+ hw_addr = ms->iova;

+ reg.vaddr = (uintptr_t) addr;
+ reg.size = len;
+ ret = ioctl(vfio_container_fd,
+ VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ memset(&dma_map, 0, sizeof(dma_map));
+ dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+ dma_map.vaddr = addr;
+ dma_map.size = len;
+ dma_map.iova = hw_addr;
+ dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+ VFIO_DMA_MAP_FLAG_WRITE;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA,
+ &dma_map);
+
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
+ "error %i (%s)\n", errno,
+ strerror(errno));
+ return -1;
+ }
+ }
}

return 0;
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index a938a2f..4c2e959 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -25,7 +25,6 @@ DPDK_2.0 {
rte_eal_devargs_type_count;
rte_eal_get_configuration;
rte_eal_get_lcore_state;
- rte_eal_get_physmem_layout;
rte_eal_get_physmem_size;
rte_eal_has_hugepages;
rte_eal_hpet_init;
@@ -215,6 +214,8 @@ DPDK_18.05 {
global:

rte_num_sockets;
+ rte_mem_virt2memseg;
+ rte_mem_virt2memseg_list;
rte_malloc_dump_heaps;
rte_fbarray_init;
rte_fbarray_destroy;
diff --git a/test/test/test_malloc.c b/test/test/test_malloc.c
index d23192c..8484fb6 100644
--- a/test/test/test_malloc.c
+++ b/test/test/test_malloc.c
@@ -12,6 +12,7 @@

#include <rte_common.h>
#include <rte_memory.h>
+#include <rte_eal_memconfig.h>
#include <rte_per_lcore.h>
#include <rte_launch.h>
#include <rte_eal.h>
@@ -705,15 +706,23 @@ test_malloc_bad_params(void)
return -1;
}

-/* Check if memory is available on a specific socket */
+/* Check if memory is available on a specific socket */
static int
is_mem_on_socket(int32_t socket)
{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+ const struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
unsigned i;

- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if (socket == ms[i].socket_id)
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ const struct rte_memseg_list *msl =
+ &mcfg->memsegs[i];
+ const struct rte_fbarray *arr = &msl->memseg_arr;
+
+ if (msl->socket_id != socket)
+ continue;
+
+ if (arr->count)
return 1;
}
return 0;
@@ -726,16 +735,8 @@ is_mem_on_socket(int32_t socket)
static int32_t
addr_to_socket(void * addr)
{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
- unsigned i;
-
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if ((ms[i].addr <= addr) &&
- ((uintptr_t)addr <
- ((uintptr_t)ms[i].addr + (uintptr_t)ms[i].len)))
- return ms[i].socket_id;
- }
- return -1;
+ const struct rte_memseg *ms = rte_mem_virt2memseg(addr, NULL);
+ return ms == NULL ? -1 : ms->socket_id;
}

/* Test using rte_[c|m|zm]alloc_socket() on a specific socket */
diff --git a/test/test/test_memory.c b/test/test/test_memory.c
index 972321f..8cb52d7 100644
--- a/test/test/test_memory.c
+++ b/test/test/test_memory.c
@@ -5,8 +5,11 @@
#include <stdio.h>
#include <stdint.h>

+#include <rte_eal.h>
+#include <rte_eal_memconfig.h>
#include <rte_memory.h>
#include <rte_common.h>
+#include <rte_memzone.h>

#include "test.h"

@@ -25,10 +28,12 @@
static int
test_memory(void)
{
+ const struct rte_memzone *mz = NULL;
uint64_t s;
unsigned i;
size_t j;
- const struct rte_memseg *mem;
+ struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;

/*
* dump the mapped memory: the python-expect script checks
@@ -40,20 +45,42 @@ test_memory(void)
/* check that memory size is != 0 */
s = rte_eal_get_physmem_size();
if (s == 0) {
- printf("No memory detected\n");
- return -1;
+ printf("No memory detected, attempting to allocate\n");
+ mz = rte_memzone_reserve("tmp", 1000, SOCKET_ID_ANY, 0);
+
+ if (!mz) {
+ printf("Failed to allocate a memzone\n");
+ return -1;
+ }
}

/* try to read memory (should not segfault) */
- mem = rte_eal_get_physmem_layout();
- for (i = 0; i < RTE_MAX_MEMSEG && mem[i].addr != NULL ; i++) {
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ struct rte_fbarray *arr = &msl->memseg_arr;
+ int search_idx, cur_idx;
+
+ if (arr->count == 0)
+ continue;
+
+ search_idx = 0;

- /* check memory */
- for (j = 0; j<mem[i].len; j++) {
- *((volatile uint8_t *) mem[i].addr + j);
+ while ((cur_idx = rte_fbarray_find_next_used(arr,
+ search_idx)) >= 0) {
+ const struct rte_memseg *ms;
+
+ ms = rte_fbarray_get(arr, cur_idx);
+
+ /* check memory */
+ for (j = 0; j < ms->len; j++)
+ *((volatile uint8_t *) ms->addr + j);
+ search_idx = cur_idx + 1;
}
}

+ if (mz)
+ rte_memzone_free(mz);
+
return 0;
}

diff --git a/test/test/test_memzone.c b/test/test/test_memzone.c
index 8ece1ac..47f4de8 100644
--- a/test/test/test_memzone.c
+++ b/test/test/test_memzone.c
@@ -108,22 +108,25 @@ static int
test_memzone_reserve_flags(void)
{
const struct rte_memzone *mz;
- const struct rte_memseg *ms;
int hugepage_2MB_avail = 0;
int hugepage_1GB_avail = 0;
int hugepage_16MB_avail = 0;
int hugepage_16GB_avail = 0;
const size_t size = 100;
int i = 0;
- ms = rte_eal_get_physmem_layout();
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if (ms[i].hugepage_sz == RTE_PGSIZE_2M)
+
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+
+ if (msl->hugepage_sz == RTE_PGSIZE_2M)
hugepage_2MB_avail = 1;
- if (ms[i].hugepage_sz == RTE_PGSIZE_1G)
+ if (msl->hugepage_sz == RTE_PGSIZE_1G)
hugepage_1GB_avail = 1;
- if (ms[i].hugepage_sz == RTE_PGSIZE_16M)
+ if (msl->hugepage_sz == RTE_PGSIZE_16M)
hugepage_16MB_avail = 1;
- if (ms[i].hugepage_sz == RTE_PGSIZE_16G)
+ if (msl->hugepage_sz == RTE_PGSIZE_16G)
hugepage_16GB_avail = 1;
}
/* Display the availability of 2MB ,1GB, 16MB, 16GB pages */
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:45 UTC
Permalink
This set of changes enables rte_malloc to allocate and free memory
as needed. First, malloc checks if there is enough memory already
allocated to satisfy the user's request. If there isn't, we try to
allocate more memory. The reverse happens with free - we free an
element, check its size (including free element merging due to
adjacency) and see if it's bigger than the hugepage size and if
its start and end span a hugepage or more. If so, we remove the
area from the malloc heap (adjusting element lengths where
appropriate) and deallocate the page.
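
As a rough sketch, the free-side check amounts to something like the
following (illustrative only - the real logic is in malloc_heap_free()
in this patch; can_release_pages() is a made-up helper name):

/* can this free element give whole pages back to the system? */
static int
can_release_pages(struct malloc_elem *elem, struct rte_memseg_list *msl)
{
    void *start = elem;
    void *end = RTE_PTR_ADD(start, elem->size);
    /* only pages fully covered by the free element can be released */
    void *aligned_start = RTE_PTR_ALIGN_CEIL(start, msl->hugepage_sz);
    void *aligned_end = RTE_PTR_ALIGN_FLOOR(end, msl->hugepage_sz);
    size_t aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start);

    return aligned_len >= msl->hugepage_sz;
}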

For legacy mode, runtime alloc/free of pages is disabled.

It is worth noting that memseg lists are sorted by page size, and
that we try our best to satisfy the user's request. That is, if
the user requests an element from 2MB page memory, we will check
whether we can satisfy that request from existing memory; if not,
we try to allocate more 2MB pages. If that fails and the user also
specified a "size is hint" flag, we then check other page sizes
and try to allocate from there. If that fails too, then, depending
on flags, we may try allocating from other sockets. In other
words, we try our best to give the user what they asked for, but
going to other sockets is a last resort - first we try to allocate
more memory on the same socket.
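
In pseudocode, that fallback order looks roughly like this (a
simplified sketch of heap_alloc_on_socket()/malloc_heap_alloc() from
this patch; heap_of() is a made-up stand-in for the per-socket heap
lookup):

static void *
alloc_on_socket_sketch(const char *type, size_t size, int socket,
        unsigned int flags, size_t align, size_t bound)
{
    unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY;
    void *ret;

    /* 1. try memory we already have, with strict page-size flags */
    ret = heap_alloc(heap_of(socket), type, size, size_flags,
            align, bound);
    if (ret != NULL)
        return ret;

    /* 2. try to expand the heap: requested page sizes first, smallest
     * first, then - only if SIZE_HINT_ONLY was set - other page sizes
     */
    if (alloc_mem_on_socket(size, socket, flags, align, bound) == 0)
        return heap_alloc(heap_of(socket), type, size, flags,
                align, bound);

    /* 3. malloc_heap_alloc() falls back to other sockets only when
     * socket was SOCKET_ID_ANY - off-socket is the last resort
     */
    return NULL;
}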

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/eal_common_memzone.c | 23 +-
lib/librte_eal/common/malloc_elem.c | 85 ++++++++
lib/librte_eal/common/malloc_elem.h | 3 +
lib/librte_eal/common/malloc_heap.c | 332 ++++++++++++++++++++++++++++-
lib/librte_eal/common/malloc_heap.h | 4 +-
lib/librte_eal/common/rte_malloc.c | 31 +--
6 files changed, 416 insertions(+), 62 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index ed36174..718dee8 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -103,7 +103,6 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
struct rte_memzone *mz;
struct rte_mem_config *mcfg;
size_t requested_len;
- int socket, i;

/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
@@ -181,27 +180,9 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
}
}

- if (socket_id == SOCKET_ID_ANY)
- socket = malloc_get_numa_socket();
- else
- socket = socket_id;
-
/* allocate memory on heap */
- void *mz_addr = malloc_heap_alloc(&mcfg->malloc_heaps[socket], NULL,
- requested_len, flags, align, bound);
-
- if ((mz_addr == NULL) && (socket_id == SOCKET_ID_ANY)) {
- /* try other heaps */
- for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
- if (socket == i)
- continue;
-
- mz_addr = malloc_heap_alloc(&mcfg->malloc_heaps[i],
- NULL, requested_len, flags, align, bound);
- if (mz_addr != NULL)
- break;
- }
- }
+ void *mz_addr = malloc_heap_alloc(NULL, requested_len, socket_id, flags,
+ align, bound);

if (mz_addr == NULL) {
rte_errno = ENOMEM;
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 701bffd..eabad66 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -400,6 +400,91 @@ malloc_elem_free(struct malloc_elem *elem)
return elem;
}

+/* assume all checks were already done */
+void
+malloc_elem_hide_region(struct malloc_elem *elem, void *start, size_t len)
+{
+ size_t len_before, len_after;
+ struct malloc_elem *prev, *next;
+ void *end, *elem_end;
+
+ end = RTE_PTR_ADD(start, len);
+ elem_end = RTE_PTR_ADD(elem, elem->size);
+ len_before = RTE_PTR_DIFF(start, elem);
+ len_after = RTE_PTR_DIFF(elem_end, end);
+
+ prev = elem->prev;
+ next = elem->next;
+
+ if (len_after >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
+ /* split after */
+ struct malloc_elem *split_after = end;
+
+ split_elem(elem, split_after);
+
+ next = split_after;
+
+ malloc_elem_free_list_insert(split_after);
+ } else if (len_after >= MALLOC_ELEM_HEADER_LEN) {
+ struct malloc_elem *pad_elem = end;
+
+ /* shrink current element */
+ elem->size -= len_after;
+ memset(pad_elem, 0, sizeof(*pad_elem));
+
+ /* copy next element's data to our pad */
+ memcpy(pad_elem, next, sizeof(*pad_elem));
+
+ /* pad next element */
+ next->state = ELEM_PAD;
+ next->pad = len_after;
+
+ /* next element is busy, would've been merged otherwise */
+ pad_elem->pad = len_after;
+ pad_elem->size += len_after;
+
+ /* adjust pointers to point to our new pad */
+ pad_elem->next->prev = pad_elem;
+ elem->next = pad_elem;
+ } else if (len_after > 0) {
+ rte_panic("Unaligned element, heap is probably corrupt\n");
+ }
+
+ if (len_before >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
+ /* split before */
+ struct malloc_elem *split_before = start;
+
+ split_elem(elem, split_before);
+
+ prev = elem;
+ elem = split_before;
+
+ malloc_elem_free_list_insert(prev);
+ } else if (len_before > 0) {
+ /*
+ * unlike with elements after current, here we don't need to
+ * pad elements, but rather just increase the size of previous
+ * element, copy the old header and set up the trailer.
+ */
+ void *trailer = RTE_PTR_ADD(prev,
+ prev->size - MALLOC_ELEM_TRAILER_LEN);
+ struct malloc_elem *new_elem = start;
+
+ memcpy(new_elem, elem, sizeof(*elem));
+ new_elem->size -= len_before;
+
+ prev->size += len_before;
+ set_trailer(prev);
+
+ elem = new_elem;
+
+ /* erase old trailer */
+ memset(trailer, 0, MALLOC_ELEM_TRAILER_LEN);
+ }
+
+ remove_elem(elem);
+}
+
/*
* attempt to resize a malloc_elem by expanding into any free space
* immediately after it in memory.
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 388c16f..6d979d2 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -152,6 +152,9 @@ int
malloc_elem_resize(struct malloc_elem *elem, size_t size);

void
+malloc_elem_hide_region(struct malloc_elem *elem, void *start, size_t len);
+
+void
malloc_elem_free_list_remove(struct malloc_elem *elem);

/*
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 058ad75..87dc9ad 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -20,8 +20,10 @@
#include <rte_spinlock.h>
#include <rte_memcpy.h>
#include <rte_atomic.h>
+#include <rte_fbarray.h>

#include "eal_internal_cfg.h"
+#include "eal_memalloc.h"
#include "malloc_elem.h"
#include "malloc_heap.h"

@@ -123,48 +125,356 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
* scan fails. Once the new memseg is added, it re-scans and should return
* the new element after releasing the lock.
*/
-void *
-malloc_heap_alloc(struct malloc_heap *heap,
- const char *type __attribute__((unused)), size_t size, unsigned flags,
- size_t align, size_t bound)
+static void *
+heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,
+ unsigned int flags, size_t align, size_t bound)
{
struct malloc_elem *elem;

size = RTE_CACHE_LINE_ROUNDUP(size);
align = RTE_CACHE_LINE_ROUNDUP(align);

- rte_spinlock_lock(&heap->lock);
-
elem = find_suitable_element(heap, size, flags, align, bound);
if (elem != NULL) {
elem = malloc_elem_alloc(elem, size, align, bound);
+
/* increase heap's count of allocated elements */
heap->alloc_count++;
}
- rte_spinlock_unlock(&heap->lock);

return elem == NULL ? NULL : (void *)(&elem[1]);
}

+static int
+try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
+ int socket, unsigned int flags, size_t align, size_t bound)
+{
+ struct rte_memseg_list *msl;
+ struct rte_memseg **ms;
+ struct malloc_elem *elem;
+ size_t map_len;
+ int i, n_pages, allocd_pages;
+ void *ret, *map_addr;
+
+ align = RTE_MAX(align, MALLOC_ELEM_HEADER_LEN);
+ map_len = RTE_ALIGN_CEIL(align + elt_size + MALLOC_ELEM_TRAILER_LEN,
+ pg_sz);
+
+ n_pages = map_len / pg_sz;
+
+ /* we can't know in advance how many pages we'll need, so malloc */
+ ms = malloc(sizeof(*ms) * n_pages);
+
+ allocd_pages = eal_memalloc_alloc_page_bulk(ms, n_pages, pg_sz, socket,
+ true);
+
+ /* make sure we've allocated our pages... */
+ if (allocd_pages != n_pages)
+ goto free_ms;
+
+ map_addr = ms[0]->addr;
+ msl = rte_mem_virt2memseg_list(map_addr);
+
+ /* add newly minted memsegs to malloc heap */
+ elem = malloc_heap_add_memory(heap, msl, map_addr, map_len);
+
+ /* try once more, as now we have allocated new memory */
+ ret = find_suitable_element(heap, elt_size, flags, align, bound);
+
+ if (ret == NULL)
+ goto free_elem;
+
+ RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
+ socket, map_len >> 20ULL);
+
+ free(ms);
+
+ return 0;
+
+free_elem:
+ malloc_elem_free_list_remove(elem);
+ malloc_elem_hide_region(elem, map_addr, map_len);
+ heap->total_size -= map_len;
+
+ for (i = 0; i < n_pages; i++)
+ eal_memalloc_free_page(ms[i]);
+free_ms:
+ free(ms);
+
+ return -1;
+}
+
+static int
+compare_pagesz(const void *a, const void *b)
+{
+ const struct rte_memseg_list * const*mpa = a;
+ const struct rte_memseg_list * const*mpb = b;
+ const struct rte_memseg_list *msla = *mpa;
+ const struct rte_memseg_list *mslb = *mpb;
+ uint64_t pg_sz_a = msla->hugepage_sz;
+ uint64_t pg_sz_b = mslb->hugepage_sz;
+
+ if (pg_sz_a < pg_sz_b)
+ return -1;
+ if (pg_sz_a > pg_sz_b)
+ return 1;
+ return 0;
+}
+
+static int
+alloc_mem_on_socket(size_t size, int socket, unsigned int flags, size_t align,
+ size_t bound)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
+ struct rte_memseg_list *requested_msls[RTE_MAX_MEMSEG_LISTS];
+ struct rte_memseg_list *other_msls[RTE_MAX_MEMSEG_LISTS];
+ uint64_t requested_pg_sz[RTE_MAX_MEMSEG_LISTS];
+ uint64_t other_pg_sz[RTE_MAX_MEMSEG_LISTS];
+ uint64_t prev_pg_sz;
+ int i, n_other_msls, n_other_pg_sz, n_requested_msls, n_requested_pg_sz;
+ bool size_hint = (flags & RTE_MEMZONE_SIZE_HINT_ONLY) > 0;
+ unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY;
+ void *ret;
+
+ memset(requested_msls, 0, sizeof(requested_msls));
+ memset(other_msls, 0, sizeof(other_msls));
+ memset(requested_pg_sz, 0, sizeof(requested_pg_sz));
+ memset(other_pg_sz, 0, sizeof(other_pg_sz));
+
+ /*
+ * go through memseg list and take note of all the page sizes available,
+ * and if any of them were specifically requested by the user.
+ */
+ n_requested_msls = 0;
+ n_other_msls = 0;
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+
+ if (msl->socket_id != socket)
+ continue;
+
+ if (msl->base_va == NULL)
+ continue;
+
+ /* if pages of specific size were requested */
+ if (size_flags != 0 && check_hugepage_sz(size_flags,
+ msl->hugepage_sz))
+ requested_msls[n_requested_msls++] = msl;
+ else if (size_flags == 0 || size_hint)
+ other_msls[n_other_msls++] = msl;
+ }
+
+ /* sort the lists, smallest first */
+ qsort(requested_msls, n_requested_msls, sizeof(requested_msls[0]),
+ compare_pagesz);
+ qsort(other_msls, n_other_msls, sizeof(other_msls[0]),
+ compare_pagesz);
+
+ /* now, extract page sizes we are supposed to try */
+ prev_pg_sz = 0;
+ n_requested_pg_sz = 0;
+ for (i = 0; i < n_requested_msls; i++) {
+ uint64_t pg_sz = requested_msls[i]->hugepage_sz;
+
+ if (prev_pg_sz != pg_sz) {
+ requested_pg_sz[n_requested_pg_sz++] = pg_sz;
+ prev_pg_sz = pg_sz;
+ }
+ }
+ prev_pg_sz = 0;
+ n_other_pg_sz = 0;
+ for (i = 0; i < n_other_msls; i++) {
+ uint64_t pg_sz = other_msls[i]->hugepage_sz;
+
+ if (prev_pg_sz != pg_sz) {
+ other_pg_sz[n_other_pg_sz++] = pg_sz;
+ prev_pg_sz = pg_sz;
+ }
+ }
+
+ /* finally, try allocating memory of specified page sizes, starting from
+ * the smallest sizes
+ */
+ for (i = 0; i < n_requested_pg_sz; i++) {
+ uint64_t pg_sz = requested_pg_sz[i];
+
+ /*
+ * do not pass the size hint here, as user expects other page
+ * sizes first, before resorting to best effort allocation.
+ */
+ if (!try_expand_heap(heap, pg_sz, size, socket, size_flags,
+ align, bound))
+ return 0;
+ }
+ if (n_other_pg_sz == 0)
+ return -1;
+
+ /* now, check if we can reserve anything with size hint */
+ ret = find_suitable_element(heap, size, flags, align, bound);
+ if (ret != NULL)
+ return 0;
+
+ /*
+ * we still couldn't reserve memory, so try expanding heap with other
+ * page sizes, if there are any
+ */
+ for (i = 0; i < n_other_pg_sz; i++) {
+ uint64_t pg_sz = other_pg_sz[i];
+
+ if (!try_expand_heap(heap, pg_sz, size, socket, flags,
+ align, bound))
+ return 0;
+ }
+ return -1;
+}
+
+/* this will try lower page sizes first */
+static void *
+heap_alloc_on_socket(const char *type, size_t size, int socket,
+ unsigned int flags, size_t align, size_t bound)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
+ unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY;
+ void *ret;
+
+ rte_spinlock_lock(&(heap->lock));
+
+ align = align == 0 ? 1 : align;
+
+ /* for legacy mode, try once and with all flags */
+ if (internal_config.legacy_mem) {
+ ret = heap_alloc(heap, type, size, flags, align, bound);
+ goto alloc_unlock;
+ }
+
+ /*
+ * we do not pass the size hint here, because even if allocation fails,
+ * we may still be able to allocate memory from appropriate page sizes,
+ * we just need to request more memory first.
+ */
+ ret = heap_alloc(heap, type, size, size_flags, align, bound);
+ if (ret != NULL)
+ goto alloc_unlock;
+
+ if (!alloc_mem_on_socket(size, socket, flags, align, bound)) {
+ ret = heap_alloc(heap, type, size, flags, align, bound);
+
+ /* this should have succeeded */
+ if (ret == NULL)
+ rte_panic("Error allocating from heap\n");
+ }
+alloc_unlock:
+ rte_spinlock_unlock(&(heap->lock));
+ return ret;
+}
+
+void *
+malloc_heap_alloc(const char *type, size_t size, int socket_arg,
+ unsigned int flags, size_t align, size_t bound)
+{
+ int socket, i;
+ void *ret;
+
+ /* return NULL if size is 0 or alignment is not power-of-2 */
+ if (size == 0 || (align && !rte_is_power_of_2(align)))
+ return NULL;
+
+ if (!rte_eal_has_hugepages())
+ socket_arg = SOCKET_ID_ANY;
+
+ if (socket_arg == SOCKET_ID_ANY)
+ socket = malloc_get_numa_socket();
+ else
+ socket = socket_arg;
+
+ /* Check socket parameter */
+ if (socket >= RTE_MAX_NUMA_NODES)
+ return NULL;
+
+ ret = heap_alloc_on_socket(type, size, socket, flags, align, bound);
+ if (ret != NULL || socket_arg != SOCKET_ID_ANY)
+ return ret;
+
+ /* try other heaps */
+ for (i = 0; i < (int) rte_num_sockets(); i++) {
+ if (i == socket)
+ continue;
+ ret = heap_alloc_on_socket(type, size, i, flags,
+ align, bound);
+ if (ret != NULL)
+ return ret;
+ }
+ return NULL;
+}
+
int
malloc_heap_free(struct malloc_elem *elem)
{
struct malloc_heap *heap;
- struct malloc_elem *ret;
+ void *start, *aligned_start, *end, *aligned_end;
+ size_t len, aligned_len;
+ struct rte_memseg_list *msl;
+ int n_pages, page_idx, max_page_idx, ret;

if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
return -1;

/* elem may be merged with previous element, so keep heap address */
heap = elem->heap;
+ msl = elem->msl;

rte_spinlock_lock(&(heap->lock));

- ret = malloc_elem_free(elem);
+ elem = malloc_elem_free(elem);

- rte_spinlock_unlock(&(heap->lock));
+ /* anything after this is a bonus */
+ ret = 0;
+
+ /* ...of which we can't avail if we are in legacy mode */
+ if (internal_config.legacy_mem)
+ goto free_unlock;
+
+ /* check if we can free any memory back to the system */
+ if (elem->size < msl->hugepage_sz)
+ goto free_unlock;

- return ret != NULL ? 0 : -1;
+ /* probably, but let's make sure, as we may not be using up full page */
+ start = elem;
+ len = elem->size;
+ aligned_start = RTE_PTR_ALIGN_CEIL(start, msl->hugepage_sz);
+ end = RTE_PTR_ADD(elem, len);
+ aligned_end = RTE_PTR_ALIGN_FLOOR(end, msl->hugepage_sz);
+
+ aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start);
+
+ /* can't free anything */
+ if (aligned_len < msl->hugepage_sz)
+ goto free_unlock;
+
+ malloc_elem_free_list_remove(elem);
+
+ malloc_elem_hide_region(elem, (void *) aligned_start, aligned_len);
+
+ /* we don't really care if we fail to deallocate memory */
+ n_pages = aligned_len / msl->hugepage_sz;
+ page_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) / msl->hugepage_sz;
+ max_page_idx = page_idx + n_pages;
+
+ for (; page_idx < max_page_idx; page_idx++) {
+ struct rte_memseg *ms;
+
+ ms = rte_fbarray_get(&msl->memseg_arr, page_idx);
+ eal_memalloc_free_page(ms);
+ heap->total_size -= msl->hugepage_sz;
+ }
+
+ RTE_LOG(DEBUG, EAL, "Heap on socket %d was shrunk by %zdMB\n",
+ msl->socket_id, aligned_len >> 20ULL);
+free_unlock:
+ rte_spinlock_unlock(&(heap->lock));
+ return ret;
}

int
diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h
index bb28422..292d578 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -24,8 +24,8 @@ malloc_get_numa_socket(void)
}

void *
-malloc_heap_alloc(struct malloc_heap *heap, const char *type, size_t size,
- unsigned flags, size_t align, size_t bound);
+malloc_heap_alloc(const char *type, size_t size, int socket, unsigned int flags,
+ size_t align, size_t bound);

int
malloc_heap_free(struct malloc_elem *elem);
diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index bd7e757..b0fe11c 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -39,10 +39,6 @@ void rte_free(void *addr)
void *
rte_malloc_socket(const char *type, size_t size, unsigned align, int socket_arg)
{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- int socket, i;
- void *ret;
-
/* return NULL if size is 0 or alignment is not power-of-2 */
if (size == 0 || (align && !rte_is_power_of_2(align)))
return NULL;
@@ -50,33 +46,12 @@ rte_malloc_socket(const char *type, size_t size, unsigned align, int socket_arg)
if (!rte_eal_has_hugepages())
socket_arg = SOCKET_ID_ANY;

- if (socket_arg == SOCKET_ID_ANY)
- socket = malloc_get_numa_socket();
- else
- socket = socket_arg;
-
/* Check socket parameter */
- if (socket >= RTE_MAX_NUMA_NODES)
+ if (socket_arg >= RTE_MAX_NUMA_NODES)
return NULL;

- ret = malloc_heap_alloc(&mcfg->malloc_heaps[socket], type,
- size, 0, align == 0 ? 1 : align, 0);
- if (ret != NULL || socket_arg != SOCKET_ID_ANY)
- return ret;
-
- /* try other heaps */
- for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
- /* we already tried this one */
- if (i == socket)
- continue;
-
- ret = malloc_heap_alloc(&mcfg->malloc_heaps[i], type,
- size, 0, align == 0 ? 1 : align, 0);
- if (ret != NULL)
- return ret;
- }
-
- return NULL;
+ return malloc_heap_alloc(type, size, socket_arg, 0,
+ align == 0 ? 1 : align, 0);
}

/*
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:49 UTC
Permalink
This adds a new set of _contig APIs to rte_memzone.
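
For illustration, a caller would use the new APIs just like the
existing reserve functions (the zone name and size below are made up):

#include <rte_lcore.h>
#include <rte_memzone.h>

/* reserve 1MB of IOVA-contiguous memory on the caller's socket;
 * rte_memzone_reserve_aligned_contig()/_bounded_contig() work the
 * same way but additionally take alignment/boundary arguments
 */
static const struct rte_memzone *
reserve_dma_zone(void)
{
    return rte_memzone_reserve_contig("example_dma_zone", 1 << 20,
            rte_socket_id(), 0);
}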

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/eal_common_memzone.c | 44 ++++++++
lib/librte_eal/common/include/rte_memzone.h | 154 ++++++++++++++++++++++++++++
2 files changed, 198 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index 75c7dd9..8c9aa28 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -170,6 +170,12 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
socket_id = SOCKET_ID_ANY;

if (len == 0) {
+ /* len == 0 is only allowed for non-contiguous zones */
+ if (contig) {
+ RTE_LOG(DEBUG, EAL, "Reserving zero-length contiguous memzones is not supported\n");
+ rte_errno = EINVAL;
+ return NULL;
+ }
if (bound != 0)
requested_len = bound;
else {
@@ -251,6 +257,19 @@ rte_memzone_reserve_bounded(const char *name, size_t len, int socket_id,

/*
* Return a pointer to a correctly filled memzone descriptor (with a
+ * specified alignment and boundary). If the allocation cannot be done,
+ * return NULL.
+ */
+const struct rte_memzone *
+rte_memzone_reserve_bounded_contig(const char *name, size_t len, int socket_id,
+ unsigned int flags, unsigned int align, unsigned int bound)
+{
+ return rte_memzone_reserve_thread_safe(name, len, socket_id, flags,
+ align, bound, true);
+}
+
+/*
+ * Return a pointer to a correctly filled memzone descriptor (with a
* specified alignment). If the allocation cannot be done, return NULL.
*/
const struct rte_memzone *
@@ -262,6 +281,18 @@ rte_memzone_reserve_aligned(const char *name, size_t len, int socket_id,
}

/*
+ * Return a pointer to a correctly filled memzone descriptor (with a
+ * specified alignment). If the allocation cannot be done, return NULL.
+ */
+const struct rte_memzone *
+rte_memzone_reserve_aligned_contig(const char *name, size_t len, int socket_id,
+ unsigned int flags, unsigned int align)
+{
+ return rte_memzone_reserve_thread_safe(name, len, socket_id, flags,
+ align, 0, true);
+}
+
+/*
* Return a pointer to a correctly filled memzone descriptor. If the
* allocation cannot be done, return NULL.
*/
@@ -274,6 +305,19 @@ rte_memzone_reserve(const char *name, size_t len, int socket_id,
false);
}

+/*
+ * Return a pointer to a correctly filled memzone descriptor. If the
+ * allocation cannot be done, return NULL.
+ */
+const struct rte_memzone *
+rte_memzone_reserve_contig(const char *name, size_t len, int socket_id,
+ unsigned int flags)
+{
+ return rte_memzone_reserve_thread_safe(name, len, socket_id,
+ flags, RTE_CACHE_LINE_SIZE, 0,
+ true);
+}
+
int
rte_memzone_free(const struct rte_memzone *mz)
{
diff --git a/lib/librte_eal/common/include/rte_memzone.h b/lib/librte_eal/common/include/rte_memzone.h
index a69f068..5f1293f 100644
--- a/lib/librte_eal/common/include/rte_memzone.h
+++ b/lib/librte_eal/common/include/rte_memzone.h
@@ -227,6 +227,160 @@ const struct rte_memzone *rte_memzone_reserve_bounded(const char *name,
unsigned flags, unsigned align, unsigned bound);

/**
+ * Reserve an IOVA-contiguous portion of physical memory.
+ *
+ * This function reserves some IOVA-contiguous memory and returns a pointer to a
+ * correctly filled memzone descriptor. If the allocation cannot be
+ * done, return NULL.
+ *
+ * @param name
+ * The name of the memzone. If it already exists, the function will
+ * fail and return NULL.
+ * @param len
+ * The size of the memory to be reserved.
+ * @param socket_id
+ * The socket identifier in the case of
+ * NUMA. The value can be SOCKET_ID_ANY if there is no NUMA
+ * constraint for the reserved zone.
+ * @param flags
+ * The flags parameter is used to request memzones to be
+ * taken from specifically sized hugepages.
+ * - RTE_MEMZONE_2MB - Reserved from 2MB pages
+ * - RTE_MEMZONE_1GB - Reserved from 1GB pages
+ * - RTE_MEMZONE_16MB - Reserved from 16MB pages
+ * - RTE_MEMZONE_16GB - Reserved from 16GB pages
+ * - RTE_MEMZONE_256KB - Reserved from 256KB pages
+ * - RTE_MEMZONE_256MB - Reserved from 256MB pages
+ * - RTE_MEMZONE_512MB - Reserved from 512MB pages
+ * - RTE_MEMZONE_4GB - Reserved from 4GB pages
+ * - RTE_MEMZONE_SIZE_HINT_ONLY - Allow alternative page size to be used if
+ * the requested page size is unavailable.
+ * If this flag is not set, the function
+ * will return error on an unavailable size
+ * request.
+ * @return
+ * A pointer to a correctly-filled read-only memzone descriptor, or NULL
+ * on error.
+ * On error case, rte_errno will be set appropriately:
+ * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure
+ * - E_RTE_SECONDARY - function was called from a secondary process instance
+ * - ENOSPC - the maximum number of memzones has already been allocated
+ * - EEXIST - a memzone with the same name already exists
+ * - ENOMEM - no appropriate memory area found in which to create memzone
+ * - EINVAL - invalid parameters
+ */
+const struct rte_memzone *rte_memzone_reserve_contig(const char *name,
+ size_t len, int socket_id, unsigned int flags);
+
+/**
+ * Reserve an IOVA-contiguous portion of physical memory with alignment on a
+ * specified boundary.
+ *
+ * This function reserves some IOVA-contiguous memory with alignment on a
+ * specified boundary, and returns a pointer to a correctly filled memzone
+ * descriptor. If the allocation cannot be done or if the alignment
+ * is not a power of 2, returns NULL.
+ *
+ * @param name
+ * The name of the memzone. If it already exists, the function will
+ * fail and return NULL.
+ * @param len
+ * The size of the memory to be reserved.
+ * @param socket_id
+ * The socket identifier in the case of
+ * NUMA. The value can be SOCKET_ID_ANY if there is no NUMA
+ * constraint for the reserved zone.
+ * @param flags
+ * The flags parameter is used to request memzones to be
+ * taken from specifically sized hugepages.
+ * - RTE_MEMZONE_2MB - Reserved from 2MB pages
+ * - RTE_MEMZONE_1GB - Reserved from 1GB pages
+ * - RTE_MEMZONE_16MB - Reserved from 16MB pages
+ * - RTE_MEMZONE_16GB - Reserved from 16GB pages
+ * - RTE_MEMZONE_256KB - Reserved from 256KB pages
+ * - RTE_MEMZONE_256MB - Reserved from 256MB pages
+ * - RTE_MEMZONE_512MB - Reserved from 512MB pages
+ * - RTE_MEMZONE_4GB - Reserved from 4GB pages
+ * - RTE_MEMZONE_SIZE_HINT_ONLY - Allow alternative page size to be used if
+ * the requested page size is unavailable.
+ * If this flag is not set, the function
+ * will return error on an unavailable size
+ * request.
+ * @param align
+ * Alignment for resulting memzone. Must be a power of 2.
+ * @return
+ * A pointer to a correctly-filled read-only memzone descriptor, or NULL
+ * on error.
+ * On error case, rte_errno will be set appropriately:
+ * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure
+ * - E_RTE_SECONDARY - function was called from a secondary process instance
+ * - ENOSPC - the maximum number of memzones has already been allocated
+ * - EEXIST - a memzone with the same name already exists
+ * - ENOMEM - no appropriate memory area found in which to create memzone
+ * - EINVAL - invalid parameters
+ */
+const struct rte_memzone *rte_memzone_reserve_aligned_contig(const char *name,
+ size_t len, int socket_id, unsigned int flags,
+ unsigned int align);
+
+/**
+ * Reserve an IOVA-contiguous portion of physical memory with specified
+ * alignment and boundary.
+ *
+ * This function reserves some IOVA-contiguous memory with specified alignment
+ * and boundary, and returns a pointer to a correctly filled memzone
+ * descriptor. If the allocation cannot be done or if the alignment
+ * or boundary are not a power of 2, returns NULL.
+ * The memory buffer is reserved in such a way that it won't cross the
+ * specified boundary. This implies that the requested length should be
+ * less than or equal to the boundary.
+ *
+ * @param name
+ * The name of the memzone. If it already exists, the function will
+ * fail and return NULL.
+ * @param len
+ * The size of the memory to be reserved.
+ * @param socket_id
+ * The socket identifier in the case of
+ * NUMA. The value can be SOCKET_ID_ANY if there is no NUMA
+ * constraint for the reserved zone.
+ * @param flags
+ * The flags parameter is used to request memzones to be
+ * taken from specifically sized hugepages.
+ * - RTE_MEMZONE_2MB - Reserved from 2MB pages
+ * - RTE_MEMZONE_1GB - Reserved from 1GB pages
+ * - RTE_MEMZONE_16MB - Reserved from 16MB pages
+ * - RTE_MEMZONE_16GB - Reserved from 16GB pages
+ * - RTE_MEMZONE_256KB - Reserved from 256KB pages
+ * - RTE_MEMZONE_256MB - Reserved from 256MB pages
+ * - RTE_MEMZONE_512MB - Reserved from 512MB pages
+ * - RTE_MEMZONE_4GB - Reserved from 4GB pages
+ * - RTE_MEMZONE_SIZE_HINT_ONLY - Allow alternative page size to be used if
+ * the requested page size is unavailable.
+ * If this flag is not set, the function
+ * will return error on an unavailable size
+ * request.
+ * @param align
+ * Alignment for resulting memzone. Must be a power of 2.
+ * @param bound
+ * Boundary for resulting memzone. Must be a power of 2 or zero.
+ * Zero value implies no boundary condition.
+ * @return
+ * A pointer to a correctly-filled read-only memzone descriptor, or NULL
+ * on error.
+ * On error case, rte_errno will be set appropriately:
+ * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure
+ * - E_RTE_SECONDARY - function was called from a secondary process instance
+ * - ENOSPC - the maximum number of memzones has already been allocated
+ * - EEXIST - a memzone with the same name already exists
+ * - ENOMEM - no appropriate memory area found in which to create memzone
+ * - EINVAL - invalid parameters
+ */
+const struct rte_memzone *rte_memzone_reserve_bounded_contig(const char *name,
+ size_t len, int socket_id, unsigned int flags,
+ unsigned int align, unsigned int bound);
+
+/**
* Free a memzone.
*
* @param mz
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:51 UTC
Permalink
If a user has specified that the zone should have contiguous memory,
use the new _contig allocation APIs instead of the normal ones.
Otherwise, account for the fact that, unless we're in IOVA_AS_VA
mode, we cannot guarantee that the pages will be physically
contiguous, so we calculate the memzone size and alignment as if
we were getting the smallest page size available.
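
Condensed from rte_mempool_populate_default() below, the page size
and alignment decision is roughly:

/* condensed sketch - see the patch below for the full reasoning */
if (no_contig || force_contig || rte_eal_iova_mode() == RTE_IOVA_VA) {
    /* contiguity either doesn't matter or is guaranteed */
    pg_sz = 0;
    pg_shift = 0;
    align = RTE_CACHE_LINE_SIZE;
} else if (rte_eal_has_hugepages()) {
    /* assume the worst case: smallest hugepage size in the system */
    pg_sz = get_min_page_size();
    pg_shift = rte_bsf32(pg_sz);
    align = pg_sz;
} else {
    pg_sz = getpagesize();
    pg_shift = rte_bsf32(pg_sz);
    align = pg_sz;
}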

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_mempool/rte_mempool.c | 87 +++++++++++++++++++++++++++++++++++-----
1 file changed, 78 insertions(+), 9 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 54f7f4b..5c4d3fd 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -98,6 +98,27 @@ static unsigned optimize_object_size(unsigned obj_size)
return new_obj_size * RTE_MEMPOOL_ALIGN;
}

+static size_t
+get_min_page_size(void)
+{
+ const struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ int i;
+ size_t min_pagesz = SIZE_MAX;
+
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ const struct rte_memseg_list *msl = &mcfg->memsegs[i];
+
+ if (msl->base_va == NULL)
+ continue;
+
+ if (msl->hugepage_sz < min_pagesz)
+ min_pagesz = msl->hugepage_sz;
+ }
+
+ return min_pagesz == SIZE_MAX ? (size_t) getpagesize() : min_pagesz;
+}
+
static void
mempool_add_elem(struct rte_mempool *mp, void *obj, rte_iova_t iova)
{
@@ -549,6 +570,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
unsigned mz_id, n;
unsigned int mp_flags;
int ret;
+ bool force_contig, no_contig;

/* mempool must not be populated */
if (mp->nb_mem_chunks != 0)
@@ -563,10 +585,46 @@ rte_mempool_populate_default(struct rte_mempool *mp)
/* update mempool capabilities */
mp->flags |= mp_flags;

- if (rte_eal_has_hugepages()) {
- pg_shift = 0; /* not needed, zone is physically contiguous */
+ no_contig = mp->flags & MEMPOOL_F_NO_PHYS_CONTIG;
+ force_contig = mp->flags & MEMPOOL_F_CAPA_PHYS_CONTIG;
+
+ /*
+ * there are several considerations for page size and page shift here.
+ *
+ * if we don't need our mempools to have physically contiguous objects,
+ * then just set page shift and page size to 0, because the user has
+ * indicated that there's no need to care about anything.
+ *
+ * if we do need contiguous objects, there is also an option to reserve
+ * the entire mempool memory as one contiguous block of memory, in
+ * which case the page shift and alignment wouldn't matter as well.
+ *
+ * if we require contiguous objects, but not necessarily the entire
+ * mempool reserved space to be contiguous, then there are two options.
+ *
+ * if our IO addresses are virtual, not actual physical (IOVA as VA
+ * case), then no page shift needed - our memory allocation will give us
+ * contiguous physical memory as far as the hardware is concerned, so
+ * act as if we're getting contiguous memory.
+ *
+ * if our IO addresses are physical, we may get memory from bigger
+ * pages, or we might get memory from smaller pages, and how much of it
+ * we require depends on whether we want bigger or smaller pages.
+ * However, requesting each and every memory size is too much work, so
+ * what we'll do instead is walk through the page sizes available, pick
+ * the smallest one and set up page shift to match that one. We will be
+ * wasting some space this way, but it's much nicer than looping around
+ * trying to reserve each and every page size.
+ */
+
+ if (no_contig || force_contig || rte_eal_iova_mode() == RTE_IOVA_VA) {
pg_sz = 0;
+ pg_shift = 0;
align = RTE_CACHE_LINE_SIZE;
+ } else if (rte_eal_has_hugepages()) {
+ pg_sz = get_min_page_size();
+ pg_shift = rte_bsf32(pg_sz);
+ align = pg_sz;
} else {
pg_sz = getpagesize();
pg_shift = rte_bsf32(pg_sz);
@@ -585,23 +643,34 @@ rte_mempool_populate_default(struct rte_mempool *mp)
goto fail;
}

- mz = rte_memzone_reserve_aligned(mz_name, size,
- mp->socket_id, mz_flags, align);
- /* not enough memory, retry with the biggest zone we have */
- if (mz == NULL)
- mz = rte_memzone_reserve_aligned(mz_name, 0,
+ if (force_contig) {
+ /*
+ * if contiguous memory for entire mempool memory was
+ * requested, don't try reserving again if we fail.
+ */
+ mz = rte_memzone_reserve_aligned_contig(mz_name, size,
+ mp->socket_id, mz_flags, align);
+ } else {
+ mz = rte_memzone_reserve_aligned(mz_name, size,
mp->socket_id, mz_flags, align);
+ /* not enough memory, retry with the biggest zone we
+ * have
+ */
+ if (mz == NULL)
+ mz = rte_memzone_reserve_aligned(mz_name, 0,
+ mp->socket_id, mz_flags, align);
+ }
if (mz == NULL) {
ret = -rte_errno;
goto fail;
}

- if (mp->flags & MEMPOOL_F_NO_PHYS_CONTIG)
+ if (no_contig)
iova = RTE_BAD_IOVA;
else
iova = mz->iova;

- if (rte_eal_has_hugepages())
+ if (rte_eal_has_hugepages() && force_contig)
ret = rte_mempool_populate_iova(mp, mz->addr,
iova, mz->len,
rte_mempool_memchunk_mz_free,
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:57 UTC
Permalink
Each process will have its own callbacks. Callbacks will indicate
whether it is an allocation or a deallocation that has happened,
and will also provide the start VA address and length of the
allocated block.

Since memory hotplug isn't supported on FreeBSD or in legacy mem
mode, it will not be possible to register callbacks in either
case.
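
For illustration, an application-side callback might look like the
sketch below (the callback body and names are made up; the
registration API is the one added in this patch):

#include <rte_log.h>
#include <rte_memory.h>

/* e.g. a driver could refresh its DMA mappings from here */
static void
example_mem_event_cb(enum rte_mem_event event_type, const void *addr,
        size_t len)
{
    if (event_type == RTE_MEM_EVENT_ALLOC)
        RTE_LOG(DEBUG, USER1, "new memory at %p, len %zu\n",
                addr, len);
    else /* RTE_MEM_EVENT_FREE */
        RTE_LOG(DEBUG, USER1, "memory at %p, len %zu going away\n",
                addr, len);
}

/* at init time:
 * rte_mem_event_register_callback("example_cb", example_mem_event_cb);
 */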

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/eal_common_memalloc.c | 132 ++++++++++++++++++++++++++++
lib/librte_eal/common/eal_common_memory.c | 28 ++++++
lib/librte_eal/common/eal_memalloc.h | 10 +++
lib/librte_eal/common/include/rte_memory.h | 48 ++++++++++
lib/librte_eal/rte_eal_version.map | 2 +
5 files changed, 220 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_memalloc.c b/lib/librte_eal/common/eal_common_memalloc.c
index 62e8c16..4fb55f2 100644
--- a/lib/librte_eal/common/eal_common_memalloc.c
+++ b/lib/librte_eal/common/eal_common_memalloc.c
@@ -2,16 +2,46 @@
* Copyright(c) 2017-2018 Intel Corporation
*/

+#include <string.h>
+
+#include <rte_errno.h>
#include <rte_lcore.h>
#include <rte_fbarray.h>
#include <rte_memzone.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>
+#include <rte_rwlock.h>

#include "eal_private.h"
#include "eal_internal_cfg.h"
#include "eal_memalloc.h"

+struct mem_event_callback_entry {
+ TAILQ_ENTRY(mem_event_callback_entry) next;
+ char name[RTE_MEM_EVENT_CALLBACK_NAME_LEN];
+ rte_mem_event_callback_t clb;
+};
+
+/** Double linked list of actions. */
+TAILQ_HEAD(mem_event_callback_entry_list, mem_event_callback_entry);
+
+static struct mem_event_callback_entry_list callback_list =
+ TAILQ_HEAD_INITIALIZER(callback_list);
+
+static rte_rwlock_t rwlock = RTE_RWLOCK_INITIALIZER;
+
+static struct mem_event_callback_entry *
+find_callback(const char *name)
+{
+ struct mem_event_callback_entry *r;
+
+ TAILQ_FOREACH(r, &callback_list, next) {
+ if (!strcmp(r->name, name))
+ break;
+ }
+ return r;
+}
+
bool
eal_memalloc_is_contig(struct rte_memseg_list *msl, void *start,
size_t len)
@@ -47,3 +77,105 @@ eal_memalloc_is_contig(struct rte_memseg_list *msl, void *start,

return true;
}
+
+int
+eal_memalloc_callback_register(const char *name,
+ rte_mem_event_callback_t clb)
+{
+ struct mem_event_callback_entry *entry;
+ int ret, len;
+ if (name == NULL || clb == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ len = strnlen(name, RTE_MEM_EVENT_CALLBACK_NAME_LEN);
+ if (len == 0) {
+ rte_errno = EINVAL;
+ return -1;
+ } else if (len == RTE_MEM_EVENT_CALLBACK_NAME_LEN) {
+ rte_errno = ENAMETOOLONG;
+ return -1;
+ }
+ rte_rwlock_write_lock(&rwlock);
+
+ entry = find_callback(name);
+ if (entry != NULL) {
+ rte_errno = EEXIST;
+ ret = -1;
+ goto unlock;
+ }
+
+ entry = malloc(sizeof(*entry));
+ if (entry == NULL) {
+ rte_errno = ENOMEM;
+ ret = -1;
+ goto unlock;
+ }
+
+ /* callback successfully created and is valid, add it to the list */
+ entry->clb = clb;
+ snprintf(entry->name, RTE_MEM_EVENT_CALLBACK_NAME_LEN, "%s", name);
+ TAILQ_INSERT_TAIL(&callback_list, entry, next);
+
+ ret = 0;
+
+ RTE_LOG(DEBUG, EAL, "Mem event callback '%s' registered\n", name);
+
+unlock:
+ rte_rwlock_write_unlock(&rwlock);
+ return ret;
+}
+
+int
+eal_memalloc_callback_unregister(const char *name)
+{
+ struct mem_event_callback_entry *entry;
+ int ret, len;
+
+ if (name == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ len = strnlen(name, RTE_MEM_EVENT_CALLBACK_NAME_LEN);
+ if (len == 0) {
+ rte_errno = EINVAL;
+ return -1;
+ } else if (len == RTE_MEM_EVENT_CALLBACK_NAME_LEN) {
+ rte_errno = ENAMETOOLONG;
+ return -1;
+ }
+ rte_rwlock_write_lock(&rwlock);
+
+ entry = find_callback(name);
+ if (entry == NULL) {
+ rte_errno = ENOENT;
+ ret = -1;
+ goto unlock;
+ }
+ TAILQ_REMOVE(&callback_list, entry, next);
+ free(entry);
+
+ ret = 0;
+
+ RTE_LOG(DEBUG, EAL, "Mem event callback '%s' unregistered\n", name);
+
+unlock:
+ rte_rwlock_write_unlock(&rwlock);
+ return ret;
+}
+
+void
+eal_memalloc_notify(enum rte_mem_event event, const void *start, size_t len)
+{
+ struct mem_event_callback_entry *entry;
+
+ rte_rwlock_read_lock(&rwlock);
+
+ TAILQ_FOREACH(entry, &callback_list, next) {
+ RTE_LOG(DEBUG, EAL, "Calling mem event callback %s",
+ entry->name);
+ entry->clb(event, start, len);
+ }
+
+ rte_rwlock_read_unlock(&rwlock);
+}
diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index 0a0aa88..2d73cf3 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -466,6 +466,34 @@ rte_eal_get_physmem_size(void)
return total_len;
}

+/*
+ * Defining here because declared in rte_memory.h, but the actual implementation
+ * is in eal_common_memalloc.c, like all other memalloc internals.
+ */
+int
+rte_mem_event_register_callback(const char *name, rte_mem_event_callback_t clb)
+{
+ /* FreeBSD boots with legacy mem enabled by default */
+ if (internal_config.legacy_mem) {
+ RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
+ return eal_memalloc_callback_register(name, clb);
+}
+
+int
+rte_mem_event_unregister_callback(const char *name)
+{
+ /* FreeBSD boots with legacy mem enabled by default */
+ if (internal_config.legacy_mem) {
+ RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
+ return eal_memalloc_callback_unregister(name);
+}
+
/* Dump the physical memory layout on console */
void
rte_dump_physmem_layout(FILE *f)
diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
index beac296..499cf58 100644
--- a/lib/librte_eal/common/eal_memalloc.h
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -28,4 +28,14 @@ eal_memalloc_is_contig(struct rte_memseg_list *msl, void *start,
int
eal_memalloc_sync_with_primary(void);

+int
+eal_memalloc_callback_register(const char *name,
+ rte_mem_event_callback_t clb);
+
+int
+eal_memalloc_callback_unregister(const char *name);
+
+void
+eal_memalloc_notify(enum rte_mem_event event, const void *start, size_t len);
+
#endif // EAL_MEMALLOC_H
diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index 674d4cb..1c8ffa6 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -200,6 +200,54 @@ unsigned rte_memory_get_nrank(void);
*/
int rte_eal_using_phys_addrs(void);

+
+/**
+ * Enum indicating which kind of memory event has happened. Used by callbacks to
+ * distinguish between memory allocations and deallocations.
+ */
+enum rte_mem_event {
+ RTE_MEM_EVENT_ALLOC = 0, /**< Allocation event. */
+ RTE_MEM_EVENT_FREE, /**< Deallocation event. */
+};
+#define RTE_MEM_EVENT_CALLBACK_NAME_LEN 64
+/**< maximum length of callback name */
+
+/**
+ * Function typedef used to register callbacks for memory events.
+ */
+typedef void (*rte_mem_event_callback_t)(enum rte_mem_event event_type,
+ const void *addr, size_t len);
+
+/**
+ * Function used to register callbacks for memory events.
+ *
+ * @param name
+ * Name associated with specified callback to be added to the list.
+ *
+ * @param clb
+ * Callback function pointer.
+ *
+ * @return
+ * 0 on successful callback register
+ * -1 on unsuccessful callback register, with rte_errno value indicating
+ * reason for failure.
+ */
+int rte_mem_event_register_callback(const char *name,
+ rte_mem_event_callback_t clb);
+
+/**
+ * Function used to unregister callbacks for memory events.
+ *
+ * @param name
+ * Name associated with specified callback to be removed from the list.
+ *
+ * @return
+ * 0 on successful callback unregister
+ * -1 on unsuccessful callback unregister, with rte_errno value indicating
+ * reason for failure.
+ */
+int rte_mem_event_unregister_callback(const char *name);
+
#ifdef __cplusplus
}
#endif
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 4c2e959..b2a2d37 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -214,6 +214,8 @@ DPDK_18.05 {
global:

rte_num_sockets;
+ rte_mem_event_register_callback;
+ rte_mem_event_unregister_callback;
rte_mem_virt2memseg;
rte_mem_virt2memseg_list;
rte_malloc_dump_heaps;
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:54 UTC
Permalink
In preparation for implementing multiprocess support, we are adding
a version number and write locks to memseg lists.

There are two ways of implementing multiprocess support for memory
hotplug: either all information about mapped memory is shared
between processes, and secondary processes simply attempt to
map/unmap memory based on requests from the primary, or secondary
processes store their own maps and only check if they are in sync
with the primary process' maps.

This implementation will opt for the latter option: primary process
shared mappings will be authoritative, and each secondary process
will use its own internal view of mapped memory, and will attempt
to synchronize on these mappings using versioning.

Under this model, only primary process will decide which pages get
mapped, and secondary processes will only copy primary's page
maps and get notified of the changes via IPC mechanism (coming
in later commits).

To avoid race conditions, memseg lists will also have write locks -
that is, it will be possible for several secondary processes to
initialize concurrently, but it will not be possible for several
processes to request memory allocation unless all other allocations
were complete (on a single socket - it is OK to allocate/free memory
on different sockets concurrently).

In principle, it is possible for multiple processes to request
allocation/deallocation on multiple sockets, but we will only allow
one such request to be active at any one time.
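
Conceptually, the secondary-side check boils down to the sketch below
(trimmed from eal_memalloc_sync_with_primary() in this patch; locking,
error handling and the hugepage_info lookup are omitted):

/* per memseg list, in the secondary process */
for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
    primary_msl = &mcfg->memsegs[msl_idx];
    local_msl = &local_memsegs[msl_idx];

    if (primary_msl->base_va == NULL)
        continue;

    /* nothing to do if the local map is already up to date */
    if (local_msl->version == primary_msl->version)
        continue;

    /* replay the primary's allocations and frees into the local map
     * (sync_existing() below), then adopt the primary's version
     */
    if (sync_existing(primary_msl, local_msl, hi, msl_idx) < 0)
        return -1;
}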

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/bsdapp/eal/eal_memalloc.c | 7 +
lib/librte_eal/common/eal_memalloc.h | 4 +
lib/librte_eal/common/include/rte_eal_memconfig.h | 2 +
lib/librte_eal/linuxapp/eal/eal_memalloc.c | 288 +++++++++++++++++++++-
4 files changed, 295 insertions(+), 6 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal_memalloc.c b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
index be8340b..255aedc 100644
--- a/lib/librte_eal/bsdapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
@@ -24,3 +24,10 @@ eal_memalloc_alloc_page(uint64_t __rte_unused size, int __rte_unused socket)
RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
return NULL;
}
+
+int
+eal_memalloc_sync_with_primary(void)
+{
+ RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
+ return -1;
+}
diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
index 08ba70e..beac296 100644
--- a/lib/librte_eal/common/eal_memalloc.h
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -24,4 +24,8 @@ bool
eal_memalloc_is_contig(struct rte_memseg_list *msl, void *start,
size_t len);

+/* synchronize local memory map to primary process */
+int
+eal_memalloc_sync_with_primary(void);
+
#endif // EAL_MEMALLOC_H
diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
index b6bdb21..d653d57 100644
--- a/lib/librte_eal/common/include/rte_eal_memconfig.h
+++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
@@ -32,6 +32,8 @@ struct rte_memseg_list {
};
int socket_id; /**< Socket ID for all memsegs in this list. */
uint64_t hugepage_sz; /**< page size for all memsegs in this list. */
+ rte_rwlock_t mplock; /**< read-write lock for multiprocess sync. */
+ uint32_t version; /**< version number for multiprocess sync. */
struct rte_fbarray memseg_arr;
};

diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index c03e7bc..227d703 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -65,6 +65,9 @@ static struct msl_entry_list msl_entry_list =
TAILQ_HEAD_INITIALIZER(msl_entry_list);
static rte_spinlock_t tailq_lock = RTE_SPINLOCK_INITIALIZER;

+/** local copy of a memory map, used to synchronize memory hotplug in MP */
+static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];
+
static sigjmp_buf huge_jmpenv;

static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
@@ -619,11 +622,14 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
continue;
msl = cur_msl;

+ /* lock memseg list */
+ rte_rwlock_write_lock(&msl->mplock);
+
/* try finding space in memseg list */
cur_idx = rte_fbarray_find_next_n_free(&msl->memseg_arr, 0, n);

if (cur_idx < 0)
- continue;
+ goto next_list;

end_idx = cur_idx + n;
start_idx = cur_idx;
@@ -637,7 +643,6 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,

if (alloc_page(cur, addr, size, socket, hi, msl_idx,
cur_idx)) {
-
RTE_LOG(DEBUG, EAL, "attempted to allocate %i pages, but only %i were allocated\n",
n, i);

@@ -648,7 +653,7 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
*/
if (!exact) {
ret = i;
- goto restore_numa;
+ goto success;
}
RTE_LOG(DEBUG, EAL, "exact amount of pages was requested, so returning %i allocated pages\n",
i);
@@ -680,10 +685,13 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
}
/* we allocated all pages */
ret = n;
+success:
+ msl->version++;
+ rte_rwlock_write_unlock(&msl->mplock);

break;
next_list:
- /* dummy semi-colon to make label work */;
+ rte_rwlock_write_unlock(&msl->mplock);
}
/* we didn't break */
if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
@@ -716,7 +724,7 @@ eal_memalloc_free_page(struct rte_memseg *ms)
struct rte_memseg_list *msl = NULL;
unsigned int msl_idx, seg_idx;
struct hugepage_info *hi = NULL;
- int i;
+ int ret, i;

/* dynamic free not supported in legacy mode */
if (internal_config.legacy_mem)
@@ -753,6 +761,274 @@ eal_memalloc_free_page(struct rte_memseg *ms)
RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
return -1;
}
+ rte_rwlock_write_lock(&msl->mplock);
+
rte_fbarray_set_free(&msl->memseg_arr, seg_idx);
- return free_page(ms, hi, msl_idx, seg_idx);
+
+ /* increment version number */
+ msl->version++;
+
+ ret = free_page(ms, hi, msl_idx, seg_idx);
+
+ rte_rwlock_write_unlock(&msl->mplock);
+
+ return ret;
+}
+
+static int
+sync_chunk(struct rte_memseg_list *primary_msl,
+ struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+ unsigned int msl_idx, bool used, int start, int end)
+{
+ struct rte_fbarray *l_arr, *p_arr;
+ int i, ret, chunk_len, diff_len;
+
+ l_arr = &local_msl->memseg_arr;
+ p_arr = &primary_msl->memseg_arr;
+
+ /* we need to aggregate allocations/deallocations into bigger chunks,
+ * as we don't want to spam the user with per-page callbacks.
+ *
+ * to avoid any potential issues, we also want to trigger
+ * deallocation callbacks *before* we actually deallocate
+ * memory, so that the user application could wrap up its use
+ * before it goes away.
+ */
+
+ chunk_len = end - start;
+
+ /* find how many contiguous pages we can map/unmap for this chunk */
+ diff_len = used ?
+ rte_fbarray_find_contig_free(l_arr, start) :
+ rte_fbarray_find_contig_used(l_arr, start);
+
+ /* has to be at least one page */
+ if (diff_len < 1)
+ return -1;
+
+ diff_len = RTE_MIN(chunk_len, diff_len);
+
+ for (i = 0; i < diff_len; i++) {
+ struct rte_memseg *p_ms, *l_ms;
+ int seg_idx = start + i;
+
+ l_ms = rte_fbarray_get(l_arr, seg_idx);
+ p_ms = rte_fbarray_get(p_arr, seg_idx);
+
+ if (l_ms == NULL || p_ms == NULL)
+ return -1;
+
+ if (used) {
+ ret = alloc_page(l_ms, p_ms->addr,
+ p_ms->hugepage_sz,
+ p_ms->socket_id, hi,
+ msl_idx, seg_idx);
+ if (ret < 0)
+ return -1;
+ rte_fbarray_set_used(l_arr, seg_idx);
+ } else {
+ ret = free_page(l_ms, hi, msl_idx, seg_idx);
+ if (ret < 0)
+ return -1;
+ rte_fbarray_set_free(l_arr, seg_idx);
+ }
+ }
+
+ /* calculate how much we can advance until next chunk */
+ diff_len = used ?
+ rte_fbarray_find_contig_used(l_arr, start) :
+ rte_fbarray_find_contig_free(l_arr, start);
+ ret = RTE_MIN(chunk_len, diff_len);
+
+ return ret;
+}
+
+static int
+sync_status(struct rte_memseg_list *primary_msl,
+ struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+ unsigned int msl_idx, bool used)
+{
+ struct rte_fbarray *l_arr, *p_arr;
+ int p_idx, l_chunk_len, p_chunk_len, ret;
+ int start, end;
+
+ /* this is a little bit tricky, but the basic idea is - walk both lists
+ * and spot any places where there are discrepancies. walking both lists
+ * and noting discrepancies in a single go is a hard problem, so we do
+ * it in two passes - first we spot any places where allocated segments
+ * mismatch (i.e. ensure that everything that's allocated in the primary
+ * is also allocated in the secondary), and then we do it by looking at
+ * free segments instead.
+ *
+ * we also need to aggregate changes into chunks, as we have to call
+ * callbacks per allocation, not per page.
+ */
+ l_arr = &local_msl->memseg_arr;
+ p_arr = &primary_msl->memseg_arr;
+
+ if (used)
+ p_idx = rte_fbarray_find_next_used(p_arr, 0);
+ else
+ p_idx = rte_fbarray_find_next_free(p_arr, 0);
+
+ while (p_idx >= 0) {
+ int next_chunk_search_idx;
+
+ if (used) {
+ p_chunk_len = rte_fbarray_find_contig_used(p_arr,
+ p_idx);
+ l_chunk_len = rte_fbarray_find_contig_used(l_arr,
+ p_idx);
+ } else {
+ p_chunk_len = rte_fbarray_find_contig_free(p_arr,
+ p_idx);
+ l_chunk_len = rte_fbarray_find_contig_free(l_arr,
+ p_idx);
+ }
+ /* best case scenario - no differences (or bigger, which will be
+ * fixed during next iteration), look for next chunk
+ */
+ if (l_chunk_len >= p_chunk_len) {
+ next_chunk_search_idx = p_idx + p_chunk_len;
+ goto next_chunk;
+ }
+
+ /* if both chunks start at the same point, skip parts we know
+ * are identical, and sync the rest. each call to sync_chunk
+ * will only sync contiguous segments, so we need to call this
+ * until we are sure there are no more differences in this
+ * chunk.
+ */
+ start = p_idx + l_chunk_len;
+ end = p_idx + p_chunk_len;
+ do {
+ ret = sync_chunk(primary_msl, local_msl, hi, msl_idx,
+ used, start, end);
+ start += ret;
+ } while (start < end && ret >= 0);
+ /* if ret is negative, something went wrong */
+ if (ret < 0)
+ return -1;
+
+ next_chunk_search_idx = p_idx + p_chunk_len;
+next_chunk:
+ /* skip to end of this chunk */
+ if (used) {
+ p_idx = rte_fbarray_find_next_used(p_arr,
+ next_chunk_search_idx);
+ } else {
+ p_idx = rte_fbarray_find_next_free(p_arr,
+ next_chunk_search_idx);
+ }
+ }
+ return 0;
+}
+
+static int
+sync_existing(struct rte_memseg_list *primary_msl,
+ struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+ unsigned int msl_idx)
+{
+ int ret;
+
+ /* ensure all allocated space is the same in both lists */
+ ret = sync_status(primary_msl, local_msl, hi, msl_idx, true);
+ if (ret < 0)
+ return -1;
+
+ /* ensure all unallocated space is the same in both lists */
+ ret = sync_status(primary_msl, local_msl, hi, msl_idx, false);
+ if (ret < 0)
+ return -1;
+
+ /* update version number */
+ local_msl->version = primary_msl->version;
+
+ return 0;
+}
+
+
+int
+eal_memalloc_sync_with_primary(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *primary_msl, *local_msl;
+ struct hugepage_info *hi = NULL;
+ unsigned int msl_idx;
+ int i;
+
+ /* nothing to be done in primary */
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ return 0;
+
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ bool new_msl = false;
+ bool fail = false;
+
+ primary_msl = &mcfg->memsegs[msl_idx];
+ local_msl = &local_memsegs[msl_idx];
+
+ if (primary_msl->base_va == 0)
+ continue;
+
+ /* this is a valid memseg list, so read-lock it */
+ rte_rwlock_read_lock(&primary_msl->mplock);
+
+ /* write-lock local memseg list */
+ rte_rwlock_write_lock(&local_msl->mplock);
+
+ /* check if secondary has this memseg list set up */
+ if (local_msl->base_va == 0) {
+ char name[PATH_MAX];
+ int ret;
+ new_msl = true;
+
+ /* create distinct fbarrays for each secondary */
+ snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i",
+ primary_msl->memseg_arr.name, getpid());
+
+ ret = rte_fbarray_init(&local_msl->memseg_arr, name,
+ primary_msl->memseg_arr.len,
+ primary_msl->memseg_arr.elt_sz);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n");
+ fail = true;
+ goto endloop;
+ }
+
+ local_msl->base_va = primary_msl->base_va;
+ }
+
+ for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info);
+ i++) {
+ uint64_t cur_sz =
+ internal_config.hugepage_info[i].hugepage_sz;
+ uint64_t msl_sz = primary_msl->hugepage_sz;
+ if (msl_sz == cur_sz) {
+ hi = &internal_config.hugepage_info[i];
+ break;
+ }
+ }
+ if (!hi) {
+ RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
+ fail = true;
+ goto endloop;
+ }
+
+ /* if versions don't match or if we have just allocated a new
+ * memseg list, synchronize everything
+ */
+ if ((new_msl || local_msl->version != primary_msl->version) &&
+ sync_existing(primary_msl, local_msl, hi,
+ msl_idx)) {
+ fail = true;
+ goto endloop;
+ }
+endloop:
+ rte_rwlock_write_unlock(&local_msl->mplock);
+ rte_rwlock_read_unlock(&primary_msl->mplock);
+ if (fail)
+ return -1;
+ }
+ return 0;
}
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:47 UTC
Permalink
This will be helpful down the line when we implement support for
allocating physically contiguous memory. We can no longer guarantee
physically contiguous memory unless we're in IOVA_AS_VA mode, but
we can certainly try and see if we succeed. In addition, this would
be useful for e.g. PMDs that may allocate chunks that are smaller
than the page size but must not cross a page boundary, in which
case we will be able to accommodate that request.
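
For illustration, a prospective caller inside EAL could check a range
like this (variable names are made up; the helper itself is added
below):

/* is the [addr, addr + len) range IOVA-contiguous? */
struct rte_memseg_list *msl = rte_mem_virt2memseg_list(addr);

if (msl != NULL && eal_memalloc_is_contig(msl, addr, len)) {
    /* safe to hand this range to hardware as one physical block */
}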

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/bsdapp/eal/Makefile | 1 +
lib/librte_eal/common/eal_common_memalloc.c | 49 +++++++++++++++++++++++++++++
lib/librte_eal/common/eal_memalloc.h | 5 +++
lib/librte_eal/common/meson.build | 1 +
lib/librte_eal/linuxapp/eal/Makefile | 1 +
5 files changed, 57 insertions(+)
create mode 100644 lib/librte_eal/common/eal_common_memalloc.c

diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile
index 19f9322..907e30d 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -41,6 +41,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_timer.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_memzone.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_log.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_launch.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_memalloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_memory.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_tailqs.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_errno.c
diff --git a/lib/librte_eal/common/eal_common_memalloc.c b/lib/librte_eal/common/eal_common_memalloc.c
new file mode 100644
index 0000000..62e8c16
--- /dev/null
+++ b/lib/librte_eal/common/eal_common_memalloc.c
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#include <rte_lcore.h>
+#include <rte_fbarray.h>
+#include <rte_memzone.h>
+#include <rte_memory.h>
+#include <rte_eal_memconfig.h>
+
+#include "eal_private.h"
+#include "eal_internal_cfg.h"
+#include "eal_memalloc.h"
+
+bool
+eal_memalloc_is_contig(struct rte_memseg_list *msl, void *start,
+ size_t len)
+{
+ const struct rte_memseg *ms;
+ uint64_t page_sz;
+ void *end;
+ int start_page, end_page, cur_page;
+ rte_iova_t expected;
+
+ /* for legacy memory, it's always contiguous */
+ if (internal_config.legacy_mem)
+ return true;
+
+ /* figure out how many pages we need to fit in current data */
+ page_sz = msl->hugepage_sz;
+ end = RTE_PTR_ADD(start, len);
+
+ start_page = RTE_PTR_DIFF(start, msl->base_va) / page_sz;
+ end_page = RTE_PTR_DIFF(end, msl->base_va) / page_sz;
+
+ /* now, look for contiguous memory */
+ ms = rte_fbarray_get(&msl->memseg_arr, start_page);
+ expected = ms->iova + page_sz;
+
+ for (cur_page = start_page + 1; cur_page < end_page;
+ cur_page++, expected += page_sz) {
+ ms = rte_fbarray_get(&msl->memseg_arr, cur_page);
+
+ if (ms->iova != expected)
+ return false;
+ }
+
+ return true;
+}
diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
index adf59c4..08ba70e 100644
--- a/lib/librte_eal/common/eal_memalloc.h
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -8,6 +8,7 @@
#include <stdbool.h>

#include <rte_memory.h>
+#include <rte_eal_memconfig.h>

struct rte_memseg *
eal_memalloc_alloc_page(uint64_t size, int socket);
@@ -19,4 +20,8 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n, uint64_t size,
int
eal_memalloc_free_page(struct rte_memseg *ms);

+bool
+eal_memalloc_is_contig(struct rte_memseg_list *msl, void *start,
+ size_t len);
+
#endif // EAL_MEMALLOC_H
diff --git a/lib/librte_eal/common/meson.build b/lib/librte_eal/common/meson.build
index 7d02191..a1ada24 100644
--- a/lib/librte_eal/common/meson.build
+++ b/lib/librte_eal/common/meson.build
@@ -16,6 +16,7 @@ common_sources = files(
'eal_common_launch.c',
'eal_common_lcore.c',
'eal_common_log.c',
+ 'eal_common_memalloc.c',
'eal_common_memory.c',
'eal_common_memzone.c',
'eal_common_options.c',
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index af6b9be..5380ba8 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -49,6 +49,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_timer.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memzone.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_log.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_launch.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memalloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memory.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_tailqs.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_errno.c
--
2.7.4
Anatoly Burakov
2018-03-07 16:57:00 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
Acked-by: Fiona Trahe <***@intel.com>
---
drivers/crypto/qat/qat_qp.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/crypto/qat/qat_qp.c b/drivers/crypto/qat/qat_qp.c
index 87b9ce0..3f8ed4d 100644
--- a/drivers/crypto/qat/qat_qp.c
+++ b/drivers/crypto/qat/qat_qp.c
@@ -95,8 +95,8 @@ queue_dma_zone_reserve(const char *queue_name, uint32_t queue_size,
default:
memzone_flags = RTE_MEMZONE_SIZE_HINT_ONLY;
}
- return rte_memzone_reserve_aligned(queue_name, queue_size, socket_id,
- memzone_flags, queue_size);
+ return rte_memzone_reserve_aligned_contig(queue_name, queue_size,
+ socket_id, memzone_flags, queue_size);
}

int qat_crypto_sym_qp_setup(struct rte_cryptodev *dev, uint16_t queue_pair_id,
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:59 UTC
Permalink
This fixes the following drivers in one go:

grep -Rl rte_eth_dma_zone_reserve drivers/

drivers/net/avf/avf_rxtx.c
drivers/net/thunderx/nicvf_ethdev.c
drivers/net/e1000/igb_rxtx.c
drivers/net/e1000/em_rxtx.c
drivers/net/fm10k/fm10k_ethdev.c
drivers/net/vmxnet3/vmxnet3_rxtx.c
drivers/net/liquidio/lio_rxtx.c
drivers/net/i40e/i40e_rxtx.c
drivers/net/sfc/sfc.c
drivers/net/ixgbe/ixgbe_rxtx.c
drivers/net/nfp/nfp_net.c

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_ether/rte_ethdev.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 0590f0c..7935230 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -3401,7 +3401,8 @@ rte_eth_dma_zone_reserve(const struct rte_eth_dev *dev, const char *ring_name,
if (mz)
return mz;

- return rte_memzone_reserve_aligned(z_name, size, socket_id, 0, align);
+ return rte_memzone_reserve_aligned_contig(z_name, size, socket_id, 0,
+ align);
}

int
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:52 UTC
Permalink
Currently it is not possible to use memory that is not owned by DPDK to
perform DMA. This scenario arises in vhost applications (like SPDK)
where the guest sends its own memory table. To fill this gap, provide an
API that allows registering an arbitrary address range in the VFIO
container.
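
For illustration, a minimal usage sketch (not part of this patch) of
registering an externally allocated buffer with VFIO so a device can DMA
to it. The buffer pointer, IOVA and length are placeholders supplied by
the application (e.g. taken from a vhost guest memory table).

static int
register_external_buf(void *buf, uint64_t iova, size_t len)
{
        uint64_t va = (uint64_t)(uintptr_t)buf;

        /* add the range to the VFIO container so the device can DMA to it */
        if (rte_vfio_dma_map(va, iova, len) < 0)
                return -1;

        /* ... device performs DMA to/from the buffer ... */

        /* remove the range once the buffer is no longer used for DMA */
        return rte_vfio_dma_unmap(va, iova, len);
}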

Signed-off-by: Pawel Wodkowski <***@intel.com>
Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/bsdapp/eal/eal.c | 16 ++++
lib/librte_eal/common/include/rte_vfio.h | 39 ++++++++
lib/librte_eal/linuxapp/eal/eal_vfio.c | 153 ++++++++++++++++++++++++++-----
lib/librte_eal/linuxapp/eal/eal_vfio.h | 11 +++
4 files changed, 196 insertions(+), 23 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 3b06e21..5a7f436 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -755,6 +755,8 @@ int rte_vfio_enable(const char *modname);
int rte_vfio_is_enabled(const char *modname);
int rte_vfio_noiommu_is_enabled(void);
int rte_vfio_clear_group(int vfio_group_fd);
+int rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len);
+int rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len);

int rte_vfio_setup_device(__rte_unused const char *sysfs_base,
__rte_unused const char *dev_addr,
@@ -790,3 +792,17 @@ int rte_vfio_clear_group(__rte_unused int vfio_group_fd)
{
return 0;
}
+
+int
+rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova,
+ __rte_unused uint64_t len)
+{
+ return -1;
+}
+
+int
+rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
+ __rte_unused uint64_t len)
+{
+ return -1;
+}
diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
index e981a62..093c309 100644
--- a/lib/librte_eal/common/include/rte_vfio.h
+++ b/lib/librte_eal/common/include/rte_vfio.h
@@ -123,6 +123,45 @@ int rte_vfio_noiommu_is_enabled(void);
int
rte_vfio_clear_group(int vfio_group_fd);

+/**
+ * Map memory region for use with VFIO.
+ *
+ * @param vaddr
+ * Starting virtual address of memory to be mapped.
+ *
+ * @param iova
+ * Starting IOVA address of memory to be mapped.
+ *
+ * @param len
+ * Length of memory segment being mapped.
+ *
+ * @return
+ * 0 if success.
+ * -1 on error.
+ */
+int
+rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len);
+
+
+/**
+ * Unmap memory region from VFIO.
+ *
+ * @param vaddr
+ * Starting virtual address of memory to be unmapped.
+ *
+ * @param iova
+ * Starting IOVA address of memory to be unmapped.
+ *
+ * @param len
+ * Length of memory segment being unmapped.
+ *
+ * @return
+ * 0 if success.
+ * -1 on error.
+ */
+int
+rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len);
+
#endif /* VFIO_PRESENT */

#endif /* _RTE_VFIO_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 5192763..8fe8984 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -22,17 +22,35 @@
static struct vfio_config vfio_cfg;

static int vfio_type1_dma_map(int);
+static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
static int vfio_spapr_dma_map(int);
static int vfio_noiommu_dma_map(int);
+static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);

/* IOMMU types we support */
static const struct vfio_iommu_type iommu_types[] = {
/* x86 IOMMU, otherwise known as type 1 */
- { RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map},
+ {
+ .type_id = RTE_VFIO_TYPE1,
+ .name = "Type 1",
+ .dma_map_func = &vfio_type1_dma_map,
+ .dma_user_map_func = &vfio_type1_dma_mem_map
+ },
/* ppc64 IOMMU, otherwise known as spapr */
- { RTE_VFIO_SPAPR, "sPAPR", &vfio_spapr_dma_map},
+ {
+ .type_id = RTE_VFIO_SPAPR,
+ .name = "sPAPR",
+ .dma_map_func = &vfio_spapr_dma_map,
+ .dma_user_map_func = NULL
+ // TODO: work with PPC64 people on enabling this, window size!
+ },
/* IOMMU-less mode */
- { RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map},
+ {
+ .type_id = RTE_VFIO_NOIOMMU,
+ .name = "No-IOMMU",
+ .dma_map_func = &vfio_noiommu_dma_map,
+ .dma_user_map_func = &vfio_noiommu_dma_mem_map
+ },
};

int
@@ -333,9 +351,10 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
*/
if (internal_config.process_type == RTE_PROC_PRIMARY &&
vfio_cfg.vfio_active_groups == 1) {
+ const struct vfio_iommu_type *t;
+
/* select an IOMMU type which we will be using */
- const struct vfio_iommu_type *t =
- vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
+ t = vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
if (!t) {
RTE_LOG(ERR, EAL,
" %s failed to select IOMMU type\n",
@@ -353,6 +372,8 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
rte_vfio_clear_group(vfio_group_fd);
return -1;
}
+
+ vfio_cfg.vfio_iommu_type = t;
}
}

@@ -665,13 +686,54 @@ vfio_get_group_no(const char *sysfs_base,
}

static int
+vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
+ uint64_t len, int do_map)
+{
+ struct vfio_iommu_type1_dma_map dma_map;
+ struct vfio_iommu_type1_dma_unmap dma_unmap;
+ int ret;
+
+ if (do_map != 0) {
+ memset(&dma_map, 0, sizeof(dma_map));
+ dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+ dma_map.vaddr = vaddr;
+ dma_map.size = len;
+ dma_map.iova = iova;
+ dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+ VFIO_DMA_MAP_FLAG_WRITE;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ } else {
+ memset(&dma_unmap, 0, sizeof(dma_unmap));
+ dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
+ dma_unmap.size = len;
+ dma_unmap.iova = iova;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
+ &dma_unmap);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int
vfio_type1_dma_map(int vfio_container_fd)
{
- int i, ret;
+ int i;

/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
- struct vfio_iommu_type1_dma_map dma_map;
struct rte_memseg_list *msl;
struct rte_fbarray *arr;
int ms_idx, next_idx;
@@ -697,23 +759,9 @@ vfio_type1_dma_map(int vfio_container_fd)
len = ms->hugepage_sz;
hw_addr = ms->iova;

- memset(&dma_map, 0, sizeof(dma_map));
- dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
- dma_map.vaddr = addr;
- dma_map.size = len;
- dma_map.iova = hw_addr;
- dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
- VFIO_DMA_MAP_FLAG_WRITE;
-
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA,
- &dma_map);
-
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
- "error %i (%s)\n", errno,
- strerror(errno));
+ if (vfio_type1_dma_mem_map(vfio_container_fd, addr,
+ hw_addr, len, 1))
return -1;
- }
}
}

@@ -865,6 +913,49 @@ vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
return 0;
}

+static int
+vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd,
+ uint64_t __rte_unused vaddr,
+ uint64_t __rte_unused iova, uint64_t __rte_unused len,
+ int __rte_unused do_map)
+{
+ /* No-IOMMU mode does not need DMA mapping */
+ return 0;
+}
+
+static int
+vfio_dma_mem_map(uint64_t vaddr, uint64_t iova, uint64_t len, int do_map)
+{
+ const struct vfio_iommu_type *t = vfio_cfg.vfio_iommu_type;
+
+ if (!t) {
+ RTE_LOG(ERR, EAL, " VFIO support not initialized\n");
+ return -1;
+ }
+
+ if (!t->dma_user_map_func) {
+ RTE_LOG(ERR, EAL,
+ " VFIO custom DMA region maping not supported by IOMMU %s\n",
+ t->name);
+ return -1;
+ }
+
+ return t->dma_user_map_func(vfio_cfg.vfio_container_fd, vaddr, iova,
+ len, do_map);
+}
+
+int
+rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len)
+{
+ return vfio_dma_mem_map(vaddr, iova, len, 1);
+}
+
+int
+rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len)
+{
+ return vfio_dma_mem_map(vaddr, iova, len, 0);
+}
+
int
rte_vfio_noiommu_is_enabled(void)
{
@@ -897,4 +988,20 @@ rte_vfio_noiommu_is_enabled(void)
return c == 'Y';
}

+#else
+
+int
+rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova,
+ __rte_unused uint64_t len)
+{
+ return -1;
+}
+
+int
+rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
+ __rte_unused uint64_t len)
+{
+ return -1;
+}
+
#endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 8059577..b68703e 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -19,6 +19,7 @@

#ifdef VFIO_PRESENT

+#include <stdint.h>
#include <linux/vfio.h>

#define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU
@@ -110,6 +111,7 @@ struct vfio_config {
int vfio_enabled;
int vfio_container_fd;
int vfio_active_groups;
+ const struct vfio_iommu_type *vfio_iommu_type;
struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
};

@@ -119,9 +121,18 @@ struct vfio_config {
* */
typedef int (*vfio_dma_func_t)(int);

+/* Custom memory region DMA mapping function prototype.
+ * Takes VFIO container fd, virtual address, physical address, length and
+ * operation type (0 to unmap, 1 to map) as parameters.
+ * Returns 0 on success, -1 on error.
+ **/
+typedef int (*vfio_dma_user_func_t)(int fd, uint64_t vaddr, uint64_t iova,
+ uint64_t len, int do_map);
+
struct vfio_iommu_type {
int type_id;
const char *name;
+ vfio_dma_user_func_t dma_user_map_func;
vfio_dma_func_t dma_map_func;
};
--
2.7.4
Gowrishankar
2018-03-30 09:42:10 UTC
Permalink
From: Gowrishankar Muthukrishnan <***@linux.vnet.ibm.com>

The patch below adds PowerPC arch-specific changes.

Signed-off-by: Gowrishankar Muthukrishnan <***@linux.vnet.ibm.com>
---
lib/librte_eal/linuxapp/eal/eal_vfio.c | 63 +++++++++++++++++++++++++---------
1 file changed, 47 insertions(+), 16 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 4e9e296..063982c 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -24,6 +24,7 @@
static int vfio_type1_dma_map(int);
static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
static int vfio_spapr_dma_map(int);
+static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
static int vfio_noiommu_dma_map(int);
static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);

@@ -41,8 +42,7 @@
.type_id = RTE_VFIO_SPAPR,
.name = "sPAPR",
.dma_map_func = &vfio_spapr_dma_map,
- .dma_user_map_func = NULL
- // TODO: work with PPC64 people on enabling this, window size!
+ .dma_user_map_func = &vfio_spapr_dma_mem_map
},
/* IOMMU-less mode */
{
@@ -801,10 +801,51 @@ struct spapr_create_window_walk_param {
}

static int
+vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
+ uint64_t len, int do_map)
+{
+ struct vfio_iommu_type1_dma_map dma_map;
+ struct vfio_iommu_type1_dma_unmap dma_unmap;
+ int ret;
+
+ if (do_map != 0) {
+ memset(&dma_map, 0, sizeof(dma_map));
+ dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+ dma_map.vaddr = vaddr;
+ dma_map.size = len;
+ dma_map.iova = iova;
+ dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+ VFIO_DMA_MAP_FLAG_WRITE;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ } else {
+ memset(&dma_unmap, 0, sizeof(dma_unmap));
+ dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
+ dma_unmap.size = len;
+ dma_unmap.iova = iova;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
+ &dma_unmap);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int
vfio_spapr_dma_map_walk(const struct rte_memseg_list *msl __rte_unused,
const struct rte_memseg *ms, void *arg)
{
- struct vfio_iommu_type1_dma_map dma_map;
struct vfio_iommu_spapr_register_memory reg = {
.argsz = sizeof(reg),
.flags = 0
@@ -828,17 +869,8 @@ struct spapr_create_window_walk_param {
return -1;
}

- memset(&dma_map, 0, sizeof(dma_map));
- dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
- dma_map.vaddr = addr;
- dma_map.size = len;
- dma_map.iova = hw_addr;
- dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
- VFIO_DMA_MAP_FLAG_WRITE;
-
- ret = ioctl(*vfio_container_fd, VFIO_IOMMU_MAP_DMA,
- &dma_map);
-
+ ret = vfio_spapr_dma_mem_map(*vfio_container_fd, addr,
+ hw_addr, len, 1);
if (ret) {
RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n",
errno, strerror(errno));
@@ -852,7 +884,6 @@ struct spapr_create_window_walk_param {
vfio_spapr_dma_map(int vfio_container_fd)
{
int ret;
- uint64_t hugepage_sz = 0;
struct spapr_create_window_walk_param wa;

struct vfio_iommu_spapr_tce_info info = {
@@ -890,7 +921,7 @@ struct spapr_create_window_walk_param {

/* sPAPR requires window size to be a power of 2 */
create.window_size = rte_align64pow2(create.window_size);
- create.page_shift = __builtin_ctzll(hugepage_sz);
+ create.page_shift = __builtin_ctzll(wa.hugepage_sz);
create.levels = 1;

ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
--
1.9.1
Gowrishankar
2018-04-02 11:36:50 UTC
Permalink
From: Gowrishankar Muthukrishnan <***@linux.vnet.ibm.com>

The patch below adds PowerPC arch-specific changes.

Signed-off-by: Gowrishankar Muthukrishnan <***@linux.vnet.ibm.com>
---
lib/librte_eal/linuxapp/eal/eal_vfio.c | 110 +++++++++++++++++++++++++++++++--
1 file changed, 105 insertions(+), 5 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 4e9e296..985acf4 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -24,6 +24,7 @@
static int vfio_type1_dma_map(int);
static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
static int vfio_spapr_dma_map(int);
+static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
static int vfio_noiommu_dma_map(int);
static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);

@@ -41,8 +42,7 @@
.type_id = RTE_VFIO_SPAPR,
.name = "sPAPR",
.dma_map_func = &vfio_spapr_dma_map,
- .dma_user_map_func = NULL
- // TODO: work with PPC64 people on enabling this, window size!
+ .dma_user_map_func = &vfio_spapr_dma_mem_map
},
/* IOMMU-less mode */
{
@@ -838,7 +838,6 @@ struct spapr_create_window_walk_param {

ret = ioctl(*vfio_container_fd, VFIO_IOMMU_MAP_DMA,
&dma_map);
-
if (ret) {
RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n",
errno, strerror(errno));
@@ -852,7 +851,6 @@ struct spapr_create_window_walk_param {
vfio_spapr_dma_map(int vfio_container_fd)
{
int ret;
- uint64_t hugepage_sz = 0;
struct spapr_create_window_walk_param wa;

struct vfio_iommu_spapr_tce_info info = {
@@ -890,7 +888,7 @@ struct spapr_create_window_walk_param {

/* sPAPR requires window size to be a power of 2 */
create.window_size = rte_align64pow2(create.window_size);
- create.page_shift = __builtin_ctzll(hugepage_sz);
+ create.page_shift = __builtin_ctzll(wa.hugepage_sz);
create.levels = 1;

ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
@@ -912,6 +910,108 @@ struct spapr_create_window_walk_param {
}

static int
+vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
+ uint64_t len, int do_map)
+{
+ int ret;
+ struct spapr_create_window_walk_param wa = {
+ .hugepage_sz = 0,
+ };
+ struct vfio_iommu_spapr_tce_create create = {
+ .argsz = sizeof(create),
+ };
+
+ /* check if DMA window is from 0 to max(phys_addr + len) */
+ wa.create = &create;
+ rte_memseg_walk(vfio_spapr_create_window_walk, &wa);
+ create.window_size = rte_align64pow2(create.window_size);
+ if (iova > create.window_size) {
+ struct vfio_iommu_spapr_tce_info info = {
+ .argsz = sizeof(info),
+ };
+ struct vfio_iommu_spapr_tce_remove remove = {
+ .argsz = sizeof(remove),
+ };
+
+ /* query spapr iommu info */
+ ret = ioctl(vfio_container_fd,
+ VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot get iommu info, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ /* remove old DMA window */
+ remove.start_addr = info.dma32_window_start;
+ ret = ioctl(vfio_container_fd,
+ VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot remove default DMA window, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+ create.page_shift = __builtin_ctzll(wa.hugepage_sz);
+ create.levels = 1;
+
+ ret = ioctl(vfio_container_fd,
+ VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot create new DMA window, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ if (create.start_addr != 0) {
+ RTE_LOG(ERR, EAL, " DMA window start address != 0\n");
+ return -1;
+ }
+
+ }
+
+ if (do_map != 0) {
+ if (rte_memseg_walk(vfio_spapr_dma_map_walk, &vfio_container_fd))
+ return -1;
+ } else {
+ struct vfio_iommu_type1_dma_unmap dma_unmap;
+ struct vfio_iommu_spapr_register_memory reg = {
+ .argsz = sizeof(reg),
+ .flags = 0
+ };
+
+ /* for unmap, check if iova within DMA window */
+ if (iova > create.window_size) {
+ RTE_LOG(ERR, EAL, "iova not beyond DMA window for unmap");
+ return -1;
+ }
+
+ reg.vaddr = (uintptr_t) vaddr;
+ reg.size = len;
+ ret = ioctl(vfio_container_fd,
+ VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot unregister vaddr for IOMMU, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ memset(&dma_unmap, 0, sizeof(dma_unmap));
+ dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
+ dma_unmap.size = len;
+ dma_unmap.iova = iova;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
+ &dma_unmap);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static int
vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
{
/* No-IOMMU mode does not need DMA mapping */
--
1.9.1
Anatoly Burakov
2018-03-07 16:56:55 UTC
Permalink
For legacy memory mode, attach to the primary's memseg lists and map
hugepages as before.

For non-legacy mode, preallocate all VA space and then sync the local
memory map with the primary.
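
For illustration, a rough sketch (not part of this patch) of the
secondary-process startup flow this enables; the calls are internal EAL
helpers from this series and the ordering is simplified.

        if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
                /* read hugepage info, but do not clear out hugepages */
                if (eal_hugepage_info_read() < 0)
                        return -1;
                /* attach to primary's fbarrays and preallocate VA space */
                if (memseg_secondary_init() < 0)
                        return -1;
                /* map the primary's hugepages into local address space */
                if (eal_memalloc_sync_with_primary() < 0)
                        return -1;
        }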

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/bsdapp/eal/eal_hugepage_info.c | 7 ++
lib/librte_eal/common/eal_common_memory.c | 99 +++++++++++++++++++++----
lib/librte_eal/common/eal_hugepages.h | 5 ++
lib/librte_eal/linuxapp/eal/eal.c | 18 +++--
lib/librte_eal/linuxapp/eal/eal_hugepage_info.c | 53 ++++++++-----
lib/librte_eal/linuxapp/eal/eal_memory.c | 24 ++++--
6 files changed, 159 insertions(+), 47 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c b/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c
index be2dbf0..18e6e5e 100644
--- a/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c
@@ -103,3 +103,10 @@ eal_hugepage_info_init(void)

return 0;
}
+
+/* memory hotplug is not supported in FreeBSD, so no need to implement this */
+int
+eal_hugepage_info_read(void)
+{
+ return 0;
+}
diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index 457e239..a571e24 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -20,6 +20,7 @@
#include <rte_errno.h>
#include <rte_log.h>

+#include "eal_memalloc.h"
#include "eal_private.h"
#include "eal_internal_cfg.h"

@@ -147,19 +148,11 @@ alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
char name[RTE_FBARRAY_NAME_LEN];
int max_pages;
uint64_t mem_amount;
- void *addr;

if (!internal_config.legacy_mem) {
mem_amount = get_mem_amount(page_sz);
max_pages = mem_amount / page_sz;
-
- addr = eal_get_virtual_area(NULL, &mem_amount, page_sz, 0, 0);
- if (addr == NULL) {
- RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
- return -1;
- }
} else {
- addr = NULL;
/* numer of memsegs in each list, these will not be single-page
* segments, so RTE_MAX_LEGACY_MEMSEG is like old default.
*/
@@ -177,7 +170,7 @@ alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,

msl->hugepage_sz = page_sz;
msl->socket_id = socket_id;
- msl->base_va = addr;
+ msl->base_va = NULL;

RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n",
page_sz >> 10, socket_id);
@@ -186,16 +179,46 @@ alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
}

static int
-memseg_init(void)
+alloc_va_space(struct rte_memseg_list *msl)
+{
+ uint64_t mem_sz, page_sz;
+ void *addr;
+ int flags = 0;
+
+#ifdef RTE_ARCH_PPC_64
+ flags |= MAP_HUGETLB;
+#endif
+
+ page_sz = msl->hugepage_sz;
+ mem_sz = page_sz * msl->memseg_arr.len;
+
+ addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags);
+ if (addr == NULL) {
+ if (rte_errno == EADDRNOTAVAIL)
+ RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n",
+ (unsigned long long)mem_sz, msl->base_va);
+ else
+ RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
+ return -1;
+ }
+ msl->base_va = addr;
+
+ return 0;
+}
+
+
+static int
+memseg_primary_init(void)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
int socket_id, hpi_idx, msl_idx = 0;
struct rte_memseg_list *msl;

- if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
- RTE_LOG(ERR, EAL, "Secondary process not supported\n");
- return -1;
- }
+ /* if we start allocating memory segments for pages straight away, VA
+ * space will become fragmented, reducing chances of success when
+ * secondary process maps the same addresses. to fix this, allocate
+ * fbarrays first, and then allocate VA space for them.
+ */

/* create memseg lists */
for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;
@@ -235,12 +258,55 @@ memseg_init(void)
total_segs += msl->memseg_arr.len;
total_mem = total_segs * msl->hugepage_sz;
type_msl_idx++;
+
+ /* no need to preallocate VA in legacy mode */
+ if (internal_config.legacy_mem)
+ continue;
+
+ if (alloc_va_space(msl)) {
+ RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
+ return -1;
+ }
}
}
}
return 0;
}

+static int
+memseg_secondary_init(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int msl_idx = 0;
+ struct rte_memseg_list *msl;
+
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+
+ msl = &mcfg->memsegs[msl_idx];
+
+ /* skip empty memseg lists */
+ if (msl->memseg_arr.len == 0)
+ continue;
+
+ if (rte_fbarray_attach(&msl->memseg_arr)) {
+ RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n");
+ return -1;
+ }
+
+ /* no need to preallocate VA space in legacy mode */
+ if (internal_config.legacy_mem)
+ continue;
+
+ /* preallocate VA space */
+ if (alloc_va_space(msl)) {
+ RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n");
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
static struct rte_memseg *
virt2memseg(const void *addr, const struct rte_memseg_list *msl)
{
@@ -480,7 +546,10 @@ rte_eal_memory_init(void)
int retval;
RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n");

- retval = memseg_init();
+ retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
+ memseg_primary_init() :
+ memseg_secondary_init();
+
if (retval < 0)
return -1;

diff --git a/lib/librte_eal/common/eal_hugepages.h b/lib/librte_eal/common/eal_hugepages.h
index f963ae5..38d0b04 100644
--- a/lib/librte_eal/common/eal_hugepages.h
+++ b/lib/librte_eal/common/eal_hugepages.h
@@ -34,4 +34,9 @@ struct hugepage_file {
*/
int eal_hugepage_info_init(void);

+/**
+ * Read information about hugepages on Linux, but don't clear them out.
+ */
+int eal_hugepage_info_read(void);
+
#endif /* EAL_HUGEPAGES_H */
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index d336c96..7a0d742 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -805,13 +805,17 @@ rte_eal_init(int argc, char **argv)
"KNI module inserted\n");
}

- if (internal_config.no_hugetlbfs == 0 &&
- internal_config.process_type != RTE_PROC_SECONDARY &&
- eal_hugepage_info_init() < 0) {
- rte_eal_init_alert("Cannot get hugepage information.");
- rte_errno = EACCES;
- rte_atomic32_clear(&run_once);
- return -1;
+ if (internal_config.no_hugetlbfs == 0) {
+ /* rte_config isn't initialized yet */
+ ret = internal_config.process_type == RTE_PROC_PRIMARY ?
+ eal_hugepage_info_init() :
+ eal_hugepage_info_read();
+ if (ret < 0) {
+ rte_eal_init_alert("Cannot get hugepage information.");
+ rte_errno = EACCES;
+ rte_atomic32_clear(&run_once);
+ return -1;
+ }
}

if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 7e2475f..7a4adce 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -6,6 +6,7 @@
#include <sys/types.h>
#include <sys/file.h>
#include <dirent.h>
+#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
@@ -299,15 +300,9 @@ compare_hpi(const void *a, const void *b)
return hpi_b->hugepage_sz - hpi_a->hugepage_sz;
}

-/*
- * when we initialize the hugepage info, everything goes
- * to socket 0 by default. it will later get sorted by memory
- * initialization procedure.
- */
-int
-eal_hugepage_info_init(void)
-{
- const char dirent_start_text[] = "hugepages-";
+static int
+hugepage_info_init(bool clear_hugepages)
+{
+ const char dirent_start_text[] = "hugepages-";
const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
unsigned int i, total_pages, num_sizes = 0;
DIR *dir;
@@ -350,18 +345,20 @@ eal_hugepage_info_init(void)
continue;
}

- /* try to obtain a writelock */
- hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY);
+ if (clear_hugepages) {
+ /* try to obtain a writelock */
+ hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY);

- /* if blocking lock failed */
- if (flock(hpi->lock_descriptor, LOCK_EX) == -1) {
- RTE_LOG(CRIT, EAL,
- "Failed to lock hugepage directory!\n");
- break;
+ /* if blocking lock failed */
+ if (flock(hpi->lock_descriptor, LOCK_EX) == -1) {
+ RTE_LOG(CRIT, EAL,
+ "Failed to lock hugepage directory!\n");
+ break;
+ }
+ /* clear out the hugepages dir from unused pages */
+ if (clear_hugedir(hpi->hugedir) == -1)
+ break;
}
- /* clear out the hugepages dir from unused pages */
- if (clear_hugedir(hpi->hugedir) == -1)
- break;

/*
* first, try to put all hugepages into relevant sockets, but
@@ -417,10 +414,26 @@ eal_hugepage_info_init(void)
num_pages += hpi->num_pages[j];
}
if (internal_config.hugepage_info[i].hugedir != NULL &&
- num_pages > 0)
+ (num_pages > 0 || !clear_hugepages))
return 0;
}

/* no valid hugepage mounts available, return error */
return -1;
}
+
+int eal_hugepage_info_read(void)
+{
+ return hugepage_info_init(false);
+}
+
+/*
+ * when we initialize the hugepage info, everything goes
+ * to socket 0 by default. it will later get sorted by memory
+ * initialization procedure.
+ */
+int
+eal_hugepage_info_init(void)
+{
+ return hugepage_info_init(true);
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index e0b4988..f74291f 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -1569,6 +1569,22 @@ eal_legacy_hugepage_attach(void)
return -1;
}

+static int
+eal_hugepage_attach(void)
+{
+ if (eal_memalloc_sync_with_primary()) {
+ RTE_LOG(ERR, EAL, "Could not map memory from primary process\n");
+ if (aslr_enabled() > 0) {
+ RTE_LOG(ERR, EAL, "It is recommended to "
+ "disable ASLR in the kernel "
+ "and retry running both primary "
+ "and secondary processes\n");
+ }
+ return -1;
+ }
+ return 0;
+}
+
int
rte_eal_hugepage_init(void)
{
@@ -1580,11 +1596,9 @@ rte_eal_hugepage_init(void)
int
rte_eal_hugepage_attach(void)
{
- if (internal_config.legacy_mem)
- return eal_legacy_hugepage_attach();
- else
- RTE_LOG(ERR, EAL, "Secondary processes aren't supported yet\n");
- return -1;
+ return internal_config.legacy_mem ?
+ eal_legacy_hugepage_attach() :
+ eal_hugepage_attach();
}

int
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:58 UTC
Permalink
Also, rewrite VFIO to rely on memory callbacks instead of manually
registering memory with VFIO. Callbacks will only be registered if
VFIO is enabled.
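
For illustration, a minimal sketch (not part of this patch) of how another
subsystem could hook into the same notification mechanism; the registration
call and event type follow this series, while the callback body is purely
illustrative.

static void
my_mem_event_cb(enum rte_mem_event type, const void *addr, size_t len)
{
        if (type == RTE_MEM_EVENT_ALLOC)
                RTE_LOG(DEBUG, USER1, "memory added: %p, len %zu\n",
                        addr, len);
        else
                RTE_LOG(DEBUG, USER1, "memory removed: %p, len %zu\n",
                        addr, len);
}

/* somewhere during initialization: */
rte_mem_event_register_callback("my_mem_event_cb", my_mem_event_cb);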

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/malloc_heap.c | 21 +++++++++++++++++
lib/librte_eal/linuxapp/eal/eal_memalloc.c | 37 +++++++++++++++++++++---------
lib/librte_eal/linuxapp/eal/eal_vfio.c | 35 ++++++++++++++++++++++++++++
3 files changed, 82 insertions(+), 11 deletions(-)

diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 9935238..d932ead 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -223,6 +223,7 @@ try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz,
void *map_addr;
size_t map_len;
int n_pages;
+ bool callback_triggered = false;

map_len = RTE_ALIGN_CEIL(align + elt_size +
MALLOC_ELEM_TRAILER_LEN, pg_sz);
@@ -242,14 +243,25 @@ try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz,

map_addr = ms[0]->addr;

+ /* notify user about changes in memory map */
+ eal_memalloc_notify(RTE_MEM_EVENT_ALLOC, map_addr, map_len);
+
/* notify other processes that this has happened */
if (request_sync()) {
/* we couldn't ensure all processes have mapped memory,
* so free it back and notify everyone that it's been
* freed back.
+ *
+ * technically, we could've avoided adding memory addresses to
+ * the map, but that would've led to inconsistent behavior
+ * between primary and secondary processes, as those get
+ * callbacks during sync. therefore, force primary process to
+ * do alloc-and-rollback syncs as well.
*/
+ callback_triggered = true;
goto free_elem;
}
+
heap->total_size += map_len;

RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
@@ -260,6 +272,9 @@ try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz,
return 0;

free_elem:
+ if (callback_triggered)
+ eal_memalloc_notify(RTE_MEM_EVENT_FREE, map_addr, map_len);
+
rollback_expand_heap(ms, n_pages, elem, map_addr, map_len);

request_sync();
@@ -615,6 +630,10 @@ malloc_heap_free(struct malloc_elem *elem)
heap->total_size -= n_pages * msl->hugepage_sz;

if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ /* notify user about changes in memory map */
+ eal_memalloc_notify(RTE_MEM_EVENT_FREE,
+ aligned_start, aligned_len);
+
/* don't care if any of this fails */
malloc_heap_free_pages(aligned_start, aligned_len);

@@ -637,6 +656,8 @@ malloc_heap_free(struct malloc_elem *elem)
* already removed from the heap, so it is, for all intents and
* purposes, hidden from the rest of DPDK even if some other
* process (including this one) may have these pages mapped.
+ *
+ * notifications about deallocated memory happen during sync.
*/
request_to_primary(&req);
}
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index 227d703..1008fae 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -34,7 +34,6 @@
#include <rte_eal.h>
#include <rte_memory.h>
#include <rte_spinlock.h>
-#include <rte_vfio.h>

#include "eal_filesystem.h"
#include "eal_internal_cfg.h"
@@ -480,10 +479,6 @@ alloc_page(struct rte_memseg *ms, void *addr, uint64_t size, int socket_id,
ms->iova = iova;
ms->socket_id = socket_id;

- /* map the segment so that VFIO has access to it */
- if (rte_eal_iova_mode() == RTE_IOVA_VA &&
- rte_vfio_dma_map(ms->addr_64, iova, size))
- RTE_LOG(DEBUG, EAL, "Cannot register segment with VFIO\n");
return 0;

mapped:
@@ -515,12 +510,6 @@ free_page(struct rte_memseg *ms, struct hugepage_info *hi,
char path[PATH_MAX];
int fd, ret;

- /* unmap the segment from VFIO */
- if (rte_eal_iova_mode() == RTE_IOVA_VA &&
- rte_vfio_dma_unmap(ms->addr_64, ms->iova, ms->len)) {
- RTE_LOG(DEBUG, EAL, "Cannot unregister segment with VFIO\n");
- }
-
if (mmap(ms->addr, ms->hugepage_sz, PROT_READ,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
MAP_FAILED) {
@@ -808,6 +797,19 @@ sync_chunk(struct rte_memseg_list *primary_msl,

diff_len = RTE_MIN(chunk_len, diff_len);

+ /* if we are freeing memory, notify the application */
+ if (!used) {
+ struct rte_memseg *ms;
+ void *start_va;
+ size_t len;
+
+ ms = rte_fbarray_get(l_arr, start);
+ start_va = ms->addr;
+ len = ms->len * diff_len;
+
+ eal_memalloc_notify(RTE_MEM_EVENT_FREE, start_va, len);
+ }
+
for (i = 0; i < diff_len; i++) {
struct rte_memseg *p_ms, *l_ms;
int seg_idx = start + i;
@@ -834,6 +836,19 @@ sync_chunk(struct rte_memseg_list *primary_msl,
}
}

+ /* if we just allocated memory, notify the application */
+ if (used) {
+ struct rte_memseg *ms;
+ void *start_va;
+ size_t len;
+
+ ms = rte_fbarray_get(l_arr, start);
+ start_va = ms->addr;
+ len = ms->len * diff_len;
+
+ eal_memalloc_notify(RTE_MEM_EVENT_ALLOC, start_va, len);
+ }
+
/* calculate how much we can advance until next chunk */
diff_len = used ?
rte_fbarray_find_contig_used(l_arr, start) :
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 8fe8984..d3c3b70 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -214,6 +214,37 @@ vfio_group_device_count(int vfio_group_fd)
return vfio_cfg.vfio_groups[i].devices;
}

+static void
+vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len)
+{
+ struct rte_memseg_list *msl;
+ struct rte_memseg *ms;
+ size_t cur_len = 0;
+ uint64_t pgsz;
+
+ msl = rte_mem_virt2memseg_list(addr);
+ pgsz = msl->hugepage_sz;
+
+ while (cur_len < len) {
+ const void *va = RTE_PTR_ADD(addr, cur_len);
+ uint64_t vfio_va, iova;
+
+ ms = rte_mem_virt2memseg(va, msl);
+ vfio_va = (uint64_t) (uintptr_t) va;
+ iova = ms->iova;
+
+ /* this never gets called in legacy mode, so we can be sure that
+ * each segment is a single page.
+ */
+ if (type == RTE_MEM_EVENT_ALLOC)
+ rte_vfio_dma_map(vfio_va, iova, pgsz);
+ else
+ rte_vfio_dma_unmap(vfio_va, iova, pgsz);
+
+ cur_len += pgsz;
+ }
+}
+
int
rte_vfio_clear_group(int vfio_group_fd)
{
@@ -507,6 +538,10 @@ rte_vfio_enable(const char *modname)
if (vfio_cfg.vfio_container_fd != -1) {
RTE_LOG(NOTICE, EAL, "VFIO support initialized\n");
vfio_cfg.vfio_enabled = 1;
+
+ /* register callback for mem events */
+ rte_mem_event_register_callback("vfio_mem_event_clb",
+ vfio_mem_event_callback);
} else {
RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n");
}
--
2.7.4
Anatoly Burakov
2018-03-07 16:57:01 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
drivers/net/avf/avf_ethdev.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/avf/avf_ethdev.c b/drivers/net/avf/avf_ethdev.c
index 4df6617..f69d697 100644
--- a/drivers/net/avf/avf_ethdev.c
+++ b/drivers/net/avf/avf_ethdev.c
@@ -1365,7 +1365,7 @@ avf_allocate_dma_mem_d(__rte_unused struct avf_hw *hw,
return AVF_ERR_PARAM;

snprintf(z_name, sizeof(z_name), "avf_dma_%"PRIu64, rte_rand());
- mz = rte_memzone_reserve_bounded(z_name, size, SOCKET_ID_ANY, 0,
+ mz = rte_memzone_reserve_bounded_contig(z_name, size, SOCKET_ID_ANY, 0,
alignment, RTE_PGSIZE_2M);
if (!mz)
return AVF_ERR_NO_MEMORY;
--
2.7.4
Anatoly Burakov
2018-03-07 16:57:09 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
Not sure if DMA-capable memzones are needed for vmxnet3.
Corrections welcome.

drivers/net/vmxnet3/vmxnet3_ethdev.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_ethdev.c b/drivers/net/vmxnet3/vmxnet3_ethdev.c
index 4e68aae..c787379 100644
--- a/drivers/net/vmxnet3/vmxnet3_ethdev.c
+++ b/drivers/net/vmxnet3/vmxnet3_ethdev.c
@@ -150,14 +150,15 @@ gpa_zone_reserve(struct rte_eth_dev *dev, uint32_t size,
if (!reuse) {
if (mz)
rte_memzone_free(mz);
- return rte_memzone_reserve_aligned(z_name, size, socket_id,
- 0, align);
+ return rte_memzone_reserve_aligned_contig(z_name, size,
+ socket_id, 0, align);
}

if (mz)
return mz;

- return rte_memzone_reserve_aligned(z_name, size, socket_id, 0, align);
+ return rte_memzone_reserve_aligned_contig(z_name, size, socket_id, 0,
+ align);
}

/**
--
2.7.4
Anatoly Burakov
2018-03-07 16:57:05 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
Acked-by: John Daley <***@cisco.com>
---

Notes:
It is not 100% clear that the second call to memzone_reserve
allocates DMA memory. Corrections welcome.

drivers/net/enic/enic_main.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/enic/enic_main.c b/drivers/net/enic/enic_main.c
index ec9d343..cb2a7ba 100644
--- a/drivers/net/enic/enic_main.c
+++ b/drivers/net/enic/enic_main.c
@@ -319,7 +319,7 @@ enic_alloc_consistent(void *priv, size_t size,
struct enic *enic = (struct enic *)priv;
struct enic_memzone_entry *mze;

- rz = rte_memzone_reserve_aligned((const char *)name,
+ rz = rte_memzone_reserve_aligned_contig((const char *)name,
size, SOCKET_ID_ANY, 0, ENIC_ALIGN);
if (!rz) {
pr_err("%s : Failed to allocate memory requested for %s\n",
@@ -787,7 +787,7 @@ int enic_alloc_wq(struct enic *enic, uint16_t queue_idx,
"vnic_cqmsg-%s-%d-%d", enic->bdf_name, queue_idx,
instance++);

- wq->cqmsg_rz = rte_memzone_reserve_aligned((const char *)name,
+ wq->cqmsg_rz = rte_memzone_reserve_aligned_contig((const char *)name,
sizeof(uint32_t),
SOCKET_ID_ANY, 0,
ENIC_ALIGN);
--
2.7.4
Anatoly Burakov
2018-03-07 16:57:07 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
Doing "grep -R rte_memzone_reserve drivers/net/qede" returns the following:

drivers/net/qede/qede_fdir.c: mz = rte_memzone_reserve_aligned(mz_name, QEDE_MAX_FDIR_PKT_LEN,
drivers/net/qede/base/bcm_osal.c: mz = rte_memzone_reserve_aligned_contig(mz_name, size,
drivers/net/qede/base/bcm_osal.c: mz = rte_memzone_reserve_aligned_contig(mz_name, size, socket_id, 0,

I took a brief look at memzone in qede_fdir and it didn't look like memzone
was used for DMA, so I left it alone. Corrections welcome.

drivers/net/qede/base/bcm_osal.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/qede/base/bcm_osal.c b/drivers/net/qede/base/bcm_osal.c
index fe42f32..707d553 100644
--- a/drivers/net/qede/base/bcm_osal.c
+++ b/drivers/net/qede/base/bcm_osal.c
@@ -135,7 +135,7 @@ void *osal_dma_alloc_coherent(struct ecore_dev *p_dev,
if (core_id == (unsigned int)LCORE_ID_ANY)
core_id = 0;
socket_id = rte_lcore_to_socket_id(core_id);
- mz = rte_memzone_reserve_aligned(mz_name, size,
+ mz = rte_memzone_reserve_aligned_contig(mz_name, size,
socket_id, 0, RTE_CACHE_LINE_SIZE);
if (!mz) {
DP_ERR(p_dev, "Unable to allocate DMA memory "
@@ -174,7 +174,8 @@ void *osal_dma_alloc_coherent_aligned(struct ecore_dev *p_dev,
if (core_id == (unsigned int)LCORE_ID_ANY)
core_id = 0;
socket_id = rte_lcore_to_socket_id(core_id);
- mz = rte_memzone_reserve_aligned(mz_name, size, socket_id, 0, align);
+ mz = rte_memzone_reserve_aligned_contig(mz_name, size, socket_id, 0,
+ align);
if (!mz) {
DP_ERR(p_dev, "Unable to allocate DMA memory "
"of size %zu bytes - %s\n",
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:56 UTC
Permalink
This enables multiprocess synchronization for memory hotplug
requests at runtime (as opposed to initialization).

The basic workflow is as follows. The primary process always does the
initial mapping and unmapping, and secondary processes always follow
the primary's page map. Only one allocation request can be active at
any one time.

When the primary allocates memory, it ensures that all other processes
have allocated the same set of hugepages successfully; otherwise any
allocations that were made are rolled back and the memory is freed
back. The heap is locked throughout the process, so no race conditions
can happen.

When the primary frees memory, it removes the area from the heap,
deallocates the affected pages, and notifies other processes of the
deallocation. Since the heap no longer contains that memory chunk, the
area basically becomes invisible to other processes even if they
happen to fail to unmap that specific set of pages, so it is safe to
ignore the results of sync requests.

When a secondary allocates memory, it does not do so by itself.
Instead, it sends a request to the primary process to try to allocate
pages of the specified size and on the specified socket, such that the
heap allocation request in question can complete. The primary process
then sends all secondaries (including the requestor) a separate
notification of the allocated pages, and expects all secondary
processes to report success before considering the pages "allocated".

Only after the primary process ensures that all memory has been
successfully allocated in every secondary process will it respond
positively to the initial request and let the secondary proceed with
the allocation. Since the heap now has memory that can satisfy the
allocation request, and it was locked all this time (so no other
allocations could take place), the secondary process will be able to
allocate memory from the heap.
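
For illustration, a condensed sketch (not part of this patch) of the request
a secondary sends to the primary; struct and field names follow malloc_mp.h
from this series, and the values shown are placeholders.

        struct malloc_mp_req req;

        memset(&req, 0, sizeof(req));
        req.t = REQ_TYPE_ALLOC;
        req.alloc_req.page_sz = pg_sz;     /* placeholder page size */
        req.alloc_req.elt_size = elt_size; /* placeholder element size */
        req.alloc_req.socket = socket;     /* placeholder socket id */
        req.alloc_req.heap = heap;         /* the heap lives in shared memory */

        if (request_to_primary(&req) != 0 || req.result != REQ_RESULT_SUCCESS)
                return -1; /* the primary could not satisfy the request */

        /* the heap now has enough free space for the local allocation */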

When a secondary frees memory, it hides the pages to be deallocated
from the heap. Then, it sends a deallocation request to the primary
process, which deallocates the pages itself, and then sends a separate
sync request to all other processes (including the requestor) to unmap
the same pages. This way, even if a secondary fails to notify other
processes of this deallocation, that memory will become invisible
to other processes, and will not be allocated from again.

So, to summarize: address space will only become part of the heap
if the primary process can ensure that all other processes have
allocated this memory successfully. If anything goes wrong, the
worst thing that could happen is that a page will "leak" and will
not be available to either DPDK or the system, as some process
will still hold onto it. It's not an actual leak, as we can account
for the page - it's just that none of the processes will be able
to use this page for anything useful until the primary allocates
from it again.

Because the underlying DPDK IPC implementation is single-threaded,
some asynchronous magic had to be done, as we need to complete
several requests before we can definitively allow a secondary process
to use the allocated memory (namely, it has to be present in all other
secondary processes before it can be used). Additionally, only one
allocation request is allowed to be submitted at a time.

Memory allocation requests are only allowed when no secondary
processes are currently initializing. To enforce that, a shared rwlock
is used: it is taken for reading on init (so that several secondaries
can initialize concurrently) and for writing when making allocation
requests (so that either secondary init has to wait, or the allocation
request has to wait until all processes have initialized).

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
v2: - fixed deadlocking on init problem
- reverted rte_panic changes (fixed by changes in IPC instead)

This problem is evidently complex to solve without a multithreaded
IPC implementation. An alternative approach would be to process
each individual message in its own thread (or at least spawn a
thread per incoming request) - that way, we can send requests
while responding to another request, and this problem becomes
trivial to solve (and in fact it was solved that way initially,
before my aversion to certain other programming languages kicked
in).

Is the added complexity worth saving a couple of thread spin-ups
here and there?

lib/librte_eal/bsdapp/eal/Makefile | 1 +
lib/librte_eal/common/eal_common_memory.c | 16 +-
lib/librte_eal/common/include/rte_eal_memconfig.h | 3 +
lib/librte_eal/common/malloc_heap.c | 255 ++++++--
lib/librte_eal/common/malloc_mp.c | 723 ++++++++++++++++++++++
lib/librte_eal/common/malloc_mp.h | 86 +++
lib/librte_eal/common/meson.build | 1 +
lib/librte_eal/linuxapp/eal/Makefile | 1 +
8 files changed, 1040 insertions(+), 46 deletions(-)
create mode 100644 lib/librte_eal/common/malloc_mp.c
create mode 100644 lib/librte_eal/common/malloc_mp.h

diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile
index 907e30d..250d5c1 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -59,6 +59,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_fbarray.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_malloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_elem.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_heap.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_mp.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_keepalive.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_service.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_reciprocal.c
diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index a571e24..0a0aa88 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -543,24 +543,34 @@ rte_mem_lock_page(const void *virt)
int
rte_eal_memory_init(void)
{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
int retval;
RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n");

+ if (!mcfg)
+ return -1;
+
+ /* lock mem hotplug here, to prevent races while we init */
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+
retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
memseg_primary_init() :
memseg_secondary_init();

if (retval < 0)
- return -1;
+ goto fail;

retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
rte_eal_hugepage_init() :
rte_eal_hugepage_attach();
if (retval < 0)
- return -1;
+ goto fail;

if (internal_config.no_shconf == 0 && rte_eal_memdevice_init() < 0)
- return -1;
+ goto fail;

return 0;
+fail:
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+ return -1;
}
diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
index d653d57..c4b36f6 100644
--- a/lib/librte_eal/common/include/rte_eal_memconfig.h
+++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
@@ -60,6 +60,9 @@ struct rte_mem_config {
rte_rwlock_t qlock; /**< used for tailq operation for thread safe. */
rte_rwlock_t mplock; /**< only used by mempool LIB for thread-safe. */

+ rte_rwlock_t memory_hotplug_lock;
+ /**< indicates whether memory hotplug request is in progress. */
+
/* memory segments and zones */
struct rte_fbarray memzones; /**< Memzone descriptors. */

diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 7a3d0f3..9935238 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -10,6 +10,7 @@
#include <sys/queue.h>

#include <rte_memory.h>
+#include <rte_errno.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_launch.h>
@@ -26,6 +27,7 @@
#include "eal_memalloc.h"
#include "malloc_elem.h"
#include "malloc_heap.h"
+#include "malloc_mp.h"

static unsigned
check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
@@ -81,8 +83,6 @@ malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl,

malloc_elem_free_list_insert(elem);

- heap->total_size += len;
-
return elem;
}

@@ -146,33 +146,42 @@ heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,
return elem == NULL ? NULL : (void *)(&elem[1]);
}

-static int
-try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
+/* this function is exposed in malloc_mp.h */
+void
+rollback_expand_heap(struct rte_memseg **ms, int n_pages,
+ struct malloc_elem *elem, void *map_addr, size_t map_len)
+{
+ int i;
+
+ if (elem != NULL) {
+ malloc_elem_free_list_remove(elem);
+ malloc_elem_hide_region(elem, map_addr, map_len);
+ }
+
+ for (i = 0; i < n_pages; i++)
+ eal_memalloc_free_page(ms[i]);
+}
+
+/* this function is exposed in malloc_mp.h */
+struct malloc_elem *
+alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
int socket, unsigned int flags, size_t align, size_t bound,
- bool contig)
+ bool contig, struct rte_memseg **ms, int n_pages)
{
size_t map_len, data_start_offset;
struct rte_memseg_list *msl;
- struct rte_memseg **ms;
- struct malloc_elem *elem;
- int i, n_pages, allocd_pages;
+ struct malloc_elem *elem = NULL;
+ int allocd_pages;
void *ret, *map_addr, *data_start;

- align = RTE_MAX(align, MALLOC_ELEM_HEADER_LEN);
- map_len = RTE_ALIGN_CEIL(align + elt_size + MALLOC_ELEM_TRAILER_LEN,
- pg_sz);
-
- n_pages = map_len / pg_sz;
+ map_len = n_pages * pg_sz;

- /* we can't know in advance how many pages we'll need, so malloc */
- ms = malloc(sizeof(*ms) * n_pages);
-
- allocd_pages = eal_memalloc_alloc_page_bulk(ms, n_pages, pg_sz, socket,
- true);
+ allocd_pages = eal_memalloc_alloc_page_bulk(ms, n_pages, pg_sz,
+ socket, true);

/* make sure we've allocated our pages... */
if (allocd_pages != n_pages)
- goto free_ms;
+ return NULL;

map_addr = ms[0]->addr;
msl = rte_mem_virt2memseg_list(map_addr);
@@ -184,7 +193,7 @@ try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
n_pages * msl->hugepage_sz)) {
RTE_LOG(DEBUG, EAL, "%s(): couldn't allocate physically contiguous space\n",
__func__);
- goto free_pages;
+ goto fail;
}

/* add newly minted memsegs to malloc heap */
@@ -195,7 +204,53 @@ try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
contig);

if (ret == NULL)
+ goto fail;
+
+ return elem;
+
+fail:
+ rollback_expand_heap(ms, n_pages, elem, map_addr, map_len);
+ return NULL;
+}
+
+static int
+try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz,
+ size_t elt_size, int socket, unsigned int flags, size_t align,
+ size_t bound, bool contig)
+{
+ struct malloc_elem *elem;
+ struct rte_memseg **ms;
+ void *map_addr;
+ size_t map_len;
+ int n_pages;
+
+ map_len = RTE_ALIGN_CEIL(align + elt_size +
+ MALLOC_ELEM_TRAILER_LEN, pg_sz);
+ n_pages = map_len / pg_sz;
+
+ /* we can't know in advance how many pages we'll need, so we malloc */
+ ms = malloc(sizeof(*ms) * n_pages);
+
+ if (ms == NULL)
+ return -1;
+
+ elem = alloc_pages_on_heap(heap, pg_sz, elt_size, socket, flags, align,
+ bound, contig, ms, n_pages);
+
+ if (elem == NULL)
+ goto free_ms;
+
+ map_addr = ms[0]->addr;
+
+ /* notify other processes that this has happened */
+ if (request_sync()) {
+ /* we couldn't ensure all processes have mapped memory,
+ * so free it back and notify everyone that it's been
+ * freed back.
+ */
goto free_elem;
+ }
+ heap->total_size += map_len;

RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
socket, map_len >> 20ULL);
@@ -205,13 +260,9 @@ try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
return 0;

free_elem:
- malloc_elem_free_list_remove(elem);
- malloc_elem_hide_region(elem, map_addr, map_len);
- heap->total_size -= map_len;
+ rollback_expand_heap(ms, n_pages, elem, map_addr, map_len);

-free_pages:
- for (i = 0; i < n_pages; i++)
- eal_memalloc_free_page(ms[i]);
+ request_sync();
free_ms:
free(ms);

@@ -219,6 +270,57 @@ try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
}

static int
+try_expand_heap_secondary(struct malloc_heap *heap, uint64_t pg_sz,
+ size_t elt_size, int socket, unsigned int flags, size_t align,
+ size_t bound, bool contig)
+{
+ struct malloc_mp_req req;
+ int req_result;
+
+ req.t = REQ_TYPE_ALLOC;
+ req.alloc_req.align = align;
+ req.alloc_req.bound = bound;
+ req.alloc_req.contig = contig;
+ req.alloc_req.flags = flags;
+ req.alloc_req.elt_size = elt_size;
+ req.alloc_req.page_sz = pg_sz;
+ req.alloc_req.socket = socket;
+ req.alloc_req.heap = heap; /* it's in shared memory */
+
+ req_result = request_to_primary(&req);
+
+ if (req_result != 0)
+ return -1;
+
+ if (req.result != REQ_RESULT_SUCCESS)
+ return -1;
+
+ return 0;
+}
+
+static int
+try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
+ int socket, unsigned int flags, size_t align, size_t bound,
+ bool contig)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int ret;
+
+ rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ ret = try_expand_heap_primary(heap, pg_sz, elt_size, socket,
+ flags, align, bound, contig);
+ } else {
+ ret = try_expand_heap_secondary(heap, pg_sz, elt_size, socket,
+ flags, align, bound, contig);
+ }
+
+ rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
+ return ret;
+}
+
+static int
compare_pagesz(const void *a, const void *b)
{
const struct rte_memseg_list * const*mpa = a;
@@ -236,11 +338,10 @@ compare_pagesz(const void *a, const void *b)
}

static int
-alloc_mem_on_socket(size_t size, int socket, unsigned int flags, size_t align,
- size_t bound, bool contig)
+alloc_more_mem_on_socket(struct malloc_heap *heap, size_t size, int socket,
+ unsigned int flags, size_t align, size_t bound, bool contig)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
struct rte_memseg_list *requested_msls[RTE_MAX_MEMSEG_LISTS];
struct rte_memseg_list *other_msls[RTE_MAX_MEMSEG_LISTS];
uint64_t requested_pg_sz[RTE_MAX_MEMSEG_LISTS];
@@ -355,7 +456,7 @@ heap_alloc_on_socket(const char *type, size_t size, int socket,

rte_spinlock_lock(&(heap->lock));

- align = align == 0 ? 1 : align;
+ align = RTE_MAX(align == 0 ? 1 : align, MALLOC_ELEM_HEADER_LEN);

/* for legacy mode, try once and with all flags */
if (internal_config.legacy_mem) {
@@ -372,7 +473,8 @@ heap_alloc_on_socket(const char *type, size_t size, int socket,
if (ret != NULL)
goto alloc_unlock;

- if (!alloc_mem_on_socket(size, socket, flags, align, bound, contig)) {
+ if (!alloc_more_mem_on_socket(heap, size, socket, flags, align, bound,
+ contig)) {
ret = heap_alloc(heap, type, size, flags, align, bound, contig);

/* this should have succeeded */
@@ -424,14 +526,40 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg,
return NULL;
}

+/* this function is exposed in malloc_mp.h */
+int
+malloc_heap_free_pages(void *aligned_start, size_t aligned_len)
+{
+ int n_pages, page_idx, max_page_idx;
+ struct rte_memseg_list *msl;
+
+ msl = rte_mem_virt2memseg_list(aligned_start);
+ if (msl == NULL)
+ return -1;
+
+ n_pages = aligned_len / msl->hugepage_sz;
+ page_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) /
+ msl->hugepage_sz;
+ max_page_idx = page_idx + n_pages;
+
+ for (; page_idx < max_page_idx; page_idx++) {
+ struct rte_memseg *ms;
+
+ ms = rte_fbarray_get(&msl->memseg_arr, page_idx);
+ eal_memalloc_free_page(ms);
+ }
+ return 0;
+}
+
int
malloc_heap_free(struct malloc_elem *elem)
{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct malloc_heap *heap;
void *start, *aligned_start, *end, *aligned_end;
size_t len, aligned_len;
struct rte_memseg_list *msl;
- int n_pages, page_idx, max_page_idx, ret;
+ int n_pages, ret;

if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
return -1;
@@ -463,30 +591,60 @@ malloc_heap_free(struct malloc_elem *elem)
aligned_end = RTE_PTR_ALIGN_FLOOR(end, msl->hugepage_sz);

aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start);
+ n_pages = aligned_len / msl->hugepage_sz;

/* can't free anything */
- if (aligned_len < msl->hugepage_sz)
+ if (n_pages == 0)
goto free_unlock;

+ rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);
+
+ /*
+ * we allow secondary processes to clear the heap of this allocated
+ * memory because it is safe to do so, as even if notifications about
+ * unmapped pages don't make it to other processes, heap is shared
+ * across all processes, and will become empty of this memory anyway,
+ * and nothing can allocate it back unless the primary process is able
+ * to deliver an allocation message to every single running process.
+ */
+
malloc_elem_free_list_remove(elem);

malloc_elem_hide_region(elem, (void *) aligned_start, aligned_len);

- /* we don't really care if we fail to deallocate memory */
- n_pages = aligned_len / msl->hugepage_sz;
- page_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) / msl->hugepage_sz;
- max_page_idx = page_idx + n_pages;
+ heap->total_size -= n_pages * msl->hugepage_sz;

- for (; page_idx < max_page_idx; page_idx++) {
- struct rte_memseg *ms;
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ /* don't care if any of this fails */
+ malloc_heap_free_pages(aligned_start, aligned_len);

- ms = rte_fbarray_get(&msl->memseg_arr, page_idx);
- eal_memalloc_free_page(ms);
- heap->total_size -= msl->hugepage_sz;
+ request_sync();
+ } else {
+ struct malloc_mp_req req;
+
+ req.t = REQ_TYPE_FREE;
+ req.free_req.addr = aligned_start;
+ req.free_req.len = aligned_len;
+
+ /*
+ * we request primary to deallocate pages, but we don't do it
+ * in this thread. instead, we notify primary that we would like
+ * to deallocate pages, and this process will receive another
+ * request (in parallel) that will do it for us on another
+ * thread.
+ *
+ * we also don't really care if this succeeds - the data is
+ * already removed from the heap, so it is, for all intents and
+ * purposes, hidden from the rest of DPDK even if some other
+ * process (including this one) may have these pages mapped.
+ */
+ request_to_primary(&req);
}

RTE_LOG(DEBUG, EAL, "Heap on socket %d was shrunk by %zdMB\n",
msl->socket_id, aligned_len >> 20ULL);
+
+ rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
free_unlock:
rte_spinlock_unlock(&(heap->lock));
return ret;
@@ -576,8 +734,16 @@ rte_eal_malloc_heap_init(void)
int msl_idx;
struct rte_memseg_list *msl;

- if (mcfg == NULL)
+ if (register_mp_requests()) {
+ RTE_LOG(ERR, EAL, "Couldn't register malloc multiprocess actions\n");
return -1;
+ }
+
+ /* unlock mem hotplug here. it's safe for primary as no requests can
+ * even come before primary itself is fully initialized, and secondaries
+ * do not need to initialize the heap.
+ */
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);

/* secondary processes don't need to initialize heap */
if (rte_eal_process_type() == RTE_PROC_SECONDARY)
@@ -604,6 +770,7 @@ rte_eal_malloc_heap_init(void)
rte_fbarray_get(arr, ms_idx);
malloc_heap_add_memory(heap, msl,
ms->addr, ms->len);
+ heap->total_size += ms->len;
ms_idx++;
RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
msl->socket_id, ms->len >> 20ULL);
@@ -630,6 +797,8 @@ rte_eal_malloc_heap_init(void)
*/
malloc_heap_add_memory(heap, msl, start_seg->addr, len);

+ heap->total_size += len;
+
RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
msl->socket_id, len >> 20ULL);

diff --git a/lib/librte_eal/common/malloc_mp.c b/lib/librte_eal/common/malloc_mp.c
new file mode 100644
index 0000000..8052680
--- /dev/null
+++ b/lib/librte_eal/common/malloc_mp.c
@@ -0,0 +1,723 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <string.h>
+#include <sys/time.h>
+
+#include <rte_alarm.h>
+#include <rte_errno.h>
+
+#include "eal_memalloc.h"
+
+#include "malloc_elem.h"
+#include "malloc_mp.h"
+
+#define MP_ACTION_SYNC "mp_malloc_sync"
+/**< request sent by primary process to notify of changes in memory map */
+#define MP_ACTION_ROLLBACK "mp_malloc_rollback"
+/**< request sent by primary process to notify of changes in memory map. this is
+ * essentially a regular sync request, but we cannot send sync requests while
+ * another one is in progress, and we might have to - therefore, we do this as
+ * a separate callback.
+ */
+#define MP_ACTION_REQUEST "mp_malloc_request"
+/**< request sent by secondary process to ask for allocation/deallocation */
+#define MP_ACTION_RESPONSE "mp_malloc_response"
+/**< response sent to secondary process to indicate result of request */
+
+#define MP_TIMEOUT_S 5 /**< 5 seconds timeouts */
+
+/* when we're allocating, we need to store some state to ensure that we can
+ * roll back later
+ */
+struct primary_alloc_req_state {
+ struct malloc_heap *heap;
+ struct rte_memseg **ms;
+ int ms_len;
+ struct malloc_elem *elem;
+ void *map_addr;
+ size_t map_len;
+};
+
+enum req_state {
+ REQ_STATE_INACTIVE = 0,
+ REQ_STATE_ACTIVE,
+ REQ_STATE_COMPLETE
+};
+
+struct mp_request {
+ TAILQ_ENTRY(mp_request) next;
+ struct malloc_mp_req user_req; /**< contents of request */
+ pthread_cond_t cond; /**< variable we use to time out on this request */
+ enum req_state state; /**< indicate status of this request */
+ struct primary_alloc_req_state alloc_state;
+};
+
+/*
+ * We could've used just a single request, but it may be possible for
+ * secondaries to timeout earlier than the primary, and send a new request while
+ * primary is still expecting replies to the old one. Therefore, each new
+ * request will get assigned a new ID, which is how we will distinguish between
+ * expected and unexpected messages.
+ */
+TAILQ_HEAD(mp_request_list, mp_request);
+static struct {
+ struct mp_request_list list;
+ pthread_mutex_t lock;
+} mp_request_list = {
+ .list = TAILQ_HEAD_INITIALIZER(mp_request_list.list),
+ .lock = PTHREAD_MUTEX_INITIALIZER
+};
+
+/**
+ * General workflow is the following:
+ *
+ * Allocation:
+ * S: send request to primary
+ * P: attempt to allocate memory
+ * if failed, sendmsg failure
+ * if success, send sync request
+ * S: if received msg of failure, quit
+ * if received sync request, synchronize memory map and reply with result
+ * P: if received sync request result
+ * if success, sendmsg success
+ * if failure, roll back allocation and send a rollback request
+ * S: if received msg of success, quit
+ * if received rollback request, synchronize memory map and reply with result
+ * P: if received sync request result
+ * sendmsg sync request result
+ * S: if received msg, quit
+ *
+ * Aside from timeouts, there are three points where we can quit:
+ * - if allocation failed straight away
+ * - if allocation and sync request succeeded
+ * - if allocation succeeded, sync request failed, allocation rolled back and
+ * rollback request received (irrespective of whether it succeeded or failed)
+ *
+ * Deallocation:
+ * S: send request to primary
+ * P: attempt to deallocate memory
+ * if failed, sendmsg failure
+ * if success, send sync request
+ * S: if received msg of failure, quit
+ * if received sync request, synchronize memory map and reply with result
+ * P: if received sync request result
+ * sendmsg sync request result
+ * S: if received msg, quit
+ *
+ * There is no "rollback" from deallocation, as it's safe to have some memory
+ * mapped in some processes - it's absent from the heap, so it won't get used.
+ */
+
+static struct mp_request *
+find_request_by_id(uint64_t id)
+{
+ struct mp_request *req;
+ TAILQ_FOREACH(req, &mp_request_list.list, next) {
+ if (req->user_req.id == id)
+ break;
+ }
+ return req;
+}
+
+/* this ID is, like, totally guaranteed to be absolutely unique. pinky swear. */
+static uint64_t
+get_unique_id(void)
+{
+ uint64_t id;
+ do {
+ id = rte_rand();
+ } while (find_request_by_id(id) != NULL);
+ return id;
+}
+
+/* secondary will respond to sync requests thusly */
+static int
+handle_sync(const struct rte_mp_msg *msg, const void *peer)
+{
+ struct rte_mp_msg reply = {0};
+ const struct malloc_mp_req *req =
+ (const struct malloc_mp_req *)msg->param;
+ struct malloc_mp_req *resp =
+ (struct malloc_mp_req *)reply.param;
+ int ret;
+
+ if (req->t != REQ_TYPE_SYNC) {
+ RTE_LOG(ERR, EAL, "Unexpected request from primary\n");
+ return -1;
+ }
+
+ reply.num_fds = 0;
+ snprintf(reply.name, sizeof(reply.name), "%s", msg->name);
+ reply.len_param = sizeof(*resp);
+
+ ret = eal_memalloc_sync_with_primary();
+
+ resp->t = REQ_TYPE_SYNC;
+ resp->id = req->id;
+ resp->result = ret == 0 ? REQ_RESULT_SUCCESS : REQ_RESULT_FAIL;
+
+ rte_mp_reply(&reply, peer);
+
+ return 0;
+}
+
+static int
+handle_alloc_request(const struct malloc_mp_req *m,
+ struct mp_request *req)
+{
+ const struct malloc_req_alloc *ar = &m->alloc_req;
+ struct malloc_heap *heap;
+ struct malloc_elem *elem;
+ struct rte_memseg **ms;
+ size_t map_len;
+ int n_pages;
+
+ map_len = RTE_ALIGN_CEIL(ar->align + ar->elt_size +
+ MALLOC_ELEM_TRAILER_LEN, ar->page_sz);
+ n_pages = map_len / ar->page_sz;
+
+ heap = ar->heap;
+
+ /* we can't know in advance how many pages we'll need, so we malloc */
+ ms = malloc(sizeof(*ms) * n_pages);
+
+ if (ms == NULL) {
+ RTE_LOG(ERR, EAL, "Couldn't allocate memory for request state\n");
+ goto fail;
+ }
+
+ elem = alloc_pages_on_heap(heap, ar->page_sz, ar->elt_size, ar->socket,
+ ar->flags, ar->align, ar->bound, ar->contig, ms,
+ n_pages);
+
+ if (elem == NULL)
+ goto fail;
+
+ /* we have succeeded in allocating memory, but we still need to sync
+ * with other processes. however, since DPDK IPC is single-threaded, we
+ * send an asynchronous request and exit this callback.
+ */
+
+ req->alloc_state.ms = ms;
+ req->alloc_state.ms_len = n_pages;
+ req->alloc_state.map_addr = ms[0]->addr;
+ req->alloc_state.map_len = map_len;
+ req->alloc_state.elem = elem;
+ req->alloc_state.heap = heap;
+
+ return 0;
+fail:
+ free(ms);
+ return -1;
+}
+
+/* first stage of primary handling requests from secondary */
+static int
+handle_request(const struct rte_mp_msg *msg, const void *peer __rte_unused)
+{
+ const struct malloc_mp_req *m =
+ (const struct malloc_mp_req *)msg->param;
+ struct mp_request *entry;
+ int ret;
+
+ /* lock access to request */
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ /* make sure it's not a dupe */
+ entry = find_request_by_id(m->id);
+ if (entry != NULL) {
+ RTE_LOG(ERR, EAL, "Duplicate request id\n");
+ goto fail;
+ }
+
+ entry = malloc(sizeof(*entry));
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "Unable to allocate memory for request\n");
+ goto fail;
+ }
+
+ /* erase all data */
+ memset(entry, 0, sizeof(*entry));
+
+ if (m->t == REQ_TYPE_ALLOC) {
+ ret = handle_alloc_request(m, entry);
+ } else if (m->t == REQ_TYPE_FREE) {
+ ret = malloc_heap_free_pages(m->free_req.addr,
+ m->free_req.len);
+ } else {
+ RTE_LOG(ERR, EAL, "Unexpected request from secondary\n");
+ goto fail;
+ }
+
+ if (ret != 0) {
+ struct rte_mp_msg resp_msg;
+ struct malloc_mp_req *resp =
+ (struct malloc_mp_req *)resp_msg.param;
+
+ /* send failure message straight away */
+ resp_msg.num_fds = 0;
+ resp_msg.len_param = sizeof(*resp);
+ snprintf(resp_msg.name, sizeof(resp_msg.name), "%s",
+ MP_ACTION_RESPONSE);
+
+ resp->t = m->t;
+ resp->result = REQ_RESULT_FAIL;
+ resp->id = m->id;
+
+ if (rte_mp_sendmsg(&resp_msg)) {
+ RTE_LOG(ERR, EAL, "Couldn't send response\n");
+ goto fail;
+ }
+ /* we did not modify the request */
+ free(entry);
+ } else {
+ struct rte_mp_msg sr_msg = {0};
+ struct malloc_mp_req *sr =
+ (struct malloc_mp_req *)sr_msg.param;
+ struct timespec ts;
+
+ /* we can do something, so send sync request asynchronously */
+ sr_msg.num_fds = 0;
+ sr_msg.len_param = sizeof(*sr);
+ snprintf(sr_msg.name, sizeof(sr_msg.name), "%s",
+ MP_ACTION_SYNC);
+
+ ts.tv_nsec = 0;
+ ts.tv_sec = MP_TIMEOUT_S;
+
+ /* sync requests carry no data */
+ sr->t = REQ_TYPE_SYNC;
+ sr->id = m->id;
+
+ /* there may be stray timeout still waiting */
+ do {
+ ret = rte_mp_request_async(&sr_msg, &ts);
+ } while (ret != 0 && rte_errno == EEXIST);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Couldn't send sync request\n");
+ if (m->t == REQ_TYPE_ALLOC)
+ free(entry->alloc_state.ms);
+ goto fail;
+ }
+
+ /* mark request as in progress */
+ memcpy(&entry->user_req, m, sizeof(*m));
+ entry->state = REQ_STATE_ACTIVE;
+
+ TAILQ_INSERT_TAIL(&mp_request_list.list, entry, next);
+ }
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return 0;
+fail:
+ pthread_mutex_unlock(&mp_request_list.lock);
+ free(entry);
+ return -1;
+}
+
+/* callback for asynchronous sync requests for primary. this will either do a
+ * sendmsg with results, or trigger rollback request.
+ */
+static int
+handle_sync_response(const struct rte_mp_msg *request,
+ const struct rte_mp_reply *reply)
+{
+ enum malloc_req_result result;
+ struct mp_request *entry;
+ const struct malloc_mp_req *mpreq =
+ (const struct malloc_mp_req *)request->param;
+ int i;
+
+ /* lock the request */
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ entry = find_request_by_id(mpreq->id);
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "Wrong request ID\n");
+ goto fail;
+ }
+
+ result = REQ_RESULT_SUCCESS;
+
+ if (reply->nb_received != reply->nb_sent)
+ result = REQ_RESULT_FAIL;
+
+ for (i = 0; i < reply->nb_received; i++) {
+ struct malloc_mp_req *resp =
+ (struct malloc_mp_req *)reply->msgs[i].param;
+
+ if (resp->t != REQ_TYPE_SYNC) {
+ RTE_LOG(ERR, EAL, "Unexpected response to sync request\n");
+ result = REQ_RESULT_FAIL;
+ break;
+ }
+ if (resp->id != entry->user_req.id) {
+ RTE_LOG(ERR, EAL, "Response to wrong sync request\n");
+ result = REQ_RESULT_FAIL;
+ break;
+ }
+ if (resp->result == REQ_RESULT_FAIL) {
+ result = REQ_RESULT_FAIL;
+ break;
+ }
+ }
+
+ if (entry->user_req.t == REQ_TYPE_FREE) {
+ struct rte_mp_msg msg = {0};
+ struct malloc_mp_req *resp = (struct malloc_mp_req *)msg.param;
+
+ /* this is a free request, just sendmsg result */
+ resp->t = REQ_TYPE_FREE;
+ resp->result = result;
+ resp->id = entry->user_req.id;
+ msg.num_fds = 0;
+ msg.len_param = sizeof(*resp);
+ snprintf(msg.name, sizeof(msg.name), "%s", MP_ACTION_RESPONSE);
+
+ if (rte_mp_sendmsg(&msg))
+ RTE_LOG(ERR, EAL, "Could not send message to secondary process\n");
+
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(entry);
+ } else if (entry->user_req.t == REQ_TYPE_ALLOC &&
+ result == REQ_RESULT_SUCCESS) {
+ struct malloc_heap *heap = entry->alloc_state.heap;
+ struct rte_mp_msg msg = {0};
+ struct malloc_mp_req *resp =
+ (struct malloc_mp_req *)msg.param;
+
+ heap->total_size += entry->alloc_state.map_len;
+
+ /* result is success, so just notify secondary about this */
+ resp->t = REQ_TYPE_ALLOC;
+ resp->result = result;
+ resp->id = entry->user_req.id;
+ msg.num_fds = 0;
+ msg.len_param = sizeof(*resp);
+ snprintf(msg.name, sizeof(msg.name), "%s", MP_ACTION_RESPONSE);
+
+ if (rte_mp_sendmsg(&msg))
+ RTE_LOG(ERR, EAL, "Could not send message to secondary process\n");
+
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(entry->alloc_state.ms);
+ free(entry);
+ } else if (entry->user_req.t == REQ_TYPE_ALLOC &&
+ result == REQ_RESULT_FAIL) {
+ struct rte_mp_msg rb_msg = {0};
+ struct malloc_mp_req *rb =
+ (struct malloc_mp_req *)rb_msg.param;
+ struct timespec ts;
+ struct primary_alloc_req_state *state =
+ &entry->alloc_state;
+ int ret;
+
+ /* we've failed to sync, so do a rollback */
+ rollback_expand_heap(state->ms, state->ms_len, state->elem,
+ state->map_addr, state->map_len);
+
+ /* send rollback request */
+ rb_msg.num_fds = 0;
+ rb_msg.len_param = sizeof(*rb);
+ snprintf(rb_msg.name, sizeof(rb_msg.name), "%s",
+ MP_ACTION_ROLLBACK);
+
+ ts.tv_nsec = 0;
+ ts.tv_sec = MP_TIMEOUT_S;
+
+ /* sync requests carry no data */
+ rb->t = REQ_TYPE_SYNC;
+ rb->id = entry->user_req.id;
+
+ /* there may be stray timeout still waiting */
+ do {
+ ret = rte_mp_request_async(&rb_msg, &ts);
+ } while (ret != 0 && rte_errno == EEXIST);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Could not send rollback request to secondary process\n");
+
+ /* we couldn't send rollback request, but that's OK -
+ * secondary will time out, and memory has been removed
+ * from heap anyway.
+ */
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(state->ms);
+ free(entry);
+ goto fail;
+ }
+ } else {
+ RTE_LOG(ERR, EAL, " to sync request of unknown type\n");
+ goto fail;
+ }
+
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return 0;
+fail:
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return -1;
+}
+
+static int
+handle_rollback_response(const struct rte_mp_msg *request,
+ const struct rte_mp_reply *reply __rte_unused)
+{
+ struct rte_mp_msg msg = {0};
+ struct malloc_mp_req *resp = (struct malloc_mp_req *)msg.param;
+ const struct malloc_mp_req *mpreq =
+ (const struct malloc_mp_req *)request->param;
+ struct mp_request *entry;
+
+ /* lock the request */
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ entry = find_request_by_id(mpreq->id);
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "Wrong request ID\n");
+ goto fail;
+ }
+
+ if (entry->user_req.t != REQ_TYPE_ALLOC) {
+ RTE_LOG(ERR, EAL, "Unexpected active request\n");
+ goto fail;
+ }
+
+ /* we don't care if rollback succeeded, request still failed */
+ resp->t = REQ_TYPE_ALLOC;
+ resp->result = REQ_RESULT_FAIL;
+ resp->id = mpreq->id;
+ msg.num_fds = 0;
+ msg.len_param = sizeof(*resp);
+ snprintf(msg.name, sizeof(msg.name), "%s", MP_ACTION_RESPONSE);
+
+ if (rte_mp_sendmsg(&msg))
+ RTE_LOG(ERR, EAL, "Could not send message to secondary process\n");
+
+ /* clean up */
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(entry->alloc_state.ms);
+ free(entry);
+
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return 0;
+fail:
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return -1;
+}
+
+/* final stage of the request from secondary */
+static int
+handle_response(const struct rte_mp_msg *msg, const void *peer __rte_unused)
+{
+ const struct malloc_mp_req *m =
+ (const struct malloc_mp_req *)msg->param;
+ struct mp_request *entry;
+
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ entry = find_request_by_id(m->id);
+ if (entry != NULL) {
+ /* update request status */
+ entry->user_req.result = m->result;
+
+ entry->state = REQ_STATE_COMPLETE;
+
+ /* trigger thread wakeup */
+ pthread_cond_signal(&entry->cond);
+ }
+
+ pthread_mutex_unlock(&mp_request_list.lock);
+
+ return 0;
+}
+
+/* synchronously request a memory map sync. this is only called whenever the
+ * primary process itself initiates an allocation or deallocation.
+ */
+int
+request_sync(void)
+{
+ struct rte_mp_msg msg = {0};
+ struct rte_mp_reply reply = {0};
+ struct malloc_mp_req *req = (struct malloc_mp_req *)msg.param;
+ struct timespec ts;
+ int i, ret;
+
+ /* no need to create tailq entries as this is entirely synchronous */
+
+ msg.num_fds = 0;
+ msg.len_param = sizeof(*req);
+ snprintf(msg.name, sizeof(msg.name), "%s", MP_ACTION_SYNC);
+
+ /* sync request carries no data */
+ req->t = REQ_TYPE_SYNC;
+ req->id = get_unique_id();
+
+ ts.tv_nsec = 0;
+ ts.tv_sec = MP_TIMEOUT_S;
+
+ /* there may be stray timeout still waiting */
+ do {
+ ret = rte_mp_request(&msg, &reply, &ts);
+ } while (ret != 0 && rte_errno == EEXIST);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Could not send sync request to secondary process\n");
+ ret = -1;
+ goto out;
+ }
+
+ if (reply.nb_received != reply.nb_sent) {
+ RTE_LOG(ERR, EAL, "Not all secondaries have responded\n");
+ ret = -1;
+ goto out;
+ }
+
+ for (i = 0; i < reply.nb_received; i++) {
+ struct malloc_mp_req *resp =
+ (struct malloc_mp_req *)reply.msgs[i].param;
+ if (resp->t != REQ_TYPE_SYNC) {
+ RTE_LOG(ERR, EAL, "Unexpected response from secondary\n");
+ ret = -1;
+ goto out;
+ }
+ if (resp->id != req->id) {
+ RTE_LOG(ERR, EAL, "Wrong request ID\n");
+ ret = -1;
+ goto out;
+ }
+ if (resp->result != REQ_RESULT_SUCCESS) {
+ RTE_LOG(ERR, EAL, "Secondary process failed to synchronize\n");
+ ret = -1;
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ free(reply.msgs);
+ return ret;
+}
+
+/* this is a synchronous wrapper around a bunch of asynchronous requests to
+ * primary process. this will initiate a request and wait until responses come.
+ */
+int
+request_to_primary(struct malloc_mp_req *user_req)
+{
+ struct rte_mp_msg msg = {0};
+ struct malloc_mp_req *msg_req = (struct malloc_mp_req *)msg.param;
+ struct mp_request *entry;
+ struct timespec ts = {0};
+ struct timeval now;
+ int ret;
+
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ entry = malloc(sizeof(*entry));
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memory for request\n");
+ goto fail;
+ }
+
+ memset(entry, 0, sizeof(*entry));
+
+ if (gettimeofday(&now, NULL) < 0) {
+ RTE_LOG(ERR, EAL, "Cannot get current time\n");
+ goto fail;
+ }
+
+ ts.tv_nsec = (now.tv_usec * 1000) % 1000000000;
+ ts.tv_sec = now.tv_sec + MP_TIMEOUT_S +
+ (now.tv_usec * 1000) / 1000000000;
+
+ /* initialize the request */
+ pthread_cond_init(&entry->cond, NULL);
+
+ msg.num_fds = 0;
+ msg.len_param = sizeof(*msg_req);
+ snprintf(msg.name, sizeof(msg.name), "%s", MP_ACTION_REQUEST);
+
+ /* (attempt to) get a unique id */
+ user_req->id = get_unique_id();
+
+ /* copy contents of user request into the message */
+ memcpy(msg_req, user_req, sizeof(*msg_req));
+
+ if (rte_mp_sendmsg(&msg)) {
+ RTE_LOG(ERR, EAL, "Cannot send message to primary\n");
+ goto fail;
+ }
+
+ /* copy contents of user request into active request */
+ memcpy(&entry->user_req, user_req, sizeof(*user_req));
+
+ /* mark request as in progress */
+ entry->state = REQ_STATE_ACTIVE;
+
+ TAILQ_INSERT_TAIL(&mp_request_list.list, entry, next);
+
+ /* finally, wait on timeout */
+ do {
+ ret = pthread_cond_timedwait(&entry->cond,
+ &mp_request_list.lock, &ts);
+ } while (ret != 0 && ret != ETIMEDOUT);
+
+ if (entry->state != REQ_STATE_COMPLETE) {
+ RTE_LOG(ERR, EAL, "Request timed out\n");
+ ret = -1;
+ } else {
+ ret = 0;
+ user_req->result = entry->user_req.result;
+ }
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(entry);
+
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return ret;
+fail:
+ pthread_mutex_unlock(&mp_request_list.lock);
+ free(entry);
+ return -1;
+}
+
+int
+register_mp_requests(void)
+{
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ if (rte_mp_action_register(MP_ACTION_REQUEST, handle_request)) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ MP_ACTION_REQUEST);
+ return -1;
+ }
+ if (rte_mp_async_reply_register(MP_ACTION_SYNC,
+ handle_sync_response)) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ MP_ACTION_SYNC);
+ return -1;
+ }
+ if (rte_mp_async_reply_register(MP_ACTION_ROLLBACK,
+ handle_rollback_response)) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ MP_ACTION_ROLLBACK);
+ return -1;
+ }
+ } else {
+ if (rte_mp_action_register(MP_ACTION_SYNC, handle_sync)) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ MP_ACTION_SYNC);
+ return -1;
+ }
+ if (rte_mp_action_register(MP_ACTION_ROLLBACK, handle_sync)) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ MP_ACTION_ROLLBACK);
+ return -1;
+ }
+ if (rte_mp_action_register(MP_ACTION_RESPONSE,
+ handle_response)) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ MP_ACTION_RESPONSE);
+ return -1;
+ }
+ }
+ return 0;
+}
diff --git a/lib/librte_eal/common/malloc_mp.h b/lib/librte_eal/common/malloc_mp.h
new file mode 100644
index 0000000..9c79d31
--- /dev/null
+++ b/lib/librte_eal/common/malloc_mp.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef MALLOC_MP_H
+#define MALLOC_MP_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <rte_common.h>
+#include <rte_random.h>
+#include <rte_spinlock.h>
+#include <rte_tailq.h>
+
+/* forward declarations */
+struct malloc_heap;
+struct rte_memseg;
+
+/* multiprocess synchronization structures for malloc */
+enum malloc_req_type {
+ REQ_TYPE_ALLOC, /**< ask primary to allocate */
+ REQ_TYPE_FREE, /**< ask primary to free */
+ REQ_TYPE_SYNC /**< ask secondary to synchronize its memory map */
+};
+
+enum malloc_req_result {
+ REQ_RESULT_SUCCESS,
+ REQ_RESULT_FAIL
+};
+
+struct malloc_req_alloc {
+ struct malloc_heap *heap;
+ uint64_t page_sz;
+ size_t elt_size;
+ int socket;
+ unsigned int flags;
+ size_t align;
+ size_t bound;
+ bool contig;
+};
+
+struct malloc_req_free {
+ RTE_STD_C11
+ union {
+ void *addr;
+ uint64_t addr_64;
+ };
+ uint64_t len;
+};
+
+struct malloc_mp_req {
+ enum malloc_req_type t;
+ RTE_STD_C11
+ union {
+ struct malloc_req_alloc alloc_req;
+ struct malloc_req_free free_req;
+ };
+ uint64_t id; /**< not to be populated by caller */
+ enum malloc_req_result result;
+};
+
+int
+register_mp_requests(void);
+
+int
+request_to_primary(struct malloc_mp_req *req);
+
+/* synchronous memory map sync request */
+int
+request_sync(void);
+
+/* functions from malloc_heap exposed here */
+int
+malloc_heap_free_pages(void *aligned_start, size_t aligned_len);
+
+struct malloc_elem *
+alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
+ int socket, unsigned int flags, size_t align, size_t bound,
+ bool contig, struct rte_memseg **ms, int n_pages);
+
+void
+rollback_expand_heap(struct rte_memseg **ms, int n_pages,
+ struct malloc_elem *elem, void *map_addr, size_t map_len);
+
+#endif // MALLOC_MP_H
diff --git a/lib/librte_eal/common/meson.build b/lib/librte_eal/common/meson.build
index a1ada24..8a3dcfe 100644
--- a/lib/librte_eal/common/meson.build
+++ b/lib/librte_eal/common/meson.build
@@ -27,6 +27,7 @@ common_sources = files(
'eal_common_timer.c',
'malloc_elem.c',
'malloc_heap.c',
+ 'malloc_mp.c',
'rte_keepalive.c',
'rte_malloc.c',
'rte_reciprocal.c',
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index 5380ba8..542bf7e 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -67,6 +67,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_fbarray.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_mp.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_keepalive.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_service.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_reciprocal.c
--
2.7.4
Anatoly Burakov
2018-03-07 16:57:08 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
Reviewed-by: Venkatesh Srinivas <***@google.com>
---

Notes:
Not sure if virtio needs to allocate DMA-capable memory,
being a software driver and all. Corrections welcome.

drivers/net/virtio/virtio_ethdev.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
index 884f74a..35812e4 100644
--- a/drivers/net/virtio/virtio_ethdev.c
+++ b/drivers/net/virtio/virtio_ethdev.c
@@ -391,7 +391,7 @@ virtio_init_queue(struct rte_eth_dev *dev, uint16_t vtpci_queue_idx)
PMD_INIT_LOG(DEBUG, "vring_size: %d, rounded_vring_size: %d",
size, vq->vq_ring_size);

- mz = rte_memzone_reserve_aligned(vq_name, vq->vq_ring_size,
+ mz = rte_memzone_reserve_aligned_contig(vq_name, vq->vq_ring_size,
SOCKET_ID_ANY,
0, VIRTIO_PCI_VRING_ALIGN);
if (mz == NULL) {
@@ -417,9 +417,9 @@ virtio_init_queue(struct rte_eth_dev *dev, uint16_t vtpci_queue_idx)
if (sz_hdr_mz) {
snprintf(vq_hdr_name, sizeof(vq_hdr_name), "port%d_vq%d_hdr",
dev->data->port_id, vtpci_queue_idx);
- hdr_mz = rte_memzone_reserve_aligned(vq_hdr_name, sz_hdr_mz,
- SOCKET_ID_ANY, 0,
- RTE_CACHE_LINE_SIZE);
+ hdr_mz = rte_memzone_reserve_aligned_contig(vq_hdr_name,
+ sz_hdr_mz, SOCKET_ID_ANY, 0,
+ RTE_CACHE_LINE_SIZE);
if (hdr_mz == NULL) {
if (rte_errno == EEXIST)
hdr_mz = rte_memzone_lookup(vq_hdr_name);
--
2.7.4
Maxime Coquelin
2018-03-28 11:58:51 UTC
Permalink
Hi Anatoly,
Post by Anatoly Burakov
---
Not sure if virtio needs to allocate DMA-capable memory,
being a software driver and all. Corrections welcome.
Yes, we need the ring memory to be contiguous in physical address space.
Post by Anatoly Burakov
drivers/net/virtio/virtio_ethdev.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
So:

Reviewed-by: Maxime Coquelin <***@redhat.com>

Thanks,
Maxime
Anatoly Burakov
2018-03-07 16:56:42 UTC
Permalink
Nothing uses this code yet. The bulk of it is copied from old
memory allocation code (linuxapp eal_memory.c). We provide an
EAL-internal API to allocate either one page or multiple pages,
guaranteeing that we'll get contiguous VA for all of the pages
that we requested.
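
For illustration, a caller inside EAL would use the bulk API roughly
as follows (a minimal sketch only: the helper name, page size and
socket values below are made up for the example, not part of this
patch):

    #include <rte_memory.h>

    #include "eal_memalloc.h"

    /* hypothetical helper: ask for exactly four VA-contiguous 2MB
     * pages on socket 0
     */
    static int
    grab_four_2m_pages(struct rte_memseg *ms[4])
    {
            int n = eal_memalloc_alloc_page_bulk(ms, 4, RTE_PGSIZE_2M,
                            0, true);

            /* with exact == true, anything other than 4 means failure */
            if (n != 4)
                    return -1;

            /* ms[0]->addr .. ms[3]->addr form one contiguous VA range */
            return 0;
    }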

For single-file segments, we will use fallocate() to grow and
shrink memory segments; however, fallocate() is not supported
on all kernel versions, so we fall back to using ftruncate()
to grow the file, and disable shrinking, as there's little we
can do there. This will enable vhost use cases where having
single-file segments is of great value even without support
for hot-unplugging memory.
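
Roughly, the grow path amounts to the sketch below; the real
resize_hugefile() in this patch additionally handles shrinking, takes
fcntl() locks, caches whether fallocate() is supported, and checks the
current file size so that the ftruncate() fallback never shrinks the
file ('grow_hugefile' is an illustrative name only):

    #define _GNU_SOURCE
    #include <errno.h>
    #include <fcntl.h>
    #include <unistd.h>

    /* extend a hugepage file by one page at 'offset' */
    static int
    grow_hugefile(int fd, off_t offset, off_t page_sz)
    {
            if (fallocate(fd, 0, offset, page_sz) == 0)
                    return 0;
            if (errno != ENOTSUP)
                    return -1;
            /* this kernel has no fallocate() for hugetlbfs - grow the
             * file with ftruncate() instead (shrinking stays disabled)
             */
            if (ftruncate(fd, offset + page_sz) < 0)
                    return -1;
            return 0;
    }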

Not supported on FreeBSD.

Locking is done via fcntl() because that way, when it comes to
taking out write locks or unlocking on deallocation, we don't
have to keep the original fd's around. Plus, using fcntl() gives
us the ability to lock parts of a file, which is useful for
single-file segments.
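
For reference, the per-range lock this relies on is the lock() helper
added in eal_memalloc.c; in isolation it looks roughly like the sketch
below ('lock_range' is just an illustrative name):

    #include <errno.h>
    #include <fcntl.h>
    #include <string.h>

    /* lock or unlock [offset, offset + len) of fd; type is F_RDLCK,
     * F_WRLCK or F_UNLCK. returns 1 on success, 0 if the range is
     * held by another process, -1 on unexpected error.
     */
    static int
    lock_range(int fd, off_t offset, off_t len, int type)
    {
            struct flock lck;

            memset(&lck, 0, sizeof(lck));
            lck.l_type = type;
            lck.l_whence = SEEK_SET;
            lck.l_start = offset;
            lck.l_len = len;

            if (fcntl(fd, F_SETLK, &lck) == 0)
                    return 1;
            if (errno == EAGAIN || errno == EACCES)
                    return 0;
            return -1;
    }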

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/bsdapp/eal/Makefile | 1 +
lib/librte_eal/bsdapp/eal/eal_memalloc.c | 26 ++
lib/librte_eal/bsdapp/eal/meson.build | 1 +
lib/librte_eal/common/eal_memalloc.h | 19 +
lib/librte_eal/linuxapp/eal/Makefile | 2 +
lib/librte_eal/linuxapp/eal/eal_memalloc.c | 609 +++++++++++++++++++++++++++++
lib/librte_eal/linuxapp/eal/meson.build | 1 +
7 files changed, 659 insertions(+)
create mode 100644 lib/librte_eal/bsdapp/eal/eal_memalloc.c
create mode 100644 lib/librte_eal/common/eal_memalloc.h
create mode 100644 lib/librte_eal/linuxapp/eal/eal_memalloc.c

diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile
index 1b43d77..19f9322 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -29,6 +29,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_memory.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_hugepage_info.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_thread.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_debug.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_memalloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_lcore.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_timer.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_interrupts.c
diff --git a/lib/librte_eal/bsdapp/eal/eal_memalloc.c b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
new file mode 100644
index 0000000..be8340b
--- /dev/null
+++ b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#include <inttypes.h>
+
+#include <rte_log.h>
+#include <rte_memory.h>
+
+#include "eal_memalloc.h"
+
+int
+eal_memalloc_alloc_page_bulk(struct rte_memseg **ms __rte_unused,
+ int __rte_unused n, uint64_t __rte_unused size,
+ int __rte_unused socket, bool __rte_unused exact)
+{
+ RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
+ return -1;
+}
+
+struct rte_memseg *
+eal_memalloc_alloc_page(uint64_t __rte_unused size, int __rte_unused socket)
+{
+ RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
+ return NULL;
+}
diff --git a/lib/librte_eal/bsdapp/eal/meson.build b/lib/librte_eal/bsdapp/eal/meson.build
index e83fc91..4b40223 100644
--- a/lib/librte_eal/bsdapp/eal/meson.build
+++ b/lib/librte_eal/bsdapp/eal/meson.build
@@ -8,6 +8,7 @@ env_sources = files('eal_alarm.c',
'eal_hugepage_info.c',
'eal_interrupts.c',
'eal_lcore.c',
+ 'eal_memalloc.c',
'eal_thread.c',
'eal_timer.c',
'eal.c',
diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
new file mode 100644
index 0000000..c1076cf
--- /dev/null
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#ifndef EAL_MEMALLOC_H
+#define EAL_MEMALLOC_H
+
+#include <stdbool.h>
+
+#include <rte_memory.h>
+
+struct rte_memseg *
+eal_memalloc_alloc_page(uint64_t size, int socket);
+
+int
+eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n, uint64_t size,
+ int socket, bool exact);
+
+#endif // EAL_MEMALLOC_H
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index c407a43..af6b9be 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -36,6 +36,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_thread.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_log.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio_mp_sync.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memalloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_debug.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_lcore.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_timer.c
@@ -82,6 +83,7 @@ CFLAGS_eal_interrupts.o := -D_GNU_SOURCE
CFLAGS_eal_vfio_mp_sync.o := -D_GNU_SOURCE
CFLAGS_eal_timer.o := -D_GNU_SOURCE
CFLAGS_eal_lcore.o := -D_GNU_SOURCE
+CFLAGS_eal_memalloc.o := -D_GNU_SOURCE
CFLAGS_eal_thread.o := -D_GNU_SOURCE
CFLAGS_eal_log.o := -D_GNU_SOURCE
CFLAGS_eal_common_log.o := -D_GNU_SOURCE
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
new file mode 100644
index 0000000..1ba1201
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -0,0 +1,609 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#define _FILE_OFFSET_BITS 64
+#include <errno.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/queue.h>
+#include <sys/file.h>
+#include <unistd.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+#include <numa.h>
+#include <numaif.h>
+#endif
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_eal_memconfig.h>
+#include <rte_eal.h>
+#include <rte_memory.h>
+#include <rte_spinlock.h>
+
+#include "eal_filesystem.h"
+#include "eal_internal_cfg.h"
+#include "eal_memalloc.h"
+
+/*
+ * not all kernel versions support fallocate on hugetlbfs, so fall back to
+ * ftruncate and disallow deallocation if fallocate is not supported.
+ */
+static int fallocate_supported = -1; /* unknown */
+
+/*
+ * If each page is in a separate file, we can close fd's since we need each fd
+ * only once. However, in single file segments mode, we can get away with using
+ * a single fd for entire segments, but we need to store them somewhere. Each
+ * fd is different within each process, so we'll store them in a local tailq.
+ */
+struct msl_entry {
+ TAILQ_ENTRY(msl_entry) next;
+ unsigned int msl_idx;
+ int fd;
+};
+
+/** Doubly linked list of memseg list fd's. */
+TAILQ_HEAD(msl_entry_list, msl_entry);
+
+static struct msl_entry_list msl_entry_list =
+ TAILQ_HEAD_INITIALIZER(msl_entry_list);
+static rte_spinlock_t tailq_lock = RTE_SPINLOCK_INITIALIZER;
+
+static sigjmp_buf huge_jmpenv;
+
+static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
+{
+ siglongjmp(huge_jmpenv, 1);
+}
+
+/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
+ * non-static local variable in the stack frame calling sigsetjmp might be
+ * clobbered by a call to longjmp.
+ */
+static int __rte_unused huge_wrap_sigsetjmp(void)
+{
+ return sigsetjmp(huge_jmpenv, 1);
+}
+
+static struct sigaction huge_action_old;
+static int huge_need_recover;
+
+static void __rte_unused
+huge_register_sigbus(void)
+{
+ sigset_t mask;
+ struct sigaction action;
+
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGBUS);
+ action.sa_flags = 0;
+ action.sa_mask = mask;
+ action.sa_handler = huge_sigbus_handler;
+
+ huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
+}
+
+static void __rte_unused
+huge_recover_sigbus(void)
+{
+ if (huge_need_recover) {
+ sigaction(SIGBUS, &huge_action_old, NULL);
+ huge_need_recover = 0;
+ }
+}
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+static bool
+prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id)
+{
+ bool have_numa = true;
+
+ /* Check if kernel supports NUMA. */
+ if (numa_available() != 0) {
+ RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
+ have_numa = false;
+ }
+
+ if (have_numa) {
+ RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
+ if (get_mempolicy(oldpolicy, oldmask->maskp,
+ oldmask->size + 1, 0, 0) < 0) {
+ RTE_LOG(ERR, EAL,
+ "Failed to get current mempolicy: %s. "
+ "Assuming MPOL_DEFAULT.\n", strerror(errno));
+ *oldpolicy = MPOL_DEFAULT;
+ }
+ RTE_LOG(DEBUG, EAL,
+ "Setting policy MPOL_PREFERRED for socket %d\n",
+ socket_id);
+ numa_set_preferred(socket_id);
+ }
+ return have_numa;
+}
+
+static void
+resotre_numa(int *oldpolicy, struct bitmask *oldmask)
+{
+ RTE_LOG(DEBUG, EAL,
+ "Restoring previous memory policy: %d\n", *oldpolicy);
+ if (*oldpolicy == MPOL_DEFAULT) {
+ numa_set_localalloc();
+ } else if (set_mempolicy(*oldpolicy, oldmask->maskp,
+ oldmask->size + 1) < 0) {
+ RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
+ strerror(errno));
+ numa_set_localalloc();
+ }
+ numa_free_cpumask(oldmask);
+}
+#endif
+
+static struct msl_entry *
+get_msl_entry_by_idx(unsigned int list_idx)
+{
+ struct msl_entry *te;
+
+ rte_spinlock_lock(&tailq_lock);
+
+ TAILQ_FOREACH(te, &msl_entry_list, next) {
+ if (te->msl_idx == list_idx)
+ break;
+ }
+ if (te == NULL) {
+ /* doesn't exist, so create it and set fd to -1 */
+
+ te = malloc(sizeof(*te));
+ if (te == NULL) {
+ RTE_LOG(ERR, EAL, "%s(): cannot allocate tailq entry for memseg list\n",
+ __func__);
+ goto unlock;
+ }
+ te->msl_idx = list_idx;
+ te->fd = -1;
+ TAILQ_INSERT_TAIL(&msl_entry_list, te, next);
+ }
+unlock:
+ rte_spinlock_unlock(&tailq_lock);
+ return te;
+}
+
+/*
+ * uses fstat to report the size of a file on disk
+ */
+static off_t
+getFileSize(int fd)
+{
+ struct stat st;
+ if (fstat(fd, &st) < 0)
+ return 0;
+ return st.st_size;
+}
+
+/*
+ * uses fstat to check if file size on disk is zero (regular fstat won't show
+ * true file size due to how fallocate works)
+ */
+static bool
+is_zero_length(int fd)
+{
+ struct stat st;
+ if (fstat(fd, &st) < 0)
+ return false;
+ return st.st_blocks == 0;
+}
+
+static int
+get_page_fd(char *path, int buflen, struct hugepage_info *hi,
+ unsigned int list_idx, unsigned int seg_idx)
+{
+ int fd;
+
+ if (internal_config.single_file_segments) {
+ /*
+ * try to find a tailq entry, for this memseg list, or create
+ * one if it doesn't exist.
+ */
+ struct msl_entry *te = get_msl_entry_by_idx(list_idx);
+ if (te == NULL) {
+ RTE_LOG(ERR, EAL, "%s(): cannot allocate tailq entry for memseg list\n",
+ __func__);
+ return -1;
+ } else if (te->fd < 0) {
+ /* create a hugepage file */
+ eal_get_hugefile_path(path, buflen, hi->hugedir,
+ list_idx);
+ fd = open(path, O_CREAT | O_RDWR, 0600);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ te->fd = fd;
+ } else {
+ fd = te->fd;
+ }
+ } else {
+ /* one file per page, just create it */
+ eal_get_hugefile_path(path, buflen, hi->hugedir,
+ list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
+ fd = open(path, O_CREAT | O_RDWR, 0600);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
+ strerror(errno));
+ return -1;
+ }
+ }
+ return fd;
+}
+
+/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
+static int lock(int fd, uint64_t offset, uint64_t len, int type)
+{
+ struct flock lck = {0};
+ int ret;
+
+ lck.l_type = type;
+ lck.l_whence = SEEK_SET;
+ lck.l_start = offset;
+ lck.l_len = len;
+
+ ret = fcntl(fd, F_SETLK, &lck);
+
+ if (ret && (errno == EAGAIN || errno == EACCES)) {
+ /* locked by another process, not an error */
+ return 0;
+ } else if (ret) {
+ RTE_LOG(ERR, EAL, "%s(): error calling fcntl(): %s\n",
+ __func__, strerror(errno));
+ /* we've encountered an unexpected error */
+ return -1;
+ }
+ return 1;
+}
+
+static int
+resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz,
+ bool grow)
+{
+ bool again = false;
+ do {
+ if (fallocate_supported == 0) {
+ /* we cannot deallocate memory if fallocate() is not
+ * supported, but locks are still needed to prevent
+ * primary process' initialization from clearing out
+ * huge pages used by this process.
+ */
+
+ if (!grow) {
+ RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n",
+ __func__);
+ return -1;
+ }
+ uint64_t new_size = fa_offset + page_sz;
+ uint64_t cur_size = getFileSize(fd);
+
+ /* fallocate isn't supported, fall back to ftruncate */
+ if (new_size > cur_size &&
+ ftruncate(fd, new_size) < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ /* not being able to take out a read lock is an error */
+ if (lock(fd, fa_offset, page_sz, F_RDLCK) != 1)
+ return -1;
+ } else {
+ int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_KEEP_SIZE;
+ int ret;
+
+ /* if fallocate() is supported, we need to take out a
+ * read lock on allocate (to prevent other processes
+ * from deallocating this page), and take out a write
+ * lock on deallocate (to ensure nobody else is using
+ * this page).
+ *
+ * we can't use flock() for this, as we actually need to
+ * lock part of the file, not the entire file.
+ */
+
+ if (!grow) {
+ ret = lock(fd, fa_offset, page_sz, F_WRLCK);
+
+ if (ret < 0)
+ return -1;
+ else if (ret == 0)
+ /* failed to lock, not an error */
+ return 0;
+ }
+ if (fallocate(fd, flags, fa_offset, page_sz) < 0) {
+ if (fallocate_supported == -1 &&
+ errno == ENOTSUP) {
+ RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n",
+ __func__);
+ again = true;
+ fallocate_supported = 0;
+ } else {
+ RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
+ __func__,
+ strerror(errno));
+ return -1;
+ }
+ } else {
+ fallocate_supported = 1;
+
+ if (grow) {
+ /* if can't read lock, it's an error */
+ if (lock(fd, fa_offset, page_sz,
+ F_RDLCK) != 1)
+ return -1;
+ } else {
+ /* if can't unlock, it's an error */
+ if (lock(fd, fa_offset, page_sz,
+ F_UNLCK) != 1)
+ return -1;
+ }
+ }
+ }
+ } while (again);
+ return 0;
+}
+
+static int
+alloc_page(struct rte_memseg *ms, void *addr, uint64_t size, int socket_id,
+ struct hugepage_info *hi, unsigned int list_idx,
+ unsigned int seg_idx)
+{
+ int cur_socket_id = 0;
+ uint64_t map_offset;
+ char path[PATH_MAX];
+ int ret = 0;
+ int fd;
+
+ fd = get_page_fd(path, sizeof(path), hi, list_idx, seg_idx);
+ if (fd < 0)
+ return -1;
+
+
+ if (internal_config.single_file_segments) {
+ map_offset = seg_idx * size;
+ ret = resize_hugefile(fd, map_offset, size, true);
+ if (ret < 1)
+ goto resized;
+ } else {
+ map_offset = 0;
+ if (ftruncate(fd, size) < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+ __func__, strerror(errno));
+ goto resized;
+ }
+ /* we've allocated a page - take out a read lock. we're using
+ * fcntl() locks rather than flock() here because doing that
+ * gives us one huge advantage - fcntl() locks are per-process,
+ * not per-file descriptor, which means that we don't have to
+ * keep the original fd's around to keep a lock on the file.
+ *
+ * this is useful, because when it comes to unmapping pages, we
+ * will have to take out a write lock (to figure out if another
+ * process still has this page mapped), and to do it with flock()
+ * we'll have to use the original fd, as the lock is associated with
+ * that particular fd. with fcntl(), this is not necessary - we
+ * can open a new fd and use fcntl() on that.
+ */
+ ret = lock(fd, map_offset, size, F_RDLCK);
+
+ /* this should not fail */
+ if (ret != 1) {
+ RTE_LOG(ERR, EAL, "%s(): error locking file: %s\n",
+ __func__,
+ strerror(errno));
+ goto resized;
+ }
+ }
+
+ /*
+ * map the segment and populate page tables; the kernel fills this
+ * segment with zeros if it's a new page.
+ */
+ void *va = mmap(addr, size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, map_offset);
+ /* for non-single file segments, we can close fd here */
+ if (!internal_config.single_file_segments)
+ close(fd);
+
+ if (va == MAP_FAILED) {
+ RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
+ strerror(errno));
+ goto resized;
+ }
+ if (va != addr) {
+ RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__);
+ goto mapped;
+ }
+
+ rte_iova_t iova = rte_mem_virt2iova(addr);
+ if (iova == RTE_BAD_PHYS_ADDR) {
+ RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
+ __func__);
+ goto mapped;
+ }
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ move_pages(getpid(), 1, &addr, NULL, &cur_socket_id, 0);
+
+ if (cur_socket_id != socket_id) {
+ RTE_LOG(DEBUG, EAL,
+ "%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
+ __func__, socket_id, cur_socket_id);
+ goto mapped;
+ }
+#endif
+
+ /* In Linux, hugetlb limitations, like cgroups, are
+ * enforced at fault time rather than at mmap(), even
+ * with MAP_POPULATE. The kernel will deliver SIGBUS;
+ * to avoid being killed, save the stack environment
+ * here, so that if SIGBUS happens, we can jump back
+ * to it.
+ */
+ if (huge_wrap_sigsetjmp()) {
+ RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n",
+ (unsigned int)(size / 0x100000));
+ goto mapped;
+ }
+ *(int *)addr = *(int *) addr;
+
+ ms->addr = addr;
+ ms->hugepage_sz = size;
+ ms->len = size;
+ ms->nchannel = rte_memory_get_nchannel();
+ ms->nrank = rte_memory_get_nrank();
+ ms->iova = iova;
+ ms->socket_id = socket_id;
+
+ return 0;
+
+mapped:
+ munmap(addr, size);
+resized:
+ if (internal_config.single_file_segments) {
+ resize_hugefile(fd, map_offset, size, false);
+ if (is_zero_length(fd)) {
+ struct msl_entry *te = get_msl_entry_by_idx(list_idx);
+ if (te != NULL && te->fd >= 0) {
+ close(te->fd);
+ te->fd = -1;
+ }
+ /* ignore errors, can't make it any worse */
+ unlink(path);
+ }
+ } else {
+ close(fd);
+ unlink(path);
+ }
+ return -1;
+}
+
+int
+eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
+ uint64_t size, int socket, bool exact)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl = NULL;
+ void *addr;
+ unsigned int msl_idx;
+ int cur_idx, end_idx, i, ret = -1;
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ bool have_numa;
+ int oldpolicy;
+ struct bitmask *oldmask = numa_allocate_nodemask();
+#endif
+ struct hugepage_info *hi = NULL;
+
+ /* dynamic allocation not supported in legacy mode */
+ if (internal_config.legacy_mem)
+ goto restore_numa;
+
+ for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) {
+ if (size ==
+ internal_config.hugepage_info[i].hugepage_sz) {
+ hi = &internal_config.hugepage_info[i];
+ break;
+ }
+ }
+ if (!hi) {
+ RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n",
+ __func__);
+ goto restore_numa;
+ }
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ have_numa = prepare_numa(&oldpolicy, oldmask, socket);
+#endif
+
+ /* there may be several memsegs for this page size and socket id, so try
+ * allocating on all of them.
+ */
+
+ /* find our memseg list */
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ struct rte_memseg_list *cur_msl = &mcfg->memsegs[msl_idx];
+
+ if (cur_msl->hugepage_sz != size)
+ continue;
+ if (cur_msl->socket_id != socket)
+ continue;
+ msl = cur_msl;
+
+ /* try finding space in memseg list */
+ cur_idx = rte_fbarray_find_next_n_free(&msl->memseg_arr, 0, n);
+
+ if (cur_idx < 0)
+ continue;
+
+ end_idx = cur_idx + n;
+
+ for (i = 0; cur_idx < end_idx; cur_idx++, i++) {
+ struct rte_memseg *cur;
+
+ cur = rte_fbarray_get(&msl->memseg_arr, cur_idx);
+ addr = RTE_PTR_ADD(msl->base_va,
+ cur_idx * msl->hugepage_sz);
+
+ if (alloc_page(cur, addr, size, socket, hi, msl_idx,
+ cur_idx)) {
+ RTE_LOG(DEBUG, EAL, "attempted to allocate %i pages, but only %i were allocated\n",
+ n, i);
+
+ /* if exact number wasn't requested, stop */
+ if (!exact)
+ ret = i;
+ goto restore_numa;
+ }
+ if (ms)
+ ms[i] = cur;
+
+ rte_fbarray_set_used(&msl->memseg_arr, cur_idx);
+ }
+ ret = n;
+
+ break;
+ }
+ /* we didn't break */
+ if (!msl) {
+ RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
+ __func__);
+ }
+
+restore_numa:
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ if (have_numa)
+ resotre_numa(&oldpolicy, oldmask);
+#endif
+ return ret;
+}
+
+struct rte_memseg *
+eal_memalloc_alloc_page(uint64_t size, int socket)
+{
+ struct rte_memseg *ms;
+ if (eal_memalloc_alloc_page_bulk(&ms, 1, size, socket, true) < 0)
+ return NULL;
+ /* return pointer to newly allocated memseg */
+ return ms;
+}
diff --git a/lib/librte_eal/linuxapp/eal/meson.build b/lib/librte_eal/linuxapp/eal/meson.build
index 03974ff..5254c6c 100644
--- a/lib/librte_eal/linuxapp/eal/meson.build
+++ b/lib/librte_eal/linuxapp/eal/meson.build
@@ -10,6 +10,7 @@ env_sources = files('eal_alarm.c',
'eal_debug.c',
'eal_hugepage_info.c',
'eal_interrupts.c',
+ 'eal_memalloc.c',
'eal_lcore.c',
'eal_log.c',
'eal_thread.c',
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:43 UTC
Permalink
Add an internal API to free individual hugepages back to the
system. This isn't used anywhere yet, but the support is now
there. Also, add cleanup to the allocation procedure, so that
if we fail to allocate everything we asked for, we free it all
back.
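
For illustration, the new free API pairs with the allocation API from
the previous patch roughly like this (a sketch only; the function name
and the page size/socket values are made up for the example):

    #include <rte_memory.h>

    #include "eal_memalloc.h"

    /* allocate one 2MB page on socket 0, use it, then hand it back */
    static int
    alloc_use_free_one_page(void)
    {
            struct rte_memseg *ms;

            ms = eal_memalloc_alloc_page(RTE_PGSIZE_2M, 0);
            if (ms == NULL)
                    return -1;

            /* ... use ms->addr / ms->iova ... */

            /* returns -1 e.g. in legacy mem mode, where dynamic
             * deallocation is not supported
             */
            return eal_memalloc_free_page(ms);
    }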

Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/common/eal_memalloc.h | 3 +
lib/librte_eal/linuxapp/eal/eal_memalloc.c | 148 ++++++++++++++++++++++++++++-
2 files changed, 146 insertions(+), 5 deletions(-)

diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
index c1076cf..adf59c4 100644
--- a/lib/librte_eal/common/eal_memalloc.h
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -16,4 +16,7 @@ int
eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n, uint64_t size,
int socket, bool exact);

+int
+eal_memalloc_free_page(struct rte_memseg *ms);
+
#endif // EAL_MEMALLOC_H
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index 1ba1201..bbeeeba 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -499,6 +499,64 @@ alloc_page(struct rte_memseg *ms, void *addr, uint64_t size, int socket_id,
return -1;
}

+static int
+free_page(struct rte_memseg *ms, struct hugepage_info *hi,
+ unsigned int list_idx, unsigned int seg_idx)
+{
+ uint64_t map_offset;
+ char path[PATH_MAX];
+ int fd, ret;
+
+ if (mmap(ms->addr, ms->hugepage_sz, PROT_READ,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
+ MAP_FAILED) {
+ RTE_LOG(DEBUG, EAL, "couldn't unmap page\n");
+ return -1;
+ }
+
+ fd = get_page_fd(path, sizeof(path), hi, list_idx, seg_idx);
+ if (fd < 0)
+ return -1;
+
+ if (internal_config.single_file_segments) {
+ map_offset = seg_idx * ms->hugepage_sz;
+ if (resize_hugefile(fd, map_offset, ms->hugepage_sz, false))
+ return -1;
+ /* if file is zero-length, we've already shrunk it, so it's
+ * safe to remove.
+ */
+ if (is_zero_length(fd)) {
+ struct msl_entry *te = get_msl_entry_by_idx(list_idx);
+ if (te != NULL && te->fd >= 0) {
+ close(te->fd);
+ te->fd = -1;
+ }
+ unlink(path);
+ }
+ ret = 0;
+ } else {
+ /* if we're able to take out a write lock, we're the last one
+ * holding onto this page.
+ */
+
+ ret = lock(fd, 0, ms->hugepage_sz, F_WRLCK);
+ if (ret >= 0) {
+ /* no one else is using this page */
+ if (ret == 1)
+ unlink(path);
+ ret = lock(fd, 0, ms->hugepage_sz, F_UNLCK);
+ if (ret != 1)
+ RTE_LOG(ERR, EAL, "%s(): unable to unlock file %s\n",
+ __func__, path);
+ }
+ close(fd);
+ }
+
+ memset(ms, 0, sizeof(*ms));
+
+ return ret;
+}
+
int
eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
uint64_t size, int socket, bool exact)
@@ -507,7 +565,7 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
struct rte_memseg_list *msl = NULL;
void *addr;
unsigned int msl_idx;
- int cur_idx, end_idx, i, ret = -1;
+ int cur_idx, start_idx, end_idx, i, j, ret = -1;
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
bool have_numa;
int oldpolicy;
@@ -557,6 +615,7 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
continue;

end_idx = cur_idx + n;
+ start_idx = cur_idx;

for (i = 0; cur_idx < end_idx; cur_idx++, i++) {
struct rte_memseg *cur;
@@ -567,25 +626,56 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,

if (alloc_page(cur, addr, size, socket, hi, msl_idx,
cur_idx)) {
+
RTE_LOG(DEBUG, EAL, "attempted to allocate %i pages, but only %i were allocated\n",
n, i);

- /* if exact number wasn't requested, stop */
- if (!exact)
+ /* if exact number of pages wasn't requested,
+ * failing to allocate is not an error. we could
+ * of course try other lists to see if there are
+ * better fits, but a bird in the hand...
+ */
+ if (!exact) {
ret = i;
- goto restore_numa;
+ goto restore_numa;
+ }
+ RTE_LOG(DEBUG, EAL, "exact amount of pages was requested, so returning %i allocated pages\n",
+ i);
+
+ /* clean up */
+ for (j = start_idx; j < cur_idx; j++) {
+ struct rte_memseg *tmp;
+ struct rte_fbarray *arr =
+ &msl->memseg_arr;
+
+ tmp = rte_fbarray_get(arr, j);
+ if (free_page(tmp, hi, msl_idx, j))
+ rte_panic("Cannot free page\n");
+
+ rte_fbarray_set_free(arr, j);
+ }
+ /* clear the list */
+ if (ms)
+ memset(ms, 0, sizeof(*ms) * n);
+
+ /* try next list */
+ goto next_list;
}
if (ms)
ms[i] = cur;

rte_fbarray_set_used(&msl->memseg_arr, cur_idx);
}
+ /* we allocated all pages */
ret = n;

break;
+next_list:
+ /* dummy semi-colon to make label work */;
}
/* we didn't break */
- if (!msl) {
+ if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
__func__);
}
@@ -607,3 +697,51 @@ eal_memalloc_alloc_page(uint64_t size, int socket)
/* return pointer to newly allocated memseg */
return ms;
}
+
+int
+eal_memalloc_free_page(struct rte_memseg *ms)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl = NULL;
+ unsigned int msl_idx, seg_idx;
+ struct hugepage_info *hi = NULL;
+ int i;
+
+ /* dynamic free not supported in legacy mode */
+ if (internal_config.legacy_mem)
+ return -1;
+
+ for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) {
+ if (ms->hugepage_sz ==
+ internal_config.hugepage_info[i].hugepage_sz) {
+ hi = &internal_config.hugepage_info[i];
+ break;
+ }
+ }
+ if (!hi) {
+ RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
+ return -1;
+ }
+
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ uintptr_t start_addr, end_addr;
+ struct rte_memseg_list *cur = &mcfg->memsegs[msl_idx];
+
+ start_addr = (uintptr_t) cur->base_va;
+ end_addr = start_addr + cur->memseg_arr.len * cur->hugepage_sz;
+
+ if ((uintptr_t) ms->addr < start_addr ||
+ (uintptr_t) ms->addr >= end_addr) {
+ continue;
+ }
+ msl = cur;
+ seg_idx = RTE_PTR_DIFF(ms->addr, start_addr) / ms->hugepage_sz;
+ break;
+ }
+ if (!msl) {
+ RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
+ return -1;
+ }
+ rte_fbarray_set_free(&msl->memseg_arr, seg_idx);
+ return free_page(ms, hi, msl_idx, seg_idx);
+}
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:53 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/linuxapp/eal/eal_memalloc.c | 11 +++++++++++
1 file changed, 11 insertions(+)

diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index bbeeeba..c03e7bc 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -34,6 +34,7 @@
#include <rte_eal.h>
#include <rte_memory.h>
#include <rte_spinlock.h>
+#include <rte_vfio.h>

#include "eal_filesystem.h"
#include "eal_internal_cfg.h"
@@ -476,6 +477,10 @@ alloc_page(struct rte_memseg *ms, void *addr, uint64_t size, int socket_id,
ms->iova = iova;
ms->socket_id = socket_id;

+ /* map the segment so that VFIO has access to it */
+ if (rte_eal_iova_mode() == RTE_IOVA_VA &&
+ rte_vfio_dma_map(ms->addr_64, iova, size))
+ RTE_LOG(DEBUG, EAL, "Cannot register segment with VFIO\n");
return 0;

mapped:
@@ -507,6 +512,12 @@ free_page(struct rte_memseg *ms, struct hugepage_info *hi,
char path[PATH_MAX];
int fd, ret;

+ /* unmap the segment from VFIO */
+ if (rte_eal_iova_mode() == RTE_IOVA_VA &&
+ rte_vfio_dma_unmap(ms->addr_64, ms->iova, ms->len)) {
+ RTE_LOG(DEBUG, EAL, "Cannot unregister segment with VFIO\n");
+ }
+
if (mmap(ms->addr, ms->hugepage_sz, PROT_READ,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
MAP_FAILED) {
--
2.7.4
Anatoly Burakov
2018-03-07 16:56:50 UTC
Permalink
The fbarray is already there, so we might as well use it; some
operations will be sped up as a result.

Since we have to allocate an fbarray for memzones, we have to do it
before we initialize the memory subsystem, because memory init in
secondary processes will (later) allocate more fbarrays than the
primary process did, which makes it impossible to attach to the
memzone fbarray if we do it after the fact.

Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
The code for the ENA driver makes little sense to me, but I've
attempted to keep the same semantics as the old code.

drivers/net/ena/ena_ethdev.c | 10 +-
lib/librte_eal/bsdapp/eal/eal.c | 6 +
lib/librte_eal/common/eal_common_memzone.c | 180 +++++++++++++++-------
lib/librte_eal/common/include/rte_eal_memconfig.h | 4 +-
lib/librte_eal/common/malloc_heap.c | 4 +
lib/librte_eal/linuxapp/eal/eal.c | 13 +-
test/test/test_memzone.c | 9 +-
7 files changed, 157 insertions(+), 69 deletions(-)

diff --git a/drivers/net/ena/ena_ethdev.c b/drivers/net/ena/ena_ethdev.c
index 34b2a8d..f7bfc7a 100644
--- a/drivers/net/ena/ena_ethdev.c
+++ b/drivers/net/ena/ena_ethdev.c
@@ -264,11 +264,15 @@ static const struct eth_dev_ops ena_dev_ops = {
static inline int ena_cpu_to_node(int cpu)
{
struct rte_config *config = rte_eal_get_configuration();
+ struct rte_fbarray *arr = &config->mem_config->memzones;
+ const struct rte_memzone *mz;

- if (likely(cpu < RTE_MAX_MEMZONE))
- return config->mem_config->memzone[cpu].socket_id;
+ if (unlikely(cpu >= RTE_MAX_MEMZONE))
+ return NUMA_NO_NODE;

- return NUMA_NO_NODE;
+ mz = rte_fbarray_get(arr, cpu);
+
+ return mz->socket_id;
}

static inline void ena_rx_mbuf_prepare(struct rte_mbuf *mbuf,
diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 45e5670..3b06e21 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -608,6 +608,12 @@ rte_eal_init(int argc, char **argv)
return -1;
}

+ if (rte_eal_malloc_heap_init() < 0) {
+ rte_eal_init_alert("Cannot init malloc heap\n");
+ rte_errno = ENODEV;
+ return -1;
+ }
+
if (rte_eal_tailqs_init() < 0) {
rte_eal_init_alert("Cannot init tail queues for objects\n");
rte_errno = EFAULT;
diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index 8c9aa28..a7cfdaf 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -28,42 +28,29 @@
static inline const struct rte_memzone *
memzone_lookup_thread_unsafe(const char *name)
{
- const struct rte_mem_config *mcfg;
+ struct rte_mem_config *mcfg;
+ struct rte_fbarray *arr;
const struct rte_memzone *mz;
- unsigned i = 0;
+ int i = 0;

/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
+ arr = &mcfg->memzones;

/*
* the algorithm is not optimal (linear), but there are few
* zones and this function should be called at init only
*/
- for (i = 0; i < RTE_MAX_MEMZONE; i++) {
- mz = &mcfg->memzone[i];
- if (mz->addr != NULL && !strncmp(name, mz->name, RTE_MEMZONE_NAMESIZE))
- return &mcfg->memzone[i];
+ while ((i = rte_fbarray_find_next_used(arr, i)) >= 0) {
+ mz = rte_fbarray_get(arr, i++);
+ if (mz->addr != NULL &&
+ !strncmp(name, mz->name, RTE_MEMZONE_NAMESIZE))
+ return mz;
}

return NULL;
}

-static inline struct rte_memzone *
-get_next_free_memzone(void)
-{
- struct rte_mem_config *mcfg;
- unsigned i = 0;
-
- /* get pointer to global configuration */
- mcfg = rte_eal_get_configuration()->mem_config;
-
- for (i = 0; i < RTE_MAX_MEMZONE; i++) {
- if (mcfg->memzone[i].addr == NULL)
- return &mcfg->memzone[i];
- }
-
- return NULL;
-}

/* This function will return the greatest free block if a heap has been
* specified. If no heap has been specified, it will return the heap and
@@ -103,13 +90,16 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
{
struct rte_memzone *mz;
struct rte_mem_config *mcfg;
+ struct rte_fbarray *arr;
size_t requested_len;
+ int idx;

/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
+ arr = &mcfg->memzones;

/* no more room in config */
- if (mcfg->memzone_cnt >= RTE_MAX_MEMZONE) {
+ if (arr->count >= arr->len) {
RTE_LOG(ERR, EAL, "%s(): No more room in config\n", __func__);
rte_errno = ENOSPC;
return NULL;
@@ -199,7 +189,14 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
struct malloc_elem *elem = malloc_elem_from_data(mz_addr);

/* fill the zone in config */
- mz = get_next_free_memzone();
+ idx = rte_fbarray_find_next_free(arr, 0);
+
+ if (idx < 0) {
+ mz = NULL;
+ } else {
+ rte_fbarray_set_used(arr, idx);
+ mz = rte_fbarray_get(arr, idx);
+ }

if (mz == NULL) {
RTE_LOG(ERR, EAL, "%s(): Cannot find free memzone but there is room "
@@ -209,7 +206,6 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
return NULL;
}

- mcfg->memzone_cnt++;
snprintf(mz->name, sizeof(mz->name), "%s", name);
mz->iova = rte_malloc_virt2iova(mz_addr);
mz->addr = mz_addr;
@@ -322,6 +318,8 @@ int
rte_memzone_free(const struct rte_memzone *mz)
{
struct rte_mem_config *mcfg;
+ struct rte_fbarray *arr;
+ struct rte_memzone *found_mz;
int ret = 0;
void *addr;
unsigned idx;
@@ -330,21 +328,26 @@ rte_memzone_free(const struct rte_memzone *mz)
return -EINVAL;

mcfg = rte_eal_get_configuration()->mem_config;
+ arr = &mcfg->memzones;

rte_rwlock_write_lock(&mcfg->mlock);

- idx = ((uintptr_t)mz - (uintptr_t)mcfg->memzone);
- idx = idx / sizeof(struct rte_memzone);
+ idx = rte_fbarray_find_idx(arr, mz);
+ found_mz = rte_fbarray_get(arr, idx);

- addr = mcfg->memzone[idx].addr;
- if (addr == NULL)
+ if (found_mz == NULL) {
ret = -EINVAL;
- else if (mcfg->memzone_cnt == 0) {
- rte_panic("%s(): memzone address not NULL but memzone_cnt is 0!\n",
- __func__);
} else {
- memset(&mcfg->memzone[idx], 0, sizeof(mcfg->memzone[idx]));
- mcfg->memzone_cnt--;
+ addr = found_mz->addr;
+ if (addr == NULL)
+ ret = -EINVAL;
+ else if (arr->count == 0) {
+ rte_panic("%s(): memzone address not NULL but memzone_cnt is 0!\n",
+ __func__);
+ } else {
+ memset(found_mz, 0, sizeof(*found_mz));
+ rte_fbarray_set_free(arr, idx);
+ }
}

rte_rwlock_write_unlock(&mcfg->mlock);
@@ -378,25 +381,79 @@ rte_memzone_lookup(const char *name)
void
rte_memzone_dump(FILE *f)
{
+ struct rte_fbarray *arr;
struct rte_mem_config *mcfg;
- unsigned i = 0;
+ int i = 0;

/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
+ arr = &mcfg->memzones;

rte_rwlock_read_lock(&mcfg->mlock);
/* dump all zones */
- for (i=0; i<RTE_MAX_MEMZONE; i++) {
- if (mcfg->memzone[i].addr == NULL)
- break;
- fprintf(f, "Zone %u: name:<%s>, IO:0x%"PRIx64", len:0x%zx"
+ while ((i = rte_fbarray_find_next_used(arr, i)) >= 0) {
+ void *cur_addr, *mz_end;
+ struct rte_memzone *mz;
+ struct rte_memseg_list *msl = NULL;
+ struct rte_memseg *ms;
+ int ms_idx;
+
+ mz = rte_fbarray_get(arr, i);
+
+ /*
+ * memzones can span multiple physical pages, so dump addresses
+ * of all physical pages this memzone spans.
+ */
+
+ fprintf(f, "Zone %u: name:<%s>, len:0x%zx"
", virt:%p, socket_id:%"PRId32", flags:%"PRIx32"\n", i,
- mcfg->memzone[i].name,
- mcfg->memzone[i].iova,
- mcfg->memzone[i].len,
- mcfg->memzone[i].addr,
- mcfg->memzone[i].socket_id,
- mcfg->memzone[i].flags);
+ mz->name,
+ mz->len,
+ mz->addr,
+ mz->socket_id,
+ mz->flags);
+
+ msl = rte_mem_virt2memseg_list(mz->addr);
+ if (!msl) {
+ RTE_LOG(DEBUG, EAL, "Skipping bad memzone\n");
+ continue;
+ }
+
+ cur_addr = RTE_PTR_ALIGN_FLOOR(mz->addr, mz->hugepage_sz);
+ mz_end = RTE_PTR_ADD(cur_addr, mz->len);
+
+ fprintf(f, "physical segments used:\n");
+ if (msl->base_va == NULL) {
+ /* if memseg list base VA is NULL, we're in legacy mem mode,
+ * which means we have only one memseg.
+ */
+ ms = rte_mem_virt2memseg(mz->addr, msl);
+
+ fprintf(f, " addr: %p iova: 0x%" PRIx64 " "
+ "len: 0x%" PRIx64 " "
+ "pagesz: 0x%" PRIx64 "\n",
+ cur_addr, ms->iova, ms->len, ms->hugepage_sz);
+ } else {
+ ms_idx = RTE_PTR_DIFF(mz->addr, msl->base_va) /
+ msl->hugepage_sz;
+ ms = rte_fbarray_get(&msl->memseg_arr, ms_idx);
+
+ do {
+ fprintf(f, " addr: %p iova: 0x%" PRIx64 " "
+ "len: 0x%" PRIx64 " "
+ "pagesz: 0x%" PRIx64 "\n",
+ cur_addr, ms->iova, ms->len,
+ ms->hugepage_sz);
+
+ /* advance VA to next page */
+ cur_addr = RTE_PTR_ADD(cur_addr,
+ ms->hugepage_sz);
+
+ /* memzones occupy contiguous segments */
+ ++ms;
+ } while (cur_addr < mz_end);
+ }
+ i++;
}
rte_rwlock_read_unlock(&mcfg->mlock);
}
@@ -412,19 +469,23 @@ rte_eal_memzone_init(void)
/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;

- /* secondary processes don't need to initialise anything */
- if (rte_eal_process_type() == RTE_PROC_SECONDARY)
- return 0;
-
rte_rwlock_write_lock(&mcfg->mlock);

- /* delete all zones */
- mcfg->memzone_cnt = 0;
- memset(mcfg->memzone, 0, sizeof(mcfg->memzone));
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
+ rte_fbarray_init(&mcfg->memzones, "memzone",
+ RTE_MAX_MEMZONE, sizeof(struct rte_memzone))) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memzone list\n");
+ return -1;
+ } else if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
+ rte_fbarray_attach(&mcfg->memzones)) {
+ RTE_LOG(ERR, EAL, "Cannot attach to memzone list\n");
+ rte_rwlock_write_unlock(&mcfg->mlock);
+ return -1;
+ }

rte_rwlock_write_unlock(&mcfg->mlock);

- return rte_eal_malloc_heap_init();
+ return 0;
}

/* Walk all reserved memory zones */
@@ -432,14 +493,19 @@ void rte_memzone_walk(void (*func)(const struct rte_memzone *, void *),
void *arg)
{
struct rte_mem_config *mcfg;
- unsigned i;
+ struct rte_fbarray *arr;
+ int i;

mcfg = rte_eal_get_configuration()->mem_config;
+ arr = &mcfg->memzones;
+
+ i = 0;

rte_rwlock_read_lock(&mcfg->mlock);
- for (i=0; i<RTE_MAX_MEMZONE; i++) {
- if (mcfg->memzone[i].addr != NULL)
- (*func)(&mcfg->memzone[i], arg);
+ while ((i = rte_fbarray_find_next_used(arr, i)) >= 0) {
+ struct rte_memzone *mz = rte_fbarray_get(arr, i);
+ (*func)(mz, arg);
+ i++;
}
rte_rwlock_read_unlock(&mcfg->mlock);
}
diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
index 31fc8e7..b6bdb21 100644
--- a/lib/librte_eal/common/include/rte_eal_memconfig.h
+++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
@@ -58,10 +58,8 @@ struct rte_mem_config {
rte_rwlock_t qlock; /**< used for tailq operation for thread safe. */
rte_rwlock_t mplock; /**< only used by mempool LIB for thread-safe. */

- uint32_t memzone_cnt; /**< Number of allocated memzones */
-
/* memory segments and zones */
- struct rte_memzone memzone[RTE_MAX_MEMZONE]; /**< Memzone descriptors. */
+ struct rte_fbarray memzones; /**< Memzone descriptors. */

struct rte_memseg_list memsegs[RTE_MAX_MEMSEG_LISTS];
/**< list of dynamic arrays holding memsegs */
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 984e027..7a3d0f3 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -579,6 +579,10 @@ rte_eal_malloc_heap_init(void)
if (mcfg == NULL)
return -1;

+ /* secondary processes don't need to initialize heap */
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY)
+ return 0;
+
for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
int start;
struct rte_fbarray *arr;
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 7851a7d..d336c96 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -857,6 +857,15 @@ rte_eal_init(int argc, char **argv)
return -1;
}
#endif
+ /* memzone_init maps rte_fbarrays, which has to be done before hugepage
+ * init/attach, because attach creates extra fbarrays in secondary
+ * process, resulting in inability to map memzone fbarray.
+ */
+ if (rte_eal_memzone_init() < 0) {
+ rte_eal_init_alert("Cannot init memzone\n");
+ rte_errno = ENODEV;
+ return -1;
+ }

if (rte_eal_memory_init() < 0) {
rte_eal_init_alert("Cannot init memory\n");
@@ -867,8 +876,8 @@ rte_eal_init(int argc, char **argv)
/* the directories are locked during eal_hugepage_info_init */
eal_hugedirs_unlock();

- if (rte_eal_memzone_init() < 0) {
- rte_eal_init_alert("Cannot init memzone\n");
+ if (rte_eal_malloc_heap_init() < 0) {
+ rte_eal_init_alert("Cannot init malloc heap\n");
rte_errno = ENODEV;
return -1;
}
diff --git a/test/test/test_memzone.c b/test/test/test_memzone.c
index 47f4de8..4b49d61 100644
--- a/test/test/test_memzone.c
+++ b/test/test/test_memzone.c
@@ -893,7 +893,7 @@ test_memzone_basic(void)
const struct rte_memzone *mz;
int memzone_cnt_after, memzone_cnt_expected;
int memzone_cnt_before =
- rte_eal_get_configuration()->mem_config->memzone_cnt;
+ rte_eal_get_configuration()->mem_config->memzones.count;

memzone1 = rte_memzone_reserve(TEST_MEMZONE_NAME("testzone1"), 100,
SOCKET_ID_ANY, 0);
@@ -917,7 +917,7 @@ test_memzone_basic(void)
(memzone3 != NULL) + (memzone4 != NULL);

memzone_cnt_after =
- rte_eal_get_configuration()->mem_config->memzone_cnt;
+ rte_eal_get_configuration()->mem_config->memzones.count;

if (memzone_cnt_after != memzone_cnt_expected)
return -1;
@@ -996,7 +996,7 @@ test_memzone_basic(void)
}

memzone_cnt_after =
- rte_eal_get_configuration()->mem_config->memzone_cnt;
+ rte_eal_get_configuration()->mem_config->memzones.count;
if (memzone_cnt_after != memzone_cnt_before)
return -1;

@@ -1017,7 +1017,8 @@ static int
test_memzone(void)
{
/* take note of how many memzones were allocated before running */
- int memzone_cnt = rte_eal_get_configuration()->mem_config->memzone_cnt;
+ int memzone_cnt =
+ rte_eal_get_configuration()->mem_config->memzones.count;

printf("test basic memzone API\n");
if (test_memzone_basic() < 0)
--
2.7.4
Anatoly Burakov
2018-03-07 16:57:03 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
It is not 100% clear if this memzone is used for DMA,
corrections welcome.

drivers/net/cxgbe/sge.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/cxgbe/sge.c b/drivers/net/cxgbe/sge.c
index 3d5aa59..e31474c 100644
--- a/drivers/net/cxgbe/sge.c
+++ b/drivers/net/cxgbe/sge.c
@@ -1299,7 +1299,8 @@ static void *alloc_ring(size_t nelem, size_t elem_size,
* handle the maximum ring size is allocated in order to allow for
* resizing in later calls to the queue setup function.
*/
- tz = rte_memzone_reserve_aligned(z_name, len, socket_id, 0, 4096);
+ tz = rte_memzone_reserve_aligned_contig(z_name, len, socket_id, 0,
+ 4096);
if (!tz)
return NULL;
--
2.7.4
Anatoly Burakov
2018-03-07 16:57:02 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
drivers/net/bnx2x/bnx2x.c | 2 +-
drivers/net/bnx2x/bnx2x_rxtx.c | 3 ++-
2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bnx2x/bnx2x.c b/drivers/net/bnx2x/bnx2x.c
index fb02d0f..81f5dae 100644
--- a/drivers/net/bnx2x/bnx2x.c
+++ b/drivers/net/bnx2x/bnx2x.c
@@ -177,7 +177,7 @@ bnx2x_dma_alloc(struct bnx2x_softc *sc, size_t size, struct bnx2x_dma *dma,
rte_get_timer_cycles());

/* Caller must take care that strlen(mz_name) < RTE_MEMZONE_NAMESIZE */
- z = rte_memzone_reserve_aligned(mz_name, (uint64_t) (size),
+ z = rte_memzone_reserve_aligned_contig(mz_name, (uint64_t)size,
SOCKET_ID_ANY,
0, align);
if (z == NULL) {
diff --git a/drivers/net/bnx2x/bnx2x_rxtx.c b/drivers/net/bnx2x/bnx2x_rxtx.c
index a0d4ac9..325b94d 100644
--- a/drivers/net/bnx2x/bnx2x_rxtx.c
+++ b/drivers/net/bnx2x/bnx2x_rxtx.c
@@ -26,7 +26,8 @@ ring_dma_zone_reserve(struct rte_eth_dev *dev, const char *ring_name,
if (mz)
return mz;

- return rte_memzone_reserve_aligned(z_name, ring_size, socket_id, 0, BNX2X_PAGE_SIZE);
+ return rte_memzone_reserve_aligned_contig(z_name, ring_size, socket_id,
+ 0, BNX2X_PAGE_SIZE);
}

static void
--
2.7.4
Anatoly Burakov
2018-03-07 16:57:06 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---

Notes:
It is not 100% clear that all users of this function
need to allocate DMA memory. Corrections welcome.

drivers/net/i40e/i40e_ethdev.c | 2 +-
drivers/net/i40e/i40e_rxtx.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 508b417..0fffe2c 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -4010,7 +4010,7 @@ i40e_allocate_dma_mem_d(__attribute__((unused)) struct i40e_hw *hw,
return I40E_ERR_PARAM;

snprintf(z_name, sizeof(z_name), "i40e_dma_%"PRIu64, rte_rand());
- mz = rte_memzone_reserve_bounded(z_name, size, SOCKET_ID_ANY, 0,
+ mz = rte_memzone_reserve_bounded_contig(z_name, size, SOCKET_ID_ANY, 0,
alignment, RTE_PGSIZE_2M);
if (!mz)
return I40E_ERR_NO_MEMORY;
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 1217e5a..6b2b40e 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -2189,7 +2189,7 @@ i40e_memzone_reserve(const char *name, uint32_t len, int socket_id)
if (mz)
return mz;

- mz = rte_memzone_reserve_aligned(name, len,
+ mz = rte_memzone_reserve_aligned_contig(name, len,
socket_id, 0, I40E_RING_BASE_ALIGN);
return mz;
}
--
2.7.4
Anatoly Burakov
2018-03-07 16:57:04 UTC
Permalink
Signed-off-by: Anatoly Burakov <***@intel.com>
---
drivers/net/ena/base/ena_plat_dpdk.h | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ena/base/ena_plat_dpdk.h b/drivers/net/ena/base/ena_plat_dpdk.h
index 8cba319..c1ebf00 100644
--- a/drivers/net/ena/base/ena_plat_dpdk.h
+++ b/drivers/net/ena/base/ena_plat_dpdk.h
@@ -188,7 +188,8 @@ typedef uint64_t dma_addr_t;
ENA_TOUCH(dmadev); ENA_TOUCH(handle); \
snprintf(z_name, sizeof(z_name), \
"ena_alloc_%d", ena_alloc_cnt++); \
- mz = rte_memzone_reserve(z_name, size, SOCKET_ID_ANY, 0); \
+ mz = rte_memzone_reserve_contig(z_name, \
+ size, SOCKET_ID_ANY, 0); \
memset(mz->addr, 0, size); \
virt = mz->addr; \
phys = mz->iova; \
@@ -206,7 +207,7 @@ typedef uint64_t dma_addr_t;
ENA_TOUCH(dmadev); ENA_TOUCH(dev_node); \
snprintf(z_name, sizeof(z_name), \
"ena_alloc_%d", ena_alloc_cnt++); \
- mz = rte_memzone_reserve(z_name, size, node, 0); \
+ mz = rte_memzone_reserve_contig(z_name, size, node, 0); \
memset(mz->addr, 0, size); \
virt = mz->addr; \
phys = mz->iova; \
@@ -219,7 +220,7 @@ typedef uint64_t dma_addr_t;
ENA_TOUCH(dmadev); ENA_TOUCH(dev_node); \
snprintf(z_name, sizeof(z_name), \
"ena_alloc_%d", ena_alloc_cnt++); \
- mz = rte_memzone_reserve(z_name, size, node, 0); \
+ mz = rte_memzone_reserve_contig(z_name, size, node, 0); \
memset(mz->addr, 0, size); \
virt = mz->addr; \
} while (0)
--
2.7.4
Nélio Laranjeiro
2018-03-12 15:58:03 UTC
Permalink
Hi Anatoly,
We are currently facing issues with running testpmd on the thunderx
platform. The issue seems to be with VFIO:
EAL: Detected 24 lcore(s)
EAL: Detected 1 NUMA nodes
EAL: No free hugepages reported in hugepages-2048kB
EAL: Multi-process socket /var/run/.rte_unix
EAL: Probing VFIO support...
EAL: VFIO support initialized
EAL:   VFIO support not initialized
<snip>
EAL:   probe driver: 177d:a053 octeontx_fpavf
EAL: PCI device 0001:01:00.1 on NUMA socket 0
EAL:   probe driver: 177d:a034 net_thunderx
EAL:   using IOMMU type 1 (Type 1)
EAL:   cannot set up DMA remapping, error 22 (Invalid argument)
EAL:   0001:01:00.1 DMA remapping failed, error 22 (Invalid argument)
EAL: Requested device 0001:01:00.1 cannot be used
EAL: PCI device 0001:01:00.2 on NUMA socket 0
<snip>
testpmd: No probed ethernet devices
testpmd: create a new mbuf pool <mbuf_pool_socket_0>: n=251456,
size=2176, socket=0
testpmd: preferred mempool ops selected: ring_mp_mc
EAL:   VFIO support not initialized
EAL:   VFIO support not initialized
EAL:   VFIO support not initialized
Done
This is because rte_service_init() calls rte_calloc() before
rte_bus_probe(), and vfio_dma_mem_map() fails because the IOMMU type
is not set.
(gdb) bt
#0  vfio_dma_mem_map (vaddr=281439006359552, iova=11274289152,
len=536870912, do_map=1) at
/root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c:967
#1  0x00000000004fd974 in rte_vfio_dma_map
(vaddr=281439006359552, iova=11274289152, len=536870912) at
/root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c:988
#2  0x00000000004fbe78 in vfio_mem_event_callback
(type=RTE_MEM_EVENT_ALLOC, addr=0xfff7a0000000, len=536870912)
at /root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c:240
#3  0x00000000005070ac in eal_memalloc_notify
(event=RTE_MEM_EVENT_ALLOC, start=0xfff7a0000000, len=536870912)
at
/root/clean/dpdk/lib/librte_eal/common/eal_common_memalloc.c:177
#4  0x0000000000515c98 in try_expand_heap_primary
(heap=0xffffb7fb167c, pg_sz=536870912, elt_size=8192, socket=0,
flags=0, align=128, bound=0, contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:247
#5  0x0000000000515e94 in try_expand_heap (heap=0xffffb7fb167c,
pg_sz=536870912, elt_size=8192, socket=0, flags=0, align=128,
bound=0, contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:327
#6  0x00000000005163a0 in alloc_more_mem_on_socket
(heap=0xffffb7fb167c, size=8192, socket=0, flags=0, align=128,
bound=0, contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:455
#7  0x0000000000516514 in heap_alloc_on_socket (type=0x85bf90
"rte_services", size=8192, socket=0, flags=0, align=128,
bound=0, contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:491
#8  0x0000000000516664 in malloc_heap_alloc (type=0x85bf90
"rte_services", size=8192, socket_arg=-1, flags=0, align=128,
bound=0, contig=false) at
/root/clean/dpdk/lib/librte_eal/common/malloc_heap.c:527
#9  0x0000000000513b54 in rte_malloc_socket (type=0x85bf90
"rte_services", size=8192, align=128, socket_arg=-1) at
/root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:54
#10 0x0000000000513bc8 in rte_zmalloc_socket (type=0x85bf90
"rte_services", size=8192, align=128, socket=-1) at
/root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:72
#11 0x0000000000513c00 in rte_zmalloc (type=0x85bf90
"rte_services", size=8192, align=128) at
/root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:81
#12 0x0000000000513c90 in rte_calloc (type=0x85bf90
"rte_services", num=64, size=128, align=128) at
/root/clean/dpdk/lib/librte_eal/common/rte_malloc.c:99
#13 0x0000000000518cec in rte_service_init () at
/root/clean/dpdk/lib/librte_eal/common/rte_service.c:81
#14 0x00000000004f55f4 in rte_eal_init (argc=3,
argv=0xfffffffff488) at
/root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal.c:959
#15 0x000000000045af5c in main (argc=3, argv=0xfffffffff488) at
/root/clean/dpdk/app/test-pmd/testpmd.c:2483
Also, I have tried running with --legacy-mem, but I'm stuck in the
`pci_find_max_end_va` loop because `rte_fbarray_find_next_used` always
returns 0.
HugePages_Total:      15
HugePages_Free:       11
HugePages_Rsvd:        0
HugePages_Surp:        0
Hugepagesize:     524288 kB
(gdb) bt
#0  find_next (arr=0xffffb7fb009c, start=0, used=true) at
/root/clean/dpdk/lib/librte_eal/common/eal_common_fbarray.c:248
#1  0x00000000005132a8 in rte_fbarray_find_next_used
(arr=0xffffb7fb009c, start=0) at
/root/clean/dpdk/lib/librte_eal/common/eal_common_fbarray.c:700
#2  0x000000000052d030 in pci_find_max_end_va () at
/root/clean/dpdk/drivers/bus/pci/linux/pci.c:138
#3  0x0000000000530ab8 in pci_vfio_map_resource_primary
(dev=0xeae700) at
/root/clean/dpdk/drivers/bus/pci/linux/pci_vfio.c:499
#4  0x0000000000530ffc in pci_vfio_map_resource (dev=0xeae700)
at /root/clean/dpdk/drivers/bus/pci/linux/pci_vfio.c:601
#5  0x000000000052ce90 in rte_pci_map_device (dev=0xeae700) at
/root/clean/dpdk/drivers/bus/pci/linux/pci.c:75
#6  0x0000000000531a20 in rte_pci_probe_one_driver (dr=0x997e20
<rte_nicvf_pmd>, dev=0xeae700) at
/root/clean/dpdk/drivers/bus/pci/pci_common.c:164
#7  0x0000000000531c68 in pci_probe_all_drivers (dev=0xeae700)
at /root/clean/dpdk/drivers/bus/pci/pci_common.c:249
#8  0x0000000000531f68 in rte_pci_probe () at
/root/clean/dpdk/drivers/bus/pci/pci_common.c:359
#9  0x000000000050a140 in rte_bus_probe () at
/root/clean/dpdk/lib/librte_eal/common/eal_common_bus.c:98
#10 0x00000000004f55f4 in rte_eal_init (argc=1,
argv=0xfffffffff498) at
/root/clean/dpdk/lib/librte_eal/linuxapp/eal/eal.c:967
#11 0x000000000045af5c in main (argc=1, argv=0xfffffffff498) at
/root/clean/dpdk/app/test-pmd/testpmd.c:2483
Am I missing something here?
I'll look into those, thanks!
https://github.com/anatolyburakov/dpdk
I will be pushing quick fixes there before spinning new revisions,
so we can discover and fix bugs more rapidly. I'll fix compile
issues reported earlier, then I'll take a look at your issues. The
latter one seems like a typo, the former is probably a matter of
moving things around a bit.
(also, pull requests welcome if you find it easier to fix things
yourself and submit patches against my tree!)
Thanks for testing.
I've looked into the failures.
The VFIO one is not actually a failure. It only prints out errors
because rte_malloc is called before VFIO is initialized. However, once
VFIO *is* initialized, all of that memory would be added to VFIO, so
these error messages are harmless. Regardless, I've added a check to see
if init is finished before printing out those errors, so they won't be
printed out any more.
Second one is a typo on my part that got lost in one of the rebases.
I've pushed fixes for both into the github repo.
Although I do wonder where the DMA remapping errors come from. The error
message says "invalid argument", so that doesn't come from rte_service or
anything to do with rte_malloc - this is us not providing valid arguments to
VFIO. I'm not seeing these errors on my system. I'll check on others to be
sure.
I have taken a look at the GitHub tree; the issues with VFIO are gone, although
compilation issues with dpaa/dpaa2 are still present due to their dependency on
`rte_eal_get_physmem_layout`.
I've fixed the dpaa compile issue and pushed it to GitHub. I've tried to
keep the semantics the same as before, but I can't compile-test (let alone
test-test) them, as I don't have access to a system with a dpaa bus.
Also, you might want to know that the dpaa bus driver references
RTE_LIBRTE_DPAA_MAX_CRYPTODEV, which is only found in
config/common_armv8a_linuxapp but is not present in the base config. Not
sure if that's an issue.
Hi Anatoly,

I've checked out your branch on GitHub at
commit 61e6876fd1d2 ("remove panic from memalloc").

I am compiling it with clang:
clang version 3.8.1-24 (tags/RELEASE_381/final)
Target: x86_64-pc-linux-gnu
Thread model: posix
InstalledDir: /usr/bin

There are a lot of compilation errors that are not present on the main
DPDK tree. They need to be fixed.

Thanks,
--
Nélio Laranjeiro
6WIND
Burakov, Anatoly
2018-04-04 11:27:41 UTC
Permalink
Post by Anatoly Burakov
Currently it is not possible to use memory that is not owned by DPDK to
perform DMA. This scenario arises in vhost applications (like SPDK),
where the guest sends its own memory table. To fill this gap, provide an
API to allow registering arbitrary addresses in the VFIO container.
There's a larger issue raised by this patch. We do support hotplug for
VFIO devices, and VFIO will drop all maps whenever the last device used
by VFIO is unplugged. Since the primary use case for this API is mapping
non-DPDK-owned memory for DMA, this presents a problem: we're not
tracking the mapped areas anywhere, which means that after a last-device
hot-unplug followed by a first-device hotplug event, whatever DMA maps
the user has set up will be gone - only DPDK-owned memory will be
remapped.

One way we could solve this is to store user maps in a local tailq and
remap them on hotplug (see the sketch below). Another way would be to
not solve it and simply document the limitation - that using this API in
conjunction with hotplug results in undefined behavior. I would tend to
favor the first way.
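A minimal sketch of the first option, under the assumption that user maps
are remembered at rte_vfio_dma_map() time (all names below are
illustrative, not the actual EAL implementation):

#include <sys/queue.h>
#include <stdint.h>
#include <stdlib.h>

#include <rte_vfio.h>

struct user_mem_map {
	TAILQ_ENTRY(user_mem_map) next;
	uint64_t vaddr;
	uint64_t iova;
	uint64_t len;
};
static TAILQ_HEAD(, user_mem_map) user_maps =
	TAILQ_HEAD_INITIALIZER(user_maps);

/* remember a user-requested mapping so that it can be replayed later */
static int
track_user_map(uint64_t vaddr, uint64_t iova, uint64_t len)
{
	struct user_mem_map *m = malloc(sizeof(*m));

	if (m == NULL)
		return -1;
	m->vaddr = vaddr;
	m->iova = iova;
	m->len = len;
	TAILQ_INSERT_TAIL(&user_maps, m, next);
	return 0;
}

/* replay all remembered user maps, e.g. after the first device hotplug
 * re-creates the VFIO container and DPDK-owned memory has been remapped */
static void
replay_user_maps(void)
{
	struct user_mem_map *m;

	TAILQ_FOREACH(m, &user_maps, next)
		rte_vfio_dma_map(m->vaddr, m->iova, m->len);
}

The hotplug path would then call something like replay_user_maps() right
after re-establishing the DPDK-owned mappings, and rte_vfio_dma_unmap()
would drop the corresponding entry from the list.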
--
Thanks,
Anatoly
Burakov, Anatoly
2018-04-05 11:30:47 UTC
Permalink
Post by Anatoly Burakov
Currently it is not possible to use memory that is not owned by DPDK to
perform DMA. This scenario arises in vhost applications (like SPDK),
where the guest sends its own memory table. To fill this gap, provide an
API to allow registering arbitrary addresses in the VFIO container.
---
@Gowrishankar,

We've discussed this privately already, but just to make sure it is
publicly stated: as it is, parts of this patchset for PPC64 have
potential issues with them.

Unmapping and remapping the entire segment list on alloc introduces a
race condition - what if a DMA request comes in while we're in the
middle of remapping? We cannot realistically stop NICs from doing DMA
while some other thread is allocating memory.

There is also a larger issue that I've raised in a previous response to
this patch [1], and PPC64 will have it worse: not only will the described
issue happen on hot-unplug/hotplug, it may also happen during regular
allocations, because the PPC64 IOMMU will drop all mappings on window
resize.

[1] http://dpdk.org/ml/archives/dev/2018-April/095182.html
--
Thanks,
Anatoly
Shreyansh Jain
2018-04-05 14:24:18 UTC
Permalink
Hello Anatoly,
This patchset introduces dynamic memory allocation for DPDK (aka memory
hotplug). Based upon RFC submitted in December [1].
- IPC asynchronous request API patch [2]
- Function to return number of sockets [3]
- EAL IOVA fix [4]
- General outline of memory hotplug changes [5]
- EAL NUMA node count changes [6]
The vast majority of changes are in the EAL and malloc; the external API
disruption is minimal: a new set of API's are added for contiguous memory
allocation for rte_memzone, and a few API additions in rte_memory due to
switch to memseg_lists as opposed to memsegs. Every other API change is
internal to EAL, and all of the memory allocation/freeing is handled
through rte_malloc, with no externally visible API changes.
* Malloc heap adjusted to handle holes in address space
* Single memseg list replaced by multiple memseg lists
* VA space for hugepages is preallocated in advance
* Added alloc/free for pages happening as needed on rte_malloc/rte_free
* Added contiguous memory allocation API's for rte_memzone
* Added convenience API calls to walk over memsegs
* Integrated Pawel Wodkowski's patch for registering/unregistering memory
with VFIO [7]
* Callbacks for registering memory allocations
* Callbacks for allowing/disallowing allocations above specified limit
* Multiprocess support done via DPDK IPC introduced in 18.02
The biggest difference is a "memseg" now represents a single page (as opposed to
being a big contiguous block of pages). As a consequence, both memzones and
malloc elements are no longer guaranteed to be physically contiguous, unless
the user asks for it at reserve time. To preserve whatever functionality that
was dependent on previous behavior, a legacy memory option is also provided,
however it is expected (or perhaps vainly hoped) to be temporary solution.
Why multiple memseg lists instead of one? Since memseg is a single page now,
the list of memsegs will get quite big, and we need to locate pages somehow
when we allocate and free them. We could of course just walk the list and
allocate one contiguous chunk of VA space for memsegs, but this
implementation uses separate lists instead in order to speed up many
operations with memseg lists.
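As an illustration of the lookup this layout enables, here is a simplified
sketch (modeled on the free-page lookup earlier in this thread; field names
follow the series, but this is not verbatim EAL code): the page owning an
address is found by picking the list whose VA range contains it, then
indexing that list's fbarray by page offset instead of walking every memseg.

#include <stdint.h>

#include <rte_eal_memconfig.h>
#include <rte_fbarray.h>
#include <rte_memory.h>

static struct rte_memseg *
find_memseg(struct rte_mem_config *mcfg, const void *addr)
{
	unsigned int msl_idx;

	for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[msl_idx];
		uintptr_t start = (uintptr_t) msl->base_va;
		uintptr_t end = start + msl->memseg_arr.len * msl->hugepage_sz;
		int seg_idx;

		if (msl->base_va == NULL)
			continue;
		if ((uintptr_t) addr < start || (uintptr_t) addr >= end)
			continue;
		/* constant-time lookup within the matching list */
		seg_idx = ((uintptr_t) addr - start) / msl->hugepage_sz;
		return rte_fbarray_get(&msl->memseg_arr, seg_idx);
	}
	return NULL;
}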
- VFIO support is only smoke-tested (but is expected to work), VFIO support
with secondary processes is not tested; work is ongoing to validate VFIO
for all use cases
- FSLMC bus VFIO code is not yet integrated, work is in progress
For testing, it is recommended to use the GitHub repository [8], as it will
have all of the dependencies already integrated.
- Lots of compile fixes
- Fixes for multiprocess synchronization
- Fixes for mempool size calculation
- Added convenience memseg walk() API's
- Added alloc validation callback
v2: - fixed deadlock at init
- reverted rte_panic changes at init, this is now handled inside IPC
[1] http://dpdk.org/dev/patchwork/bundle/aburakov/Memory_RFC/
[2] http://dpdk.org/dev/patchwork/bundle/aburakov/IPC_Async_Request/
[3] http://dpdk.org/dev/patchwork/bundle/aburakov/Num_Sockets/
[4] http://dpdk.org/dev/patchwork/bundle/aburakov/IOVA_mode_fixes/
[5] http://dpdk.org/dev/patchwork/patch/34002/
[6] http://dpdk.org/dev/patchwork/patch/33853/
[7] http://dpdk.org/dev/patchwork/patch/24484/
[8] https://github.com/anatolyburakov/dpdk
Thanks for the huge work and continuously answering my barrage of questions.
Here is the update for dpaa/dpaa2. Incremental to [1]:

Note: results are based on the GitHub repo (16fbfef04a37bb9d714) rather than
this series over master, though it shouldn't make that big a difference.

PA: Physical Addressing mode
VA: Virtual Addressing mode

1. DPAA2 PA and VA work OK over the GitHub repo
a. Almost equal performance for VA cases compared to non-hotplug,
whether with --legacy-mem or without it ($ see below)
b. 70-90% drop in performance for the PA case, depending on the page
size used (# see below)

2. DPAA PA (there is no VA mode) works with a minor fix over v3, which
Anatoly knows about and might incorporate in v4 (Patch 50/68)
a. 70-90% performance drop (# see below)

($)
There are some changes in the dpaa2 code base which I will share against
the relevant patch in this series. They can be incorporated into v4 to
enable dpaa2 in VA mode.

(#)
Physical addressing cases for both, dpaa/dpaa2, depend heavily on the
fact that physical addressing was the base and was available in sorted
manner. This is reversed/negated with hotplugging support.
So, rework of both the drivers is required from this perspective. There
are some suggestions floated by Anatoly and internally, but work still
needs to be done.
It also impacts a lot of use-cases for virtualization (no-iommu).

[1] http://dpdk.org/ml/archives/dev/2018-March/094184.html
Thomas Monjalon
2018-04-05 14:12:55 UTC
Permalink
Post by Shreyansh Jain
Physical addressing cases for both, dpaa/dpaa2, depend heavily on the
fact that physical addressing was the base and was available in sorted
manner. This is reversed/negated with hotplugging support.
So, rework of both the drivers is required from this perspective. There
are some suggestions floated by Anatoly and internally, but work still
needs to be done.
It also impacts a lot of use-cases for virtualization (no-iommu).
So what is your recommendation?
Can you rework PA case in dpaa/dpaa2 drivers within 18.05 timeframe?
Hemant Agrawal
2018-04-05 14:20:04 UTC
Permalink
Hi Thomas,
Post by Shreyansh Jain
Physical addressing cases for both, dpaa/dpaa2, depend heavily on the
fact that physical addressing was the base and was available in sorted
manner. This is reversed/negated with hotplugging support.
So, rework of both the drivers is required from this perspective.
There are some suggestions floated by Anatoly and internally, but work
still needs to be done.
It also impacts a lot of use-cases for virtualization (no-iommu).
So what is your recommendation?
Can you rework PA case in dpaa/dpaa2 drivers within 18.05 timeframe?
We would like 2-3 more days on this before we can ack/nack this patch.
We are working on this on priority. The PA case rework is not a trivial change.

Regards,
Hemant
Hemant Agrawal
2018-04-06 12:01:58 UTC
Permalink
Hi Thomas
Post by Hemant Agrawal
Post by Shreyansh Jain
Physical addressing cases for both, dpaa/dpaa2, depend heavily on
the fact that physical addressing was the base and was available in
sorted manner. This is reversed/negated with hotplugging support.
So, rework of both the drivers is required from this perspective.
There are some suggestions floated by Anatoly and internally, but
work still needs to be done.
It also impacts a lot of use-cases for virtualization (no-iommu).
So what is your recommendation?
Can you rework PA case in dpaa/dpaa2 drivers within 18.05 timeframe?
We would like 2-3 more days on this before we can ack/nack this patch.
We are working on this on priority. The PA case rework is not a trivial change.
The patch is good to go. However, we will be making changes in dpaa/dpaa2 drivers to fix the PA issues shortly (within 18.05 timeframe)

Anatoly needs to take care of the following:
1. Comment by Shreyansh on " Re: [dpdk-dev] [PATCH v3 50/68] eal: replace memzone array with fbarray"
2. I could not apply the patches cleanly on current master.
Post by Hemant Agrawal
Regards,
Hemant
Burakov, Anatoly
2018-04-06 12:55:28 UTC
Permalink
Post by Hemant Agrawal
Hi Thomas
Post by Hemant Agrawal
-----Original Message-----
Sent: Thursday, April 05, 2018 7:43 PM
Subject: Re: [dpdk-dev] [PATCH v3 00/68] Memory Hotplug for DPDK
Importance: High
Post by Shreyansh Jain
Physical addressing cases for both, dpaa/dpaa2, depend heavily on
the fact that physical addressing was the base and was available in
sorted manner. This is reversed/negated with hotplugging support.
So, rework of both the drivers is required from this perspective.
There are some suggestions floated by Anatoly and internally, but
work still needs to be done.
It also impacts a lot of use-cases for virtualization (no-iommu).
So what is your recommendation?
Can you rework PA case in dpaa/dpaa2 drivers within 18.05 timeframe?
We would like 2-3 more days on this before we can ack/nack this patch.
We are working on this on priority. The PA case rework is not a trivial change.
The patch is good to go. However, we will be making changes in dpaa/dpaa2 drivers to fix the PA issues shortly (within 18.05 timeframe)
That's great to hear!
Post by Hemant Agrawal
1. Comment by Shreyansh on " Re: [dpdk-dev] [PATCH v3 50/68] eal: replace memzone array with fbarray"
Yes, that is already fixed in both github and upcoming v4.
Post by Hemant Agrawal
2. I could not apply the patches cleanly on current master.
The patchset has dependencies, listed in the cover letter. I'll rebase
on latest master before sending v4 just in case.
Post by Hemant Agrawal
Post by Hemant Agrawal
Regards,
Hemant
--
Thanks,
Anatoly
santosh
2018-04-05 18:59:57 UTC
Permalink
Hi Anatoly,
Tested-by: Santosh Shukla <***@caviumnetworks.com>