Your application allocates a lot of 24-byte objects and you want to combine these objectives:
- align each 24-byte object on a 16-byte boundary, presumably to read its contents with SIMD instructions
- use as little memory as possible, ideally just 24 bytes per object.
These objectives are incompatible: if the objects require a 16-byte alignment, they must be at least 32 bytes apart, regardless of how you allocate memory.
The C library malloc() probably enforces 16-byte alignment on your system already (it is a common requirement on 64-bit systems for SIMD-compatible data), but could use the 8-byte slack at the end of the block for its own bookkeeping data. jemalloc certainly does. So the overhead is not wasted but inherent to the allocation algorithm.
Allocating objects in pools does not help with the packing, because of the alignment constraint. It might be more efficient, but modern malloc() implementations are remarkably efficient and some do use thread-based pools (for example tcmalloc).
Designing your own allocation scheme is tricky and error-prone; linking a custom malloc() implementation is not trivial either, as it may cause problems with the C library functions' own use of malloc(). I would strongly advise against these approaches unless you are very proficient in C and have a good understanding of your system.
There is one possible direction to improve packing: if you also allocate many 8-byte objects, you could interlace them in combined pools of 32-byte chunks, using the first 24 bytes for a 24-byte object aligned on a 16-byte boundary and the 8 remaining bytes for a separate 8-byte object aligned on an 8-byte boundary.
Another approach would be to split the storage of your 24-byte objects into an array of 16-byte parts and another array of 8-byte parts using the same index to access the parts of the same logical object. If you know the maximum number of such objects to allocate, it is a workable solution. You would use index values instead of pointers to access the parts. This may require substantial modifications of your code.
Memory is quite cheap and abundant on current systems. Unless you target existing deployed embedded systems, specifying more RAM for your application is a simple and effective approach.
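Here is a pool allocator for 24-byte objects with very small overhead. Note that it packs the objects 24 bytes apart, so consecutive objects cannot all be aligned on 16-byte boundaries. Try it and see if you use less memory with it and get better performance: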
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
// Free-list node. free24() writes one of these over the start of a released
// object, so a free object costs no extra storage.
struct pool_link_t {
    struct pool_link_t *next;   // following entry on the same free list, or NULL
};
typedef struct pool_link_t pool_link_t;
// Header written by malloc24() at the start of every page of a block except
// the first one; free24() reads it to get from an object back to its block.
struct pool_page_t {
    struct pool_block_t *head;  // block that owns this page
};
typedef struct pool_page_t pool_page_t;
// Header placed at the start of each mmapped block (and therefore at the
// start of the block's first page). `head` must stay the first member:
// free24() reads the first pointer of any page to locate the owning block.
struct pool_block_t {
    struct pool_block_t *head;  // the block itself (first-page counterpart of pool_page_t.head)
    struct pool_block_t *next;  // next block in the circular list
    struct pool_block_t *prev;  // previous block in the circular list
    size_t block_size;          // size passed to mmap(), reused for munmap()
    size_t avail_page_count;    // pages of the block that have not been prepared yet
    size_t avail_count;         // never-handed-out objects left in the current page
    size_t free_count;          // number of entries on free_list
    pool_link_t *free_list;     // objects returned through free24()
    pool_link_t *avail_ptr;     // next never-handed-out object in the current page
};
typedef struct pool_block_t pool_block_t;
// Granularity at which page headers are laid out inside a block. free24()
// masks an object address with ~(PAGE_SIZE - 1), so this must be a power of two.
// NOTE(review): hard-coded; presumably it must not exceed the alignment of the
// addresses mmap() returns on the target system - confirm per platform.
#define PAGE_SIZE 0x1000 // system dependent
// Number of bytes requested from mmap() for each block.
#define POOL_BLOCK_SIZE 0x100000 // must be a multiple of PAGE_SIZE
// Size in bytes of one object; malloc24() advances a pointer-typed cursor by
// POOL_OBJ_SIZE / sizeof(pointer) elements per object.
#define POOL_OBJ_SIZE 24 // must be a multiple of sizeof(void*)
static pool_block_t dummy_arena = {
&dummy_arena, &dummy_arena, &dummy_arena, 0, 0, 0, 0, NULL, NULL,
};
static pool_block_t *pool24 = &dummy_arena;
void *malloc24(void) {
pool_block_t *p, *startp;
for (startp = p = pool24;; pool24 = p = p->next) {
if (p->free_count) {
pool_link_t *link = p->free_list;
p->free_list = link->next;
p->free_count--;
return link;
}
if (p->avail_count) {
void *ptr = p->avail_ptr;
p->avail_ptr += POOL_OBJ_SIZE / sizeof(pool_block_t*);
if (--p->avail_count == 0) {
if (p->avail_page_count) { // prep the next page of the block
pool_page_t *page = (void *)((unsigned char *)p + POOL_BLOCK_SIZE - p->avail_page_count * PAGE_SIZE);
page->head = p;
p->avail_ptr = (void *)(page + 1);
p->avail_count = (PAGE_SIZE - sizeof(pool_block_t*)) / POOL_OBJ_SIZE;
p->avail_page_count--;
}
}
return ptr;
}
if (p->next == startp) {
pool_block_t *np = mmap(NULL, POOL_BLOCK_SIZE,
PROT_READ | PROT_WRITE,
MAP_ANON | MAP_PRIVATE, -1, 0);
if (np == MAP_FAILED)
return NULL;
np->head = np;
np->block_size = POOL_BLOCK_SIZE;
// prep the first page of the block
np->avail_page_count = POOL_BLOCK_SIZE / PAGE_SIZE - 1;
np->avail_count = (PAGE_SIZE - sizeof(pool_block_t)) / POOL_OBJ_SIZE;
np->avail_ptr = (void *)(np + 1);
np->free_count = 0;
np->free_list = NULL;
// link the block in the arena
np->prev = p;
np->next = p->next;
p->next = np->next->prev = np;
}
}
}
// Return an object to the block it came from.
//
// p must be NULL (ignored) or a pointer obtained from malloc24(): the owning
// block is found by rounding p down to a PAGE_SIZE boundary and reading the
// pointer stored there. The object is pushed on that block's free list; no
// memory is given back to the system here.
void free24(void *p) {
    if (p == NULL) {
        return;
    }

    // The first pointer of every page of a block identifies that block.
    pool_block_t *page_start = (void *)((uintptr_t)p & ~(PAGE_SIZE - 1));
    pool_block_t *owner = page_start->head;

    pool_link_t *node = p;
    node->next = owner->free_list;
    owner->free_list = node;
    owner->free_count++;
}
void trim_arena24(void) {
pool_block_t *p;
pool24 = &dummy_arena;
while ((p = dummy_arena.next) != &dummy_arena) {
if (p->free_count == (PAGE_SIZE - sizeof(pool_block_t)) / POOL_OBJ_SIZE +
(PAGE_SIZE - sizeof(pool_block_t*)) / POOL_OBJ_SIZE * (POOL_BLOCK_SIZE / PAGE_SIZE - 1 - p->avail_page_count)) {
dummy_arena.next = p->next;
p->next->prev = p->prev;
munmap(p, p->block_size);
}
}
}
// Unmap every block unconditionally and reset pool24 to the sentinel.
// Any object still in use becomes invalid. The munmap() result is not checked.
void free_arena24(void) {
    pool_block_t *blk = dummy_arena.next;

    pool24 = &dummy_arena;
    while (blk != &dummy_arena) {
        pool_block_t *following = blk->next;

        // Unlink the first block of the list, then release it.
        dummy_arena.next = following;
        following->prev = blk->prev;
        munmap(blk, blk->block_size);

        blk = following;
    }
}
// Tracing hook for the test driver. As written, the trailing "//s" is an
// ordinary comment, so TRACE(x) expands to nothing; delete the "//" to make
// TRACE(x) expand to its argument.
#define TRACE(s) //s
// Number of slots exercised by the test driver in main().
#define TEST_COUNT (16 << 20)
// One slot per test object; NULL marks a slot that currently holds no object.
static void *ptr[TEST_COUNT];
// Building with BENCH_REF defined makes the code below this point call the C
// library allocator instead of the pool allocator, as a reference for comparison.
#ifdef BENCH_REF
#define malloc24() malloc(24)
#define free24(p) free(p)
#endif
// Test driver: fill every slot, free TEST_COUNT randomly chosen slots (slots
// that are already empty are skipped), refill the empty slots, free everything,
// then check that trimming leaves only the sentinel in the arena.
int main(void) {
    TRACE(printf("testing %d\n", TEST_COUNT));

    // Phase 1: allocate one object per slot.
    for (int slot = 0; slot < TEST_COUNT; slot++) {
        ptr[slot] = malloc24();
        TRACE(printf("%d: malloc24() -> %p\n", slot, ptr[slot]));
    }

    // Phase 2: release randomly selected slots.
    for (int round = 0; round < TEST_COUNT; round++) {
        int victim = rand() % TEST_COUNT;
        if (ptr[victim] == NULL) {
            continue;
        }
        TRACE(printf("%d: free24(%p)\n", victim, ptr[victim]));
        free24(ptr[victim]);
        ptr[victim] = NULL;
    }

    // Phase 3: refill the slots emptied in phase 2.
    for (int slot = 0; slot < TEST_COUNT; slot++) {
        if (ptr[slot] == NULL) {
            ptr[slot] = malloc24();
            TRACE(printf("%d: malloc24() -> %p\n", slot, ptr[slot]));
        }
    }

    // Phase 4: release everything.
    for (int slot = 0; slot < TEST_COUNT; slot++) {
        TRACE(printf("%d: free24(%p)\n", slot, ptr[slot]));
        free24(ptr[slot]);
        ptr[slot] = NULL;
    }

    TRACE(printf("trim_arena24()\n"));
    trim_arena24();

    // With nothing left allocated, the arena should be just the sentinel.
    if (pool24 != &dummy_arena) {
        printf("pool24 != &dummy_arena\n");
    }
    if (pool24->next != pool24) {
        printf("pool24->next != pool24\n");
    }
    if (pool24->prev != pool24) {
        printf("pool24->prev != pool24\n");
    }

    TRACE(printf("free_arena24()\n"));
    free_arena24();
    TRACE(printf("done\n"));
    return 0;
}