winsys/amdgpu,pb_slab: add slabs with 3/4 of power of two sizes to save memory

Instead of aligning slab allocations to powers of two (e.g. 129K -> 256K),
implement slab allocations with 3/4 of power of two sizes to reduce
overallocation (e.g. 129K -> 192K).

The limitation is that the alignment of a 3/4 allocation can be at most 1/3 of
the allocation size.

DeusExMD allocates 2.1 GB of VRAM. Without this, slabs waste 194 MB due
to alignment, i.e. 9.2%. This commit reduces the waste to 102 MB, i.e. 4.9%.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8683>
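
To make the size-class selection concrete, here is a minimal standalone sketch (an editorial illustration, not part of the commit; next_pow2 and pick_entry_size are made-up stand-ins for util_next_power_of_two and the bucket choice in pb_slab_alloc):

#include <stdbool.h>
#include <stdio.h>

/* Round a size up to the next power of two (illustrative helper). */
static unsigned next_pow2(unsigned x)
{
   unsigned p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

/* Pick the slab entry size for a request: a power of two, or 3/4 of it
 * when the request still fits in the smaller bucket. */
static unsigned pick_entry_size(unsigned size, bool allow_three_fourths)
{
   unsigned pot = next_pow2(size);

   if (allow_three_fourths && size <= pot * 3 / 4)
      return pot * 3 / 4;
   return pot;
}

int main(void)
{
   unsigned size = 129 * 1024;

   printf("power of two bucket: %u KB\n", pick_entry_size(size, false) / 1024); /* 256 */
   printf("3/4 bucket:          %u KB\n", pick_entry_size(size, true) / 1024);  /* 192 */
   return 0;
}

For the 129K example from the message, the 3/4 bucket drops the entry from 256K to 192K, i.e. a 25% smaller entry for any size that fits the 3/4 bucket.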
Author: Marek Olšák, 2021-01-23 21:53:30 -05:00 (committed by Marge Bot)
Parent: 35005881bf
Commit: e97af11ba9
6 changed files with 84 additions and 13 deletions

src/gallium/auxiliary/pipebuffer/pb_slab.c

@@ -102,11 +102,22 @@ pb_slab_alloc(struct pb_slabs *slabs, unsigned size, unsigned heap)
    struct pb_slab_group *group;
    struct pb_slab *slab;
    struct pb_slab_entry *entry;
+   unsigned entry_size = 1 << order;
+   bool three_fourths = false;
+
+   /* If the size is <= 3/4 of the entry size, use a slab with entries using
+    * 3/4 sizes to reduce overallocation.
+    */
+   if (slabs->allow_three_fourths_allocations && size <= entry_size * 3 / 4) {
+      entry_size = entry_size * 3 / 4;
+      three_fourths = true;
+   }
 
    assert(order < slabs->min_order + slabs->num_orders);
    assert(heap < slabs->num_heaps);
 
-   group_index = heap * slabs->num_orders + (order - slabs->min_order);
+   group_index = (heap * slabs->num_orders + (order - slabs->min_order)) *
+                 (1 + slabs->allow_three_fourths_allocations) + three_fourths;
    group = &slabs->groups[group_index];
 
    mtx_lock(&slabs->mutex);
@@ -136,7 +147,7 @@ pb_slab_alloc(struct pb_slabs *slabs, unsigned size, unsigned heap)
        * slabs for the same group, but that doesn't hurt correctness.
        */
       mtx_unlock(&slabs->mutex);
-      slab = slabs->slab_alloc(slabs->priv, heap, 1 << order, group_index);
+      slab = slabs->slab_alloc(slabs->priv, heap, entry_size, group_index);
       if (!slab)
          return NULL;
       mtx_lock(&slabs->mutex);
@@ -191,7 +202,7 @@ pb_slabs_reclaim(struct pb_slabs *slabs)
 bool
 pb_slabs_init(struct pb_slabs *slabs,
               unsigned min_order, unsigned max_order,
-              unsigned num_heaps,
+              unsigned num_heaps, bool allow_three_fourth_allocations,
               void *priv,
               slab_can_reclaim_fn *can_reclaim,
               slab_alloc_fn *slab_alloc,
@@ -206,6 +217,7 @@ pb_slabs_init(struct pb_slabs *slabs,
    slabs->min_order = min_order;
    slabs->num_orders = max_order - min_order + 1;
    slabs->num_heaps = num_heaps;
+   slabs->allow_three_fourths_allocations = allow_three_fourth_allocations;
 
    slabs->priv = priv;
    slabs->can_reclaim = can_reclaim;
@@ -214,7 +226,8 @@ pb_slabs_init(struct pb_slabs *slabs,
 
    list_inithead(&slabs->reclaim);
 
-   num_groups = slabs->num_orders * slabs->num_heaps;
+   num_groups = slabs->num_orders * slabs->num_heaps *
+                (1 + allow_three_fourth_allocations);
    slabs->groups = CALLOC(num_groups, sizeof(*slabs->groups));
    if (!slabs->groups)
       return false;
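
As a reading aid for the indexing above: with 3/4 allocations enabled, every (heap, order) pair owns two consecutive groups (power of two and 3/4), which is why num_groups doubles. A small standalone sketch of the same arithmetic (illustrative, not Mesa code):

#include <stdbool.h>
#include <stdio.h>

/* Recompute the group index: one group per (heap, order, three_fourths)
 * triple when 3/4 allocations are enabled, one per (heap, order) otherwise. */
static unsigned group_index(unsigned heap, unsigned order, unsigned min_order,
                            unsigned num_orders, bool allow_three_fourths,
                            bool three_fourths)
{
   return (heap * num_orders + (order - min_order)) *
          (1 + allow_three_fourths) + three_fourths;
}

int main(void)
{
   /* Assumed example parameters: 2 heaps, 2 orders starting at 8. */
   unsigned min_order = 8, num_orders = 2;

   for (unsigned heap = 0; heap < 2; heap++)
      for (unsigned order = min_order; order < min_order + num_orders; order++)
         for (unsigned tf = 0; tf < 2; tf++)
            printf("heap %u, order %u, 3/4 %u -> group %u\n", heap, order, tf,
                   group_index(heap, order, min_order, num_orders, true, tf));
   return 0;
}

With these parameters the indices run 0..7 with no gaps, matching num_groups = num_orders * num_heaps * 2.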

src/gallium/auxiliary/pipebuffer/pb_slab.h

@@ -116,8 +116,9 @@ struct pb_slabs
    unsigned min_order;
    unsigned num_orders;
    unsigned num_heaps;
+   bool allow_three_fourths_allocations;
 
-   /* One group per (heap, order) pair. */
+   /* One group per (heap, order, three_fourth_allocations). */
    struct pb_slab_group *groups;
 
    /* List of entries waiting to be reclaimed, i.e. they have been passed to
@@ -144,7 +145,7 @@ pb_slabs_reclaim(struct pb_slabs *slabs);
 bool
 pb_slabs_init(struct pb_slabs *slabs,
               unsigned min_order, unsigned max_order,
-              unsigned num_heaps,
+              unsigned num_heaps, bool allow_three_fourth_allocations,
               void *priv,
               slab_can_reclaim_fn *can_reclaim,
               slab_alloc_fn *slab_alloc,

src/gallium/winsys/amdgpu/drm/amdgpu_bo.c

@@ -682,7 +682,8 @@ static const struct pb_vtbl amdgpu_winsys_bo_slab_vtbl = {
    /* other functions are never called */
 };
 
-static unsigned get_slab_entry_alignment(struct amdgpu_winsys *ws, unsigned size)
+/* Return the power of two size of a slab entry matching the input size. */
+static unsigned get_slab_pot_entry_size(struct amdgpu_winsys *ws, unsigned size)
 {
    unsigned entry_size = util_next_power_of_two(size);
    unsigned min_entry_size = 1 << ws->bo_slabs[0].min_order;
@@ -690,6 +691,17 @@ static unsigned get_slab_entry_alignment(struct amdgpu_winsys *ws, unsigned size
    return MAX2(entry_size, min_entry_size);
 }
 
+/* Return the slab entry alignment. */
+static unsigned get_slab_entry_alignment(struct amdgpu_winsys *ws, unsigned size)
+{
+   unsigned entry_size = get_slab_pot_entry_size(ws, size);
+
+   if (size <= entry_size * 3 / 4)
+      return entry_size / 4;
+
+   return entry_size;
+}
+
 static struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
                                             unsigned entry_size,
                                             unsigned group_index,
@@ -719,6 +731,21 @@ static struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
    /* The slab size is twice the size of the largest possible entry. */
    slab_size = max_entry_size * 2;
 
+   if (!util_is_power_of_two_nonzero(entry_size)) {
+      assert(util_is_power_of_two_nonzero(entry_size * 4 / 3));
+
+      /* If the entry size is 3/4 of a power of two, we would waste space and not gain
+       * anything if we allocated only twice the power of two for the backing buffer:
+       *    2 * 3/4 = 1.5 usable with buffer size 2
+       *
+       * Allocating 5 times the entry size leads us to the next power of two and results
+       * in a much better memory utilization:
+       *    5 * 3/4 = 3.75 usable with buffer size 4
+       */
+      if (entry_size * 5 > slab_size)
+         slab_size = util_next_power_of_two(entry_size * 5);
+   }
+
    /* The largest slab should have the same size as the PTE fragment
     * size to get faster address translation.
     */
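
The ratios in the comment above work out as follows for a hypothetical 192 KB entry (3/4 of 256 KB); an editorial calculation, not part of the commit:

#include <stdio.h>

static unsigned next_pow2(unsigned x)
{
   unsigned p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

int main(void)
{
   unsigned entry = 192 * 1024; /* hypothetical 3/4-of-256K entry size */

   /* Two entries: the backing buffer rounds up to 512 KB, 384 KB usable (75%). */
   unsigned two = next_pow2(entry * 2);
   printf("2 entries: %u of %u KB used\n", entry * 2 / 1024, two / 1024);

   /* Five entries: the buffer rounds up to 1 MB, 960 KB usable (93.75%). */
   unsigned five = next_pow2(entry * 5);
   printf("5 entries: %u of %u KB used\n", (five / entry) * entry / 1024, five / 1024);
   return 0;
}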
@@ -736,8 +763,11 @@ static struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
    if (!slab->buffer)
       goto fail;
 
-   slab->base.num_entries = slab->buffer->base.size / entry_size;
+   slab_size = slab->buffer->base.size;
+
+   slab->base.num_entries = slab_size / entry_size;
    slab->base.num_free = slab->base.num_entries;
+   slab->entry_size = entry_size;
    slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries));
    if (!slab->entries)
       goto fail_buffer;
@@ -773,6 +803,13 @@ static struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
       list_addtail(&bo->u.slab.entry.head, &slab->base.free);
    }
 
+   /* Wasted alignment due to slabs with 3/4 allocations being aligned to a power of two. */
+   assert(slab->base.num_entries * entry_size <= slab_size);
+   if (domains & RADEON_DOMAIN_VRAM)
+      ws->slab_wasted_vram += slab_size - slab->base.num_entries * entry_size;
+   else
+      ws->slab_wasted_gtt += slab_size - slab->base.num_entries * entry_size;
+
    return &slab->base;
 
 fail_buffer:
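
Continuing the hypothetical 192 KB example from above: a 1 MB backing buffer holds 5 entries, so slab_size - num_entries * entry_size = 1024 KB - 960 KB = 64 KB is added to ws->slab_wasted_vram (or slab_wasted_gtt). amdgpu_bo_slab_free below subtracts the same amount when the slab is destroyed, so the counters only track live slabs.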
@@ -799,6 +836,14 @@ struct pb_slab *amdgpu_bo_slab_alloc_normal(void *priv, unsigned heap,
 void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab)
 {
    struct amdgpu_slab *slab = amdgpu_slab(pslab);
+   struct amdgpu_winsys *ws = slab->buffer->ws;
+   unsigned slab_size = slab->buffer->base.size;
+
+   assert(slab->base.num_entries * slab->entry_size <= slab_size);
+   if (slab->buffer->base.placement & RADEON_DOMAIN_VRAM)
+      ws->slab_wasted_vram -= slab_size - slab->base.num_entries * slab->entry_size;
+   else
+      ws->slab_wasted_gtt -= slab_size - slab->base.num_entries * slab->entry_size;
 
    for (unsigned i = 0; i < slab->base.num_entries; ++i) {
       amdgpu_bo_remove_fences(&slab->entries[i]);
@@ -1347,8 +1392,19 @@ amdgpu_bo_create(struct amdgpu_winsys *ws,
       if (size < alignment && alignment <= 4 * 1024)
          alloc_size = alignment;
 
-      if (alignment > get_slab_entry_alignment(ws, size))
-         goto no_slab; /* can't fulfil alignment requirements */
+      if (alignment > get_slab_entry_alignment(ws, alloc_size)) {
+         /* 3/4 allocations can return too small alignment. Try again with a power of two
+          * allocation size.
+          */
+         unsigned pot_size = get_slab_pot_entry_size(ws, alloc_size);
+
+         if (alignment <= pot_size) {
+            /* This size works but wastes some memory to fulfil the alignment. */
+            alloc_size = pot_size;
+         } else {
+            goto no_slab; /* can't fulfil alignment requirements */
+         }
+      }
 
       struct pb_slabs *slabs = get_slabs(ws, alloc_size, flags);
       entry = pb_slab_alloc(slabs, alloc_size, heap);
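
Taken together with get_slab_entry_alignment above: a 3/4 entry can only guarantee an alignment of 1/4 of the power of two (i.e. 1/3 of its own size), so larger alignments push the request back to the power-of-two bucket, and anything beyond that skips slabs entirely. A simplified standalone sketch of that decision (illustrative, not the winsys code; it folds the bucket choice and the fallback into one function and omits the no-slab case):

#include <stdio.h>

static unsigned next_pow2(unsigned x)
{
   unsigned p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

/* Prefer the 3/4 bucket, but fall back to the power-of-two bucket when the
 * requested alignment exceeds 1/4 of the power of two. */
static unsigned choose_alloc_size(unsigned size, unsigned alignment)
{
   unsigned pot = next_pow2(size);
   unsigned three_fourths = pot * 3 / 4;

   if (size <= three_fourths && alignment <= pot / 4)
      return three_fourths;
   return pot;
}

int main(void)
{
   /* A 129 KB request: 64 KB alignment fits the 192 KB bucket,
    * 128 KB alignment forces the 256 KB bucket. */
   printf("%u KB\n", choose_alloc_size(129 * 1024, 64 * 1024) / 1024);  /* 192 */
   printf("%u KB\n", choose_alloc_size(129 * 1024, 128 * 1024) / 1024); /* 256 */
   return 0;
}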

src/gallium/winsys/amdgpu/drm/amdgpu_bo.h

@@ -115,6 +115,7 @@ struct amdgpu_winsys_bo {
 
 struct amdgpu_slab {
    struct pb_slab base;
+   unsigned entry_size;
    struct amdgpu_winsys_bo *buffer;
    struct amdgpu_winsys_bo *entries;
 };

src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c

@@ -458,7 +458,7 @@ amdgpu_winsys_create(int fd, const struct pipe_screen_config *config,
 
       if (!pb_slabs_init(&aws->bo_slabs[i],
                          min_order, max_order,
-                         RADEON_MAX_SLAB_HEAPS,
+                         RADEON_MAX_SLAB_HEAPS, true,
                          aws,
                          amdgpu_bo_can_reclaim_slab,
                          amdgpu_bo_slab_alloc_normal,
@@ -471,7 +471,7 @@ amdgpu_winsys_create(int fd, const struct pipe_screen_config *config,
       if (aws->info.has_tmz_support &&
           !pb_slabs_init(&aws->bo_slabs_encrypted[i],
                          min_order, max_order,
-                         RADEON_MAX_SLAB_HEAPS,
+                         RADEON_MAX_SLAB_HEAPS, true,
                          aws,
                          amdgpu_bo_can_reclaim_slab,
                          amdgpu_bo_slab_alloc_encrypted,

src/gallium/winsys/radeon/drm/radeon_drm_winsys.c

@@ -861,7 +861,7 @@ radeon_drm_winsys_create(int fd, const struct pipe_screen_config *config,
     */
    if (!pb_slabs_init(&ws->bo_slabs,
                       RADEON_SLAB_MIN_SIZE_LOG2, RADEON_SLAB_MAX_SIZE_LOG2,
-                      RADEON_MAX_SLAB_HEAPS,
+                      RADEON_MAX_SLAB_HEAPS, false,
                       ws,
                       radeon_bo_can_reclaim_slab,
                       radeon_bo_slab_alloc,