winsys/amdgpu,pb_slab: add slabs with 3/4 of power of two sizes to save memory

Instead of rounding slab allocations up to the next power of two (e.g. 129K -> 256K), also provide slabs whose entries are 3/4 of a power of two (e.g. 129K -> 192K) to reduce overallocation. The limitation is that the alignment requirement must be at most 1/3rd of the allocation size.

DeusExMD allocates 2.1 GB of VRAM. Without this, slabs waste 194 MB (9.2%) due to alignment. This commit reduces the waste to 102 MB (4.9%).

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8683>
parent 35005881bf
commit e97af11ba9
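In short: for each power-of-two size class, the allocator now also offers a class at 3/4 of that size, and requests that fit are placed there. A minimal standalone sketch of the sizing rule follows; the helpers are illustrative only, not part of the patch (the real logic is in pb_slab_alloc below):

#include <assert.h>
#include <stdio.h>

/* Smallest power of two >= size (size > 0). */
static unsigned next_pot(unsigned size)
{
   unsigned pot = 1;
   while (pot < size)
      pot *= 2;
   return pot;
}

/* Entry size the slab allocator would pick for a request. */
static unsigned pick_entry_size(unsigned size)
{
   unsigned pot = next_pot(size);

   /* A request that fits in 3/4 of the power of two gets the smaller class. */
   if (size <= pot * 3 / 4)
      return pot * 3 / 4;
   return pot;
}

int main(void)
{
   /* 129K rounds to 192K instead of 256K, saving 64K per entry. */
   assert(pick_entry_size(129 * 1024) == 192 * 1024);
   printf("129K -> %uK\n", pick_entry_size(129 * 1024) / 1024);
   return 0;
}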
src/gallium/auxiliary/pipebuffer/pb_slab.c
@@ -102,11 +102,22 @@ pb_slab_alloc(struct pb_slabs *slabs, unsigned size, unsigned heap)
    struct pb_slab_group *group;
    struct pb_slab *slab;
    struct pb_slab_entry *entry;
+   unsigned entry_size = 1 << order;
+   bool three_fourths = false;
+
+   /* If the size is <= 3/4 of the entry size, use a slab with entries using
+    * 3/4 sizes to reduce overallocation.
+    */
+   if (slabs->allow_three_fourth_allocations && size <= entry_size * 3 / 4) {
+      entry_size = entry_size * 3 / 4;
+      three_fourths = true;
+   }

    assert(order < slabs->min_order + slabs->num_orders);
    assert(heap < slabs->num_heaps);

-   group_index = heap * slabs->num_orders + (order - slabs->min_order);
+   group_index = (heap * slabs->num_orders + (order - slabs->min_order)) *
+                 (1 + slabs->allow_three_fourth_allocations) + three_fourths;
    group = &slabs->groups[group_index];

    mtx_lock(&slabs->mutex);
@@ -136,7 +147,7 @@ pb_slab_alloc(struct pb_slabs *slabs, unsigned size, unsigned heap)
     * slabs for the same group, but that doesn't hurt correctness.
     */
    mtx_unlock(&slabs->mutex);
-   slab = slabs->slab_alloc(slabs->priv, heap, 1 << order, group_index);
+   slab = slabs->slab_alloc(slabs->priv, heap, entry_size, group_index);
    if (!slab)
       return NULL;
    mtx_lock(&slabs->mutex);
@@ -191,7 +202,7 @@ pb_slabs_reclaim(struct pb_slabs *slabs)
 bool
 pb_slabs_init(struct pb_slabs *slabs,
               unsigned min_order, unsigned max_order,
-              unsigned num_heaps,
+              unsigned num_heaps, bool allow_three_fourth_allocations,
               void *priv,
               slab_can_reclaim_fn *can_reclaim,
               slab_alloc_fn *slab_alloc,
@@ -206,6 +217,7 @@ pb_slabs_init(struct pb_slabs *slabs,
    slabs->min_order = min_order;
    slabs->num_orders = max_order - min_order + 1;
    slabs->num_heaps = num_heaps;
+   slabs->allow_three_fourth_allocations = allow_three_fourth_allocations;

    slabs->priv = priv;
    slabs->can_reclaim = can_reclaim;
@@ -214,7 +226,8 @@ pb_slabs_init(struct pb_slabs *slabs,

    list_inithead(&slabs->reclaim);

-   num_groups = slabs->num_orders * slabs->num_heaps;
+   num_groups = slabs->num_orders * slabs->num_heaps *
+                (1 + allow_three_fourth_allocations);
    slabs->groups = CALLOC(num_groups, sizeof(*slabs->groups));
    if (!slabs->groups)
       return false;
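The group array doubles when 3/4 classes are enabled: each (heap, order) pair owns two adjacent groups, the even index for power-of-two entries and the odd index for the 3/4 variants. An illustrative sketch of the mapping with assumed parameters (not part of the patch):

#include <stdio.h>

/* Hypothetical configuration: 2 heaps, orders 8..12, 3/4 classes enabled. */
static unsigned group_index(unsigned heap, unsigned order, unsigned min_order,
                            unsigned num_orders, int three_fourths)
{
   return (heap * num_orders + (order - min_order)) * 2 + three_fourths;
}

int main(void)
{
   /* heap 1, order 10: group 14 for power-of-two entries, 15 for 3/4 entries. */
   printf("%u %u\n", group_index(1, 10, 8, 5, 0), group_index(1, 10, 8, 5, 1));
   return 0;
}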
src/gallium/auxiliary/pipebuffer/pb_slab.h
@@ -116,8 +116,9 @@ struct pb_slabs
    unsigned min_order;
    unsigned num_orders;
    unsigned num_heaps;
+   bool allow_three_fourth_allocations;

-   /* One group per (heap, order) pair. */
+   /* One group per (heap, order, three_fourth_allocations). */
    struct pb_slab_group *groups;

    /* List of entries waiting to be reclaimed, i.e. they have been passed to
@@ -144,7 +145,7 @@ pb_slabs_reclaim(struct pb_slabs *slabs);
 bool
 pb_slabs_init(struct pb_slabs *slabs,
               unsigned min_order, unsigned max_order,
-              unsigned num_heaps,
+              unsigned num_heaps, bool allow_three_fourth_allocations,
               void *priv,
               slab_can_reclaim_fn *can_reclaim,
               slab_alloc_fn *slab_alloc,
src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -682,7 +682,8 @@ static const struct pb_vtbl amdgpu_winsys_bo_slab_vtbl = {
    /* other functions are never called */
 };

-static unsigned get_slab_entry_alignment(struct amdgpu_winsys *ws, unsigned size)
+/* Return the power of two size of a slab entry matching the input size. */
+static unsigned get_slab_pot_entry_size(struct amdgpu_winsys *ws, unsigned size)
 {
    unsigned entry_size = util_next_power_of_two(size);
    unsigned min_entry_size = 1 << ws->bo_slabs[0].min_order;
@@ -690,6 +691,17 @@ static unsigned get_slab_entry_alignment(struct amdgpu_winsys *ws, unsigned size)
    return MAX2(entry_size, min_entry_size);
 }

+/* Return the slab entry alignment. */
+static unsigned get_slab_entry_alignment(struct amdgpu_winsys *ws, unsigned size)
+{
+   unsigned entry_size = get_slab_pot_entry_size(ws, size);
+
+   if (size <= entry_size * 3 / 4)
+      return entry_size / 4;
+
+   return entry_size;
+}
+
 static struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
                                             unsigned entry_size,
                                             unsigned group_index,
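Worked numbers for the new helper, as an illustrative check (not part of the patch): a 3/4 entry is guaranteed only a quarter of the underlying power of two, i.e. one third of the entry size.

#include <assert.h>

int main(void)
{
   /* A 129K request: the power-of-two entry size is 256K, so the 3/4 class
    * is 192K. Entries start at multiples of 192K = 3 * 64K, which guarantees
    * only 64K alignment: 256K / 4, i.e. one third of the 192K entry.
    */
   unsigned pot = 256 * 1024;
   unsigned entry = pot * 3 / 4;   /* 192K */
   unsigned align = pot / 4;       /* 64K  */
   assert(align == entry / 3);
   return 0;
}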
@@ -719,6 +731,21 @@ static struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
    /* The slab size is twice the size of the largest possible entry. */
    slab_size = max_entry_size * 2;

+   if (!util_is_power_of_two_nonzero(entry_size)) {
+      assert(util_is_power_of_two_nonzero(entry_size * 4 / 3));
+
+      /* If the entry size is 3/4 of a power of two, we would waste space and not gain
+       * anything if we allocated only twice the power of two for the backing buffer:
+       *   2 * 3/4 = 1.5 usable with buffer size 2
+       *
+       * Allocating 5 times the entry size leads us to the next power of two and results
+       * in a much better memory utilization:
+       *   5 * 3/4 = 3.75 usable with buffer size 4
+       */
+      if (entry_size * 5 > slab_size)
+         slab_size = util_next_power_of_two(entry_size * 5);
+   }
+
    /* The largest slab should have the same size as the PTE fragment
     * size to get faster address translation.
     */
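An illustrative check of the arithmetic in the comment above, with concrete sizes (not part of the patch):

#include <assert.h>

int main(void)
{
   unsigned entry = 192 * 1024;

   /* Twice the power of two (512K): 2 entries fit, 128K of 512K wasted (25%). */
   unsigned small_slab = 512 * 1024;
   assert(small_slab / entry == 2);
   assert(small_slab - 2 * entry == 128 * 1024);

   /* next_power_of_two(5 * 192K = 960K) = 1M: 5 entries fit, 64K wasted (6.25%). */
   unsigned big_slab = 1024 * 1024;
   assert(big_slab / entry == 5);
   assert(big_slab - 5 * entry == 64 * 1024);
   return 0;
}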
@@ -736,8 +763,11 @@ static struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
    if (!slab->buffer)
       goto fail;

-   slab->base.num_entries = slab->buffer->base.size / entry_size;
+   slab_size = slab->buffer->base.size;
+
+   slab->base.num_entries = slab_size / entry_size;
    slab->base.num_free = slab->base.num_entries;
+   slab->entry_size = entry_size;
    slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries));
    if (!slab->entries)
       goto fail_buffer;
@@ -773,6 +803,13 @@ static struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
       list_addtail(&bo->u.slab.entry.head, &slab->base.free);
    }

+   /* Wasted alignment due to slabs with 3/4 allocations being aligned to a power of two. */
+   assert(slab->base.num_entries * entry_size <= slab_size);
+   if (domains & RADEON_DOMAIN_VRAM)
+      ws->slab_wasted_vram += slab_size - slab->base.num_entries * entry_size;
+   else
+      ws->slab_wasted_gtt += slab_size - slab->base.num_entries * entry_size;
+
    return &slab->base;

 fail_buffer:
@@ -799,6 +836,14 @@ struct pb_slab *amdgpu_bo_slab_alloc_normal(void *priv, unsigned heap,
 void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab)
 {
    struct amdgpu_slab *slab = amdgpu_slab(pslab);
+   struct amdgpu_winsys *ws = slab->buffer->ws;
+   unsigned slab_size = slab->buffer->base.size;
+
+   assert(slab->base.num_entries * slab->entry_size <= slab_size);
+   if (slab->buffer->base.placement & RADEON_DOMAIN_VRAM)
+      ws->slab_wasted_vram -= slab_size - slab->base.num_entries * slab->entry_size;
+   else
+      ws->slab_wasted_gtt -= slab_size - slab->base.num_entries * slab->entry_size;

    for (unsigned i = 0; i < slab->base.num_entries; ++i) {
       amdgpu_bo_remove_fences(&slab->entries[i]);
@@ -1347,8 +1392,19 @@ amdgpu_bo_create(struct amdgpu_winsys *ws,
       if (size < alignment && alignment <= 4 * 1024)
          alloc_size = alignment;

-      if (alignment > get_slab_entry_alignment(ws, size))
-         goto no_slab; /* can't fulfil alignment requirements */
+      if (alignment > get_slab_entry_alignment(ws, alloc_size)) {
+         /* 3/4 allocations can return too small alignment. Try again with a power of two
+          * allocation size.
+          */
+         unsigned pot_size = get_slab_pot_entry_size(ws, alloc_size);
+
+         if (alignment <= pot_size) {
+            /* This size works but wastes some memory to fulfil the alignment. */
+            alloc_size = pot_size;
+         } else {
+            goto no_slab; /* can't fulfil alignment requirements */
+         }
+      }

       struct pb_slabs *slabs = get_slabs(ws, alloc_size, flags);
       entry = pb_slab_alloc(slabs, alloc_size, heap);
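How the fallback above plays out, with assumed sizes (an illustrative sketch, not part of the patch): a 129K request needing 128K alignment cannot use the 192K class, whose guaranteed alignment is only 64K, so alloc_size is bumped to the 256K power-of-two class at the cost of extra padding.

#include <assert.h>

int main(void)
{
   unsigned alloc_size = 129 * 1024, alignment = 128 * 1024;
   unsigned pot_size = 256 * 1024;          /* power-of-two class for 129K */
   unsigned three_fourth_align = 64 * 1024; /* what the 192K class guarantees */

   if (alignment > three_fourth_align) {
      /* 3/4 class alignment is too weak; fall back to the power of two. */
      assert(alignment <= pot_size);
      alloc_size = pot_size;
   }
   assert(alloc_size == 256 * 1024);
   return 0;
}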
src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
@@ -115,6 +115,7 @@ struct amdgpu_winsys_bo {

 struct amdgpu_slab {
    struct pb_slab base;
+   unsigned entry_size;
    struct amdgpu_winsys_bo *buffer;
    struct amdgpu_winsys_bo *entries;
 };
src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -458,7 +458,7 @@ amdgpu_winsys_create(int fd, const struct pipe_screen_config *config,

       if (!pb_slabs_init(&aws->bo_slabs[i],
                          min_order, max_order,
-                         RADEON_MAX_SLAB_HEAPS,
+                         RADEON_MAX_SLAB_HEAPS, true,
                          aws,
                          amdgpu_bo_can_reclaim_slab,
                          amdgpu_bo_slab_alloc_normal,
@@ -471,7 +471,7 @@ amdgpu_winsys_create(int fd, const struct pipe_screen_config *config,
       if (aws->info.has_tmz_support &&
           !pb_slabs_init(&aws->bo_slabs_encrypted[i],
                          min_order, max_order,
-                         RADEON_MAX_SLAB_HEAPS,
+                         RADEON_MAX_SLAB_HEAPS, true,
                          aws,
                          amdgpu_bo_can_reclaim_slab,
                          amdgpu_bo_slab_alloc_encrypted,
src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -861,7 +861,7 @@ radeon_drm_winsys_create(int fd, const struct pipe_screen_config *config,
     */
    if (!pb_slabs_init(&ws->bo_slabs,
                       RADEON_SLAB_MIN_SIZE_LOG2, RADEON_SLAB_MAX_SIZE_LOG2,
-                      RADEON_MAX_SLAB_HEAPS,
+                      RADEON_MAX_SLAB_HEAPS, false,
                       ws,
                       radeon_bo_can_reclaim_slab,
                       radeon_bo_slab_alloc,