diff --git a/src/gallium/auxiliary/pipebuffer/pb_slab.c b/src/gallium/auxiliary/pipebuffer/pb_slab.c
index 37f2400dd9b..9918e854b1e 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_slab.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_slab.c
@@ -102,11 +102,22 @@ pb_slab_alloc(struct pb_slabs *slabs, unsigned size, unsigned heap)
    struct pb_slab_group *group;
    struct pb_slab *slab;
    struct pb_slab_entry *entry;
+   unsigned entry_size = 1 << order;
+   bool three_fourths = false;
+
+   /* If the size is <= 3/4 of the entry size, use a slab with entries using
+    * 3/4 sizes to reduce overallocation.
+    */
+   if (slabs->allow_three_fourths_allocations && size <= entry_size * 3 / 4) {
+      entry_size = entry_size * 3 / 4;
+      three_fourths = true;
+   }
 
    assert(order < slabs->min_order + slabs->num_orders);
    assert(heap < slabs->num_heaps);
 
-   group_index = heap * slabs->num_orders + (order - slabs->min_order);
+   group_index = (heap * slabs->num_orders + (order - slabs->min_order)) *
+                 (1 + slabs->allow_three_fourths_allocations) + three_fourths;
    group = &slabs->groups[group_index];
 
    mtx_lock(&slabs->mutex);
@@ -136,7 +147,7 @@ pb_slab_alloc(struct pb_slabs *slabs, unsigned size, unsigned heap)
        * slabs for the same group, but that doesn't hurt correctness.
        */
       mtx_unlock(&slabs->mutex);
-      slab = slabs->slab_alloc(slabs->priv, heap, 1 << order, group_index);
+      slab = slabs->slab_alloc(slabs->priv, heap, entry_size, group_index);
       if (!slab)
          return NULL;
       mtx_lock(&slabs->mutex);
@@ -191,7 +202,7 @@ pb_slabs_reclaim(struct pb_slabs *slabs)
 bool
 pb_slabs_init(struct pb_slabs *slabs,
               unsigned min_order, unsigned max_order,
-              unsigned num_heaps,
+              unsigned num_heaps, bool allow_three_fourth_allocations,
               void *priv,
               slab_can_reclaim_fn *can_reclaim,
               slab_alloc_fn *slab_alloc,
@@ -206,6 +217,7 @@ pb_slabs_init(struct pb_slabs *slabs,
    slabs->min_order = min_order;
    slabs->num_orders = max_order - min_order + 1;
    slabs->num_heaps = num_heaps;
+   slabs->allow_three_fourths_allocations = allow_three_fourth_allocations;
 
    slabs->priv = priv;
    slabs->can_reclaim = can_reclaim;
@@ -214,7 +226,8 @@ pb_slabs_init(struct pb_slabs *slabs,
 
    list_inithead(&slabs->reclaim);
 
-   num_groups = slabs->num_orders * slabs->num_heaps;
+   num_groups = slabs->num_orders * slabs->num_heaps *
+                (1 + allow_three_fourth_allocations);
    slabs->groups = CALLOC(num_groups, sizeof(*slabs->groups));
    if (!slabs->groups)
       return false;
diff --git a/src/gallium/auxiliary/pipebuffer/pb_slab.h b/src/gallium/auxiliary/pipebuffer/pb_slab.h
index 584e31157d7..a7940b6b513 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_slab.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_slab.h
@@ -116,8 +116,9 @@ struct pb_slabs
    unsigned min_order;
    unsigned num_orders;
    unsigned num_heaps;
+   bool allow_three_fourths_allocations;
 
-   /* One group per (heap, order) pair. */
+   /* One group per (heap, order, three_fourth_allocations). */
    struct pb_slab_group *groups;
 
    /* List of entries waiting to be reclaimed, i.e. they have been passed to
@@ -144,7 +145,7 @@ pb_slabs_reclaim(struct pb_slabs *slabs);
 bool
 pb_slabs_init(struct pb_slabs *slabs,
               unsigned min_order, unsigned max_order,
-              unsigned num_heaps,
+              unsigned num_heaps, bool allow_three_fourth_allocations,
               void *priv,
               slab_can_reclaim_fn *can_reclaim,
               slab_alloc_fn *slab_alloc,
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 020cf0b0603..fc8e0dda07b 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -682,7 +682,8 @@ static const struct pb_vtbl amdgpu_winsys_bo_slab_vtbl = {
    /* other functions are never called */
 };
 
-static unsigned get_slab_entry_alignment(struct amdgpu_winsys *ws, unsigned size)
+/* Return the power of two size of a slab entry matching the input size. */
+static unsigned get_slab_pot_entry_size(struct amdgpu_winsys *ws, unsigned size)
 {
    unsigned entry_size = util_next_power_of_two(size);
    unsigned min_entry_size = 1 << ws->bo_slabs[0].min_order;
@@ -690,6 +691,17 @@ static unsigned get_slab_entry_alignment(struct amdgpu_winsys *ws, unsigned size
    return MAX2(entry_size, min_entry_size);
 }
 
+/* Return the slab entry alignment. */
+static unsigned get_slab_entry_alignment(struct amdgpu_winsys *ws, unsigned size)
+{
+   unsigned entry_size = get_slab_pot_entry_size(ws, size);
+
+   if (size <= entry_size * 3 / 4)
+      return entry_size / 4;
+
+   return entry_size;
+}
+
 static struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
                                             unsigned entry_size,
                                             unsigned group_index,
@@ -719,6 +731,21 @@ static struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
    /* The slab size is twice the size of the largest possible entry. */
    slab_size = max_entry_size * 2;
 
+   if (!util_is_power_of_two_nonzero(entry_size)) {
+      assert(util_is_power_of_two_nonzero(entry_size * 4 / 3));
+
+      /* If the entry size is 3/4 of a power of two, we would waste space and not gain
+       * anything if we allocated only twice the power of two for the backing buffer:
+       *   2 * 3/4 = 1.5 usable with buffer size 2
+       *
+       * Allocating 5 times the entry size leads us to the next power of two and results
+       * in a much better memory utilization:
+       *   5 * 3/4 = 3.75 usable with buffer size 4
+       */
+      if (entry_size * 5 > slab_size)
+         slab_size = util_next_power_of_two(entry_size * 5);
+   }
+
    /* The largest slab should have the same size as the PTE fragment
     * size to get faster address translation.
     */
@@ -736,8 +763,11 @@ static struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
    if (!slab->buffer)
       goto fail;
 
-   slab->base.num_entries = slab->buffer->base.size / entry_size;
+   slab_size = slab->buffer->base.size;
+
+   slab->base.num_entries = slab_size / entry_size;
    slab->base.num_free = slab->base.num_entries;
+   slab->entry_size = entry_size;
    slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries));
    if (!slab->entries)
       goto fail_buffer;
@@ -773,6 +803,13 @@ static struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
       list_addtail(&bo->u.slab.entry.head, &slab->base.free);
    }
 
+   /* Wasted alignment due to slabs with 3/4 allocations being aligned to a power of two. */
+   assert(slab->base.num_entries * entry_size <= slab_size);
+   if (domains & RADEON_DOMAIN_VRAM)
+      ws->slab_wasted_vram += slab_size - slab->base.num_entries * entry_size;
+   else
+      ws->slab_wasted_gtt += slab_size - slab->base.num_entries * entry_size;
+
    return &slab->base;
 
 fail_buffer:
@@ -799,6 +836,14 @@ struct pb_slab *amdgpu_bo_slab_alloc_normal(void *priv, unsigned heap,
 void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab)
 {
    struct amdgpu_slab *slab = amdgpu_slab(pslab);
+   struct amdgpu_winsys *ws = slab->buffer->ws;
+   unsigned slab_size = slab->buffer->base.size;
+
+   assert(slab->base.num_entries * slab->entry_size <= slab_size);
+   if (slab->buffer->base.placement & RADEON_DOMAIN_VRAM)
+      ws->slab_wasted_vram -= slab_size - slab->base.num_entries * slab->entry_size;
+   else
+      ws->slab_wasted_gtt -= slab_size - slab->base.num_entries * slab->entry_size;
 
    for (unsigned i = 0; i < slab->base.num_entries; ++i) {
       amdgpu_bo_remove_fences(&slab->entries[i]);
@@ -1347,8 +1392,19 @@ amdgpu_bo_create(struct amdgpu_winsys *ws,
       if (size < alignment && alignment <= 4 * 1024)
          alloc_size = alignment;
 
-      if (alignment > get_slab_entry_alignment(ws, size))
-         goto no_slab; /* can't fulfil alignment requirements */
+      if (alignment > get_slab_entry_alignment(ws, alloc_size)) {
+         /* 3/4 allocations can return too small alignment. Try again with a power of two
+          * allocation size.
+          */
+         unsigned pot_size = get_slab_pot_entry_size(ws, alloc_size);
+
+         if (alignment <= pot_size) {
+            /* This size works but wastes some memory to fulfil the alignment. */
+            alloc_size = pot_size;
+         } else {
+            goto no_slab; /* can't fulfil alignment requirements */
+         }
+      }
 
       struct pb_slabs *slabs = get_slabs(ws, alloc_size, flags);
       entry = pb_slab_alloc(slabs, alloc_size, heap);
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
index d33c141e90f..c9b1cb6a517 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
@@ -115,6 +115,7 @@ struct amdgpu_winsys_bo {
 
 struct amdgpu_slab {
    struct pb_slab base;
+   unsigned entry_size;
    struct amdgpu_winsys_bo *buffer;
    struct amdgpu_winsys_bo *entries;
 };
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index 194f0da1840..693414abd40 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -458,7 +458,7 @@ amdgpu_winsys_create(int fd, const struct pipe_screen_config *config,
 
       if (!pb_slabs_init(&aws->bo_slabs[i],
                          min_order, max_order,
-                         RADEON_MAX_SLAB_HEAPS,
+                         RADEON_MAX_SLAB_HEAPS, true,
                          aws,
                          amdgpu_bo_can_reclaim_slab,
                          amdgpu_bo_slab_alloc_normal,
@@ -471,7 +471,7 @@ amdgpu_winsys_create(int fd, const struct pipe_screen_config *config,
       if (aws->info.has_tmz_support &&
          !pb_slabs_init(&aws->bo_slabs_encrypted[i],
                          min_order, max_order,
-                         RADEON_MAX_SLAB_HEAPS,
+                         RADEON_MAX_SLAB_HEAPS, true,
                          aws,
                          amdgpu_bo_can_reclaim_slab,
                          amdgpu_bo_slab_alloc_encrypted,
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index 8d25ff368d7..842604b8593 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -861,7 +861,7 @@ radeon_drm_winsys_create(int fd, const struct pipe_screen_config *config,
     */
   if (!pb_slabs_init(&ws->bo_slabs,
                      RADEON_SLAB_MIN_SIZE_LOG2, RADEON_SLAB_MAX_SIZE_LOG2,
-                     RADEON_MAX_SLAB_HEAPS,
+                     RADEON_MAX_SLAB_HEAPS, false,
                      ws,
                      radeon_bo_can_reclaim_slab,
                      radeon_bo_slab_alloc,
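
Note (not part of the patch): the 3/4-bucket selection and the new group indexing in pb_slab_alloc() are easier to see outside the diff. The standalone sketch below reproduces only that arithmetic under assumed values; min_order, num_orders, the heap and the example sizes are made up for illustration, and log2_ceil() stands in for Mesa's util_logbase2_ceil().

/* Illustration of the 3/4 entry-size and group-index math from pb_slab_alloc(). */
#include <stdbool.h>
#include <stdio.h>

static unsigned log2_ceil(unsigned v)
{
   unsigned r = 0;
   while ((1u << r) < v)
      r++;
   return r;
}

int main(void)
{
   const unsigned min_order = 8;    /* assumed: smallest entry is 256 bytes */
   const unsigned num_orders = 5;   /* assumed: orders 8..12 (256 B .. 4 KiB) */
   const bool allow_three_fourths = true;
   const unsigned heap = 0;

   const unsigned sizes[] = { 256, 300, 384, 400, 512, 3072 };

   for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
      unsigned size = sizes[i];
      unsigned order = log2_ceil(size);
      if (order < min_order)
         order = min_order;

      unsigned entry_size = 1u << order;
      bool three_fourths = false;

      /* Same test as the patch: use the 3/4-sized bucket when the request fits. */
      if (allow_three_fourths && size <= entry_size * 3 / 4) {
         entry_size = entry_size * 3 / 4;
         three_fourths = true;
      }

      /* Group index now encodes (heap, order, three_fourths). */
      unsigned group_index = (heap * num_orders + (order - min_order)) *
                             (1 + allow_three_fourths) + three_fourths;

      printf("size %4u -> entry_size %4u (3/4: %d), group %u\n",
             size, entry_size, three_fourths, group_index);
   }
   return 0;
}

For the backing buffer, amdgpu_bo_slab_alloc() keeps slab_size = 2 * max_entry_size for power-of-two entries; for 3/4 entries it grows the buffer to util_next_power_of_two(entry_size * 5), because five 3/4-sized entries land exactly on the next power of two (5 * 3/4 = 3.75, rounded up to 4), giving 3.75/4 usable instead of 1.5/2.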