diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 5086a33969a..61eb2f10be2 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -95,10 +95,13 @@ static uint32_t null_image_descriptor[8] = { * descriptor */ }; -static void si_init_descriptors(struct si_descriptors *desc, +static void si_init_descriptors(struct si_context *sctx, + struct si_descriptors *desc, unsigned shader_userdata_index, unsigned element_dw_size, unsigned num_elements, + unsigned first_ce_slot, + unsigned num_ce_slots, unsigned *ce_offset) { assert(num_elements <= sizeof(desc->dirty_mask)*8); @@ -106,14 +109,16 @@ static void si_init_descriptors(struct si_descriptors *desc, desc->list = CALLOC(num_elements, element_dw_size * 4); desc->element_dw_size = element_dw_size; desc->num_elements = num_elements; + desc->first_ce_slot = sctx->ce_ib ? first_ce_slot : 0; + desc->num_ce_slots = sctx->ce_ib ? num_ce_slots : 0; desc->dirty_mask = u_bit_consecutive64(0, num_elements); desc->shader_userdata_offset = shader_userdata_index * 4; - if (ce_offset) { + if (desc->num_ce_slots) { desc->uses_ce = true; desc->ce_offset = *ce_offset; - *ce_offset += element_dw_size * num_elements * 4; + *ce_offset += element_dw_size * desc->num_ce_slots * 4; } } @@ -205,13 +210,16 @@ static bool si_upload_descriptors(struct si_context *sctx, if (!upload_size) return true; - if (sctx->ce_ib && desc->uses_ce) { - uint32_t const* list = (uint32_t const*)desc->list; + if (desc->uses_ce) { + const uint32_t *list = desc->list + + desc->first_ce_slot * desc->element_dw_size; + uint64_t mask = (desc->dirty_mask >> desc->first_ce_slot) & + u_bit_consecutive64(0, desc->num_ce_slots); - while(desc->dirty_mask) { + + while (mask) { int begin, count; - u_bit_scan_consecutive_range64(&desc->dirty_mask, &begin, - &count); + u_bit_scan_consecutive_range64(&mask, &begin, &count); begin *= desc->element_dw_size; count *= desc->element_dw_size; @@ -222,7 +230,9 @@ static bool si_upload_descriptors(struct si_context *sctx, radeon_emit_array(sctx->ce_ib, list + begin, count); } - if (!si_ce_upload(sctx, desc->ce_offset + first_slot_offset, + if (!si_ce_upload(sctx, + desc->ce_offset + + (first_slot_offset - desc->first_ce_slot * slot_size), upload_size, (unsigned*)&desc->buffer_offset, &desc->buffer)) return false; @@ -920,9 +930,12 @@ static void si_bind_sampler_states(struct pipe_context *ctx, /* BUFFER RESOURCES */ -static void si_init_buffer_resources(struct si_buffer_resources *buffers, +static void si_init_buffer_resources(struct si_context *sctx, + struct si_buffer_resources *buffers, struct si_descriptors *descs, unsigned num_buffers, + unsigned first_ce_slot, + unsigned num_ce_slots, unsigned shader_userdata_index, enum radeon_bo_usage shader_usage, enum radeon_bo_usage shader_usage_constbuf, @@ -936,8 +949,8 @@ static void si_init_buffer_resources(struct si_buffer_resources *buffers, buffers->priority_constbuf = priority_constbuf; buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*)); - si_init_descriptors(descs, shader_userdata_index, 4, - num_buffers, ce_offset); + si_init_descriptors(sctx, descs, shader_userdata_index, 4, num_buffers, + first_ce_slot, num_ce_slots, ce_offset); } static void si_release_buffer_resources(struct si_buffer_resources *buffers, @@ -1994,6 +2007,48 @@ void si_emit_compute_shader_userdata(struct si_context *sctx) /* INIT/DEINIT/UPLOAD */ +/* GFX9 has only 4KB of CE, while previous chips had 
32KB. In order
+ * to make CE RAM as useful as possible, this defines limits
+ * for the number of slots that can be in CE RAM on GFX9. If a shader
+ * is using more, descriptors will be uploaded to memory directly and
+ * CE won't be used.
+ *
+ * These numbers are based on shader-db.
+ */
+static unsigned gfx9_max_ce_samplers[SI_NUM_SHADERS] = {
+	[PIPE_SHADER_VERTEX] = 0,
+	[PIPE_SHADER_TESS_CTRL] = 0,
+	[PIPE_SHADER_TESS_EVAL] = 1,
+	[PIPE_SHADER_GEOMETRY] = 0,
+	[PIPE_SHADER_FRAGMENT] = 24,
+	[PIPE_SHADER_COMPUTE] = 16,
+};
+static unsigned gfx9_max_ce_images[SI_NUM_SHADERS] = {
+	/* these must be even due to slot alignment */
+	[PIPE_SHADER_VERTEX] = 0,
+	[PIPE_SHADER_TESS_CTRL] = 0,
+	[PIPE_SHADER_TESS_EVAL] = 0,
+	[PIPE_SHADER_GEOMETRY] = 0,
+	[PIPE_SHADER_FRAGMENT] = 2,
+	[PIPE_SHADER_COMPUTE] = 8,
+};
+static unsigned gfx9_max_ce_const_buffers[SI_NUM_SHADERS] = {
+	[PIPE_SHADER_VERTEX] = 9,
+	[PIPE_SHADER_TESS_CTRL] = 3,
+	[PIPE_SHADER_TESS_EVAL] = 5,
+	[PIPE_SHADER_GEOMETRY] = 0,
+	[PIPE_SHADER_FRAGMENT] = 8,
+	[PIPE_SHADER_COMPUTE] = 6,
+};
+static unsigned gfx9_max_ce_shader_buffers[SI_NUM_SHADERS] = {
+	[PIPE_SHADER_VERTEX] = 0,
+	[PIPE_SHADER_TESS_CTRL] = 0,
+	[PIPE_SHADER_TESS_EVAL] = 0,
+	[PIPE_SHADER_GEOMETRY] = 0,
+	[PIPE_SHADER_FRAGMENT] = 12,
+	[PIPE_SHADER_COMPUTE] = 13,
+};
+
 void si_init_all_descriptors(struct si_context *sctx)
 {
 	int i;
@@ -2003,23 +2058,37 @@ void si_init_all_descriptors(struct si_context *sctx)
 	STATIC_ASSERT(GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS % 2 == 0);
 
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
-		bool gfx9_tcs = sctx->b.chip_class == GFX9 &&
-				i == PIPE_SHADER_TESS_CTRL;
-		bool gfx9_gs = sctx->b.chip_class == GFX9 &&
-			       i == PIPE_SHADER_GEOMETRY;
-		/* GFX9 has only 4KB of CE, while previous chips had 32KB.
-		 * Rarely used descriptors don't use CE RAM.
-		 */
-		bool big_ce = sctx->b.chip_class <= VI;
-		bool const_and_shaderbufs_use_ce = big_ce ||
-						   i == PIPE_SHADER_VERTEX ||
-						   i == PIPE_SHADER_FRAGMENT;
-		bool samplers_and_images_use_ce = big_ce ||
-						  i == PIPE_SHADER_FRAGMENT;
+		bool gfx9_tcs = false;
+		bool gfx9_gs = false;
+		unsigned num_sampler_slots = SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS;
+		unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS;
 
-		si_init_buffer_resources(&sctx->const_and_shader_buffers[i],
+		unsigned first_sampler_ce_slot = 0;
+		unsigned num_sampler_ce_slots = num_sampler_slots;
+
+		unsigned first_buffer_ce_slot = 0;
+		unsigned num_buffer_ce_slots = num_buffer_slots;
+
+		/* Adjust CE slot ranges based on GFX9 CE RAM limits. */
+		if (sctx->b.chip_class >= GFX9) {
+			gfx9_tcs = i == PIPE_SHADER_TESS_CTRL;
+			gfx9_gs = i == PIPE_SHADER_GEOMETRY;
+
+			first_sampler_ce_slot =
+				si_get_image_slot(gfx9_max_ce_images[i] - 1) / 2;
+			num_sampler_ce_slots = gfx9_max_ce_images[i] / 2 +
+					       gfx9_max_ce_samplers[i];
+
+			first_buffer_ce_slot =
+				si_get_shaderbuf_slot(gfx9_max_ce_shader_buffers[i] - 1);
+			num_buffer_ce_slots = gfx9_max_ce_shader_buffers[i] +
+					      gfx9_max_ce_const_buffers[i];
+		}
+
+		si_init_buffer_resources(sctx, &sctx->const_and_shader_buffers[i],
 					 si_const_and_shader_buffer_descriptors(sctx, i),
-					 SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS,
+					 num_buffer_slots,
+					 first_buffer_ce_slot, num_buffer_ce_slots,
 					 gfx9_tcs ? GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS :
 					 gfx9_gs ? GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS :
 						   SI_SGPR_CONST_AND_SHADER_BUFFERS,
@@ -2027,15 +2096,16 @@ void si_init_all_descriptors(struct si_context *sctx)
 					 RADEON_USAGE_READ,
 					 RADEON_PRIO_SHADER_RW_BUFFER,
 					 RADEON_PRIO_CONST_BUFFER,
-					 const_and_shaderbufs_use_ce ? 
&ce_offset : NULL); + &ce_offset); struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, i); - si_init_descriptors(desc, + si_init_descriptors(sctx, desc, gfx9_tcs ? GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES : gfx9_gs ? GFX9_SGPR_GS_SAMPLERS_AND_IMAGES : SI_SGPR_SAMPLERS_AND_IMAGES, - 16, SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS, - samplers_and_images_use_ce ? &ce_offset : NULL); + 16, num_sampler_slots, + first_sampler_ce_slot, num_sampler_ce_slots, + &ce_offset); int j; for (j = 0; j < SI_NUM_IMAGES; j++) @@ -2044,9 +2114,10 @@ void si_init_all_descriptors(struct si_context *sctx) memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4); } - si_init_buffer_resources(&sctx->rw_buffers, + si_init_buffer_resources(sctx, &sctx->rw_buffers, &sctx->descriptors[SI_DESCS_RW_BUFFERS], - SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS, + SI_NUM_RW_BUFFERS, 0, SI_NUM_RW_BUFFERS, + SI_SGPR_RW_BUFFERS, /* The second set of usage/priority is used by * const buffers in RW buffer slots. */ RADEON_USAGE_READWRITE, RADEON_USAGE_READ, @@ -2054,8 +2125,8 @@ void si_init_all_descriptors(struct si_context *sctx) &ce_offset); sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS; - si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS, - 4, SI_NUM_VERTEX_BUFFERS, NULL); + si_init_descriptors(sctx, &sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS, + 4, SI_NUM_VERTEX_BUFFERS, 0, 0, NULL); sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS); sctx->total_ce_ram_allocated = ce_offset; @@ -2197,6 +2268,26 @@ void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx, first + count > desc->first_active_slot + desc->num_active_slots) sctx->descriptors_dirty |= 1u << desc_idx; + /* Enable or disable CE for this descriptor array. */ + bool used_ce = desc->uses_ce; + desc->uses_ce = desc->first_ce_slot <= first && + desc->first_ce_slot + desc->num_ce_slots >= first + count; + + if (desc->uses_ce != used_ce) { + /* Upload or dump descriptors if we're disabling or enabling CE, + * respectively. */ + sctx->descriptors_dirty |= 1u << desc_idx; + + /* If we're enabling CE, re-upload all descriptors to CE RAM. + * When CE was disabled, uploads to CE RAM stopped. + */ + if (desc->uses_ce) { + desc->dirty_mask |= + u_bit_consecutive64(desc->first_ce_slot, + desc->num_ce_slots); + } + } + desc->first_active_slot = first; desc->num_active_slots = count; } diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index dfabaa35566..275f830613c 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -228,6 +228,13 @@ struct si_descriptors { /* Offset in CE RAM */ unsigned ce_offset; + /* Slots allocated in CE RAM. If we get active slots outside of this + * range, direct uploads to memory will be used instead. This basically + * governs switching between onchip (CE) and offchip (upload) modes. + */ + unsigned first_ce_slot; + unsigned num_ce_slots; + /* Slots that are used by currently-bound shaders. * With CE: It determines which slots are dumped to L2. * It doesn't skip uploads to CE RAM.
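
Note for reviewers: the two mechanisms this patch introduces (windowing the
dirty mask to the CE slot range in si_upload_descriptors(), and the
containment test that toggles CE in si_set_active_descriptors()) can be
exercised in isolation. The sketch below is a standalone C program, not
driver code: the two u_bit_* helpers are re-implementations of Mesa's
util/u_math.h utilities so the snippet compiles on its own (they rely on
GCC/Clang's __builtin_ctzll), and all slot numbers are made up.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for u_bit_consecutive64() from util/u_math.h:
 * returns `count` consecutive set bits starting at bit `start`. */
static uint64_t u_bit_consecutive64(unsigned start, unsigned count)
{
	if (count == 64)
		return ~(uint64_t)0;
	return (((uint64_t)1 << count) - 1) << start;
}

/* Stand-in for u_bit_scan_consecutive_range64(): pops the lowest run of
 * consecutive set bits out of *mask and returns its position and length.
 * The caller must guarantee *mask != 0. */
static void u_bit_scan_consecutive_range64(uint64_t *mask, int *start, int *count)
{
	*start = __builtin_ctzll(*mask);
	uint64_t shifted = *mask >> *start;
	*count = ~shifted == 0 ? 64 - *start : __builtin_ctzll(~shifted);
	*mask &= ~u_bit_consecutive64(*start, *count);
}

int main(void)
{
	/* Hypothetical descriptor array whose CE window covers slots [8, 24). */
	unsigned first_ce_slot = 8, num_ce_slots = 16;

	/* Dirty slots 3, 10-12 and 40; only 10-12 are inside the window. */
	uint64_t dirty_mask = (1ull << 3) | (0x7ull << 10) | (1ull << 40);

	/* The windowing from si_upload_descriptors(): shift the dirty mask
	 * so that bit 0 corresponds to the first CE slot, then mask off
	 * everything past the CE window. In the driver, begin/count are
	 * further scaled by element_dw_size before being emitted. */
	uint64_t mask = (dirty_mask >> first_ce_slot) &
			u_bit_consecutive64(0, num_ce_slots);

	while (mask) {
		int begin, count;
		u_bit_scan_consecutive_range64(&mask, &begin, &count);
		printf("write CE RAM slots [%u, %u)\n",
		       first_ce_slot + begin, first_ce_slot + begin + count);
	}

	/* The containment test from si_set_active_descriptors(): CE stays
	 * enabled only while all active slots lie inside the CE window. */
	unsigned first_active = 10, num_active = 4;
	int uses_ce = first_ce_slot <= first_active &&
		      first_ce_slot + num_ce_slots >= first_active + num_active;
	printf("uses_ce = %d\n", uses_ce);
	return 0;
}

With the values above, only the dirty run at slots 10-12 falls inside the CE
window, so a single range write is emitted ("write CE RAM slots [10, 13)"),
and the active range [10, 14) is contained in the window, so CE remains
enabled. Dirty bits outside the window (slots 3 and 40) are simply masked
off here; in the driver, binding active slots outside the window flips the
descriptor array to the direct-upload path instead.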