radeonsi: move the GS copy shader into shader variants

This will allow further optimizations for shader variants that change
GS outputs (which in turn affect the copy shader). The main goal is to
share optimizations with NGG instead of keeping a totally separate
codepath for legacy GS.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14266>
This commit is contained in:
Marek Olšák 2021-12-19 21:04:56 -05:00 committed by Marge Bot
parent 1caa94f2a5
commit d4a1766a5a
4 changed files with 111 additions and 34 deletions

View File

@ -1510,6 +1510,15 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi
if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir))
return false;
/* The GS copy shader is compiled next. */
if (sel->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
shader->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug);
if (!shader->gs_copy_shader) {
fprintf(stderr, "radeonsi: can't create GS copy shader\n");
return false;
}
}
/* Compute vs_output_ps_input_cntl. */
if ((sel->info.stage == MESA_SHADER_VERTEX ||
sel->info.stage == MESA_SHADER_TESS_EVAL ||
@ -1518,7 +1527,7 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi
ubyte *vs_output_param_offset = shader->info.vs_output_param_offset;
if (sel->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg)
vs_output_param_offset = sel->gs_copy_shader->info.vs_output_param_offset;
vs_output_param_offset = shader->gs_copy_shader->info.vs_output_param_offset;
/* VS and TES should also set primitive ID output if it's used. */
unsigned num_outputs_with_prim_id = sel->info.num_outputs +
@ -2110,6 +2119,29 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler
case MESA_SHADER_GEOMETRY:
if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
return false;
/* Clone the GS copy shader for the shader variant.
* We can't just copy the pointer because we change the pm4 state and
* si_shader_selector::gs_copy_shader must be immutable because it's shared
* by multiple contexts.
*/
if (!shader->key.ge.as_ngg) {
assert(sel->main_shader_part == mainp);
assert(sel->main_shader_part->gs_copy_shader);
assert(sel->main_shader_part->gs_copy_shader->bo);
assert(!sel->main_shader_part->gs_copy_shader->previous_stage_sel);
assert(!sel->main_shader_part->gs_copy_shader->scratch_bo);
shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
memcpy(shader->gs_copy_shader, sel->main_shader_part->gs_copy_shader,
sizeof(*shader->gs_copy_shader));
/* Increase the reference count. */
pipe_reference(NULL, &shader->gs_copy_shader->bo->b.b.reference);
/* Initialize some fields differently. */
shader->gs_copy_shader->shader_log = NULL;
shader->gs_copy_shader->is_binary_shared = true;
util_queue_fence_init(&shader->gs_copy_shader->ready);
}
break;
case MESA_SHADER_FRAGMENT:
if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))

View File

@ -454,8 +454,6 @@ struct si_shader_selector {
struct si_shader *main_shader_part_ngg; /* as_ngg is set in the key */
struct si_shader *main_shader_part_ngg_es; /* for Wave32 TES before legacy GS */
struct si_shader *gs_copy_shader;
struct nir_shader *nir;
void *nir_binary;
unsigned nir_size;
@ -816,6 +814,7 @@ struct si_shader {
struct si_shader_part *prolog;
struct si_shader *previous_stage; /* for GFX9 */
struct si_shader_part *epilog;
struct si_shader *gs_copy_shader;
struct si_resource *bo;
struct si_resource *scratch_bo;

View File

@ -177,7 +177,7 @@ static bool si_update_shaders(struct si_context *sctx)
return false;
si_pm4_bind_state(sctx, gs, sctx->shader.gs.current);
if (!NGG) {
si_pm4_bind_state(sctx, vs, sctx->shader.gs.cso->gs_copy_shader);
si_pm4_bind_state(sctx, vs, sctx->shader.gs.current->gs_copy_shader);
if (!si_update_gs_ring_buffers(sctx))
return false;
@ -241,7 +241,7 @@ static bool si_update_shaders(struct si_context *sctx)
} else if (GFX_VERSION >= GFX10) {
if (HAS_GS) {
key.u.gs_wave32 = sctx->shader.gs.current->wave_size == 32;
key.u.vs_wave32 = sctx->shader.gs.cso->gs_copy_shader->wave_size == 32;
key.u.vs_wave32 = sctx->shader.gs.current->gs_copy_shader->wave_size == 32;
} else {
key.u.vs_wave32 = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->wave_size == 32;
}

View File

@ -260,7 +260,7 @@ static uint32_t *read_chunk(uint32_t *ptr, void **data, unsigned *size)
* Return the shader binary in a buffer. The first 4 bytes contain its size
* as integer.
*/
static void *si_get_shader_binary(struct si_shader *shader)
static uint32_t *si_get_shader_binary(struct si_shader *shader)
{
/* There is always a size of data followed by the data itself. */
unsigned llvm_ir_size =
@ -275,8 +275,8 @@ static void *si_get_shader_binary(struct si_shader *shader)
4 + /* CRC32 of the data below */
align(sizeof(shader->config), 4) + align(sizeof(shader->info), 4) + 4 +
align(shader->binary.elf_size, 4) + 4 + align(llvm_ir_size, 4);
void *buffer = CALLOC(1, size);
uint32_t *ptr = (uint32_t *)buffer;
uint32_t *buffer = (uint32_t*)CALLOC(1, size);
uint32_t *ptr = buffer;
if (!buffer)
return NULL;
@ -291,7 +291,7 @@ static void *si_get_shader_binary(struct si_shader *shader)
assert((char *)ptr - (char *)buffer == (ptrdiff_t)size);
/* Compute CRC32. */
ptr = (uint32_t *)buffer;
ptr = buffer;
ptr++;
*ptr = util_hash_crc32(ptr + 1, size - 8);
@ -317,6 +317,29 @@ static bool si_load_shader_binary(struct si_shader *shader, void *binary)
shader->binary.elf_size = elf_size;
ptr = read_chunk(ptr, (void **)&shader->binary.llvm_ir_string, &chunk_size);
if (!shader->is_gs_copy_shader &&
shader->selector->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
if (!shader->gs_copy_shader)
return false;
shader->gs_copy_shader->is_gs_copy_shader = true;
if (!si_load_shader_binary(shader->gs_copy_shader, (uint8_t*)binary + size)) {
FREE(shader->gs_copy_shader);
shader->gs_copy_shader = NULL;
return false;
}
util_queue_fence_init(&shader->gs_copy_shader->ready);
shader->gs_copy_shader->selector = shader->selector;
shader->gs_copy_shader->is_gs_copy_shader = true;
shader->gs_copy_shader->wave_size =
si_determine_wave_size(shader->selector->screen, shader->gs_copy_shader);
si_shader_binary_upload(shader->selector->screen, shader->gs_copy_shader, 0);
}
return true;
}
@ -327,7 +350,7 @@ static bool si_load_shader_binary(struct si_shader *shader, void *binary)
void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20],
struct si_shader *shader, bool insert_into_disk_cache)
{
void *hw_binary;
uint32_t *hw_binary;
struct hash_entry *entry;
uint8_t key[CACHE_KEY_SIZE];
bool memory_cache_full = sscreen->shader_cache_size >= sscreen->shader_cache_max_size;
@ -343,6 +366,31 @@ void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_s
if (!hw_binary)
return;
unsigned size = *hw_binary;
if (shader->selector->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
uint32_t *gs_copy_binary = si_get_shader_binary(shader->gs_copy_shader);
if (!gs_copy_binary) {
FREE(hw_binary);
return;
}
/* Combine both binaries. */
size += *gs_copy_binary;
uint32_t *combined_binary = (uint32_t*)MALLOC(size);
if (!combined_binary) {
FREE(hw_binary);
FREE(gs_copy_binary);
return;
}
memcpy(combined_binary, hw_binary, *hw_binary);
memcpy(combined_binary + *hw_binary / 4, gs_copy_binary, *gs_copy_binary);
FREE(hw_binary);
FREE(gs_copy_binary);
hw_binary = combined_binary;
}
if (!memory_cache_full) {
if (_mesa_hash_table_insert(sscreen->shader_cache,
mem_dup(ir_sha1_cache_key, 20),
@ -350,13 +398,13 @@ void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_s
FREE(hw_binary);
return;
}
/* The size is stored at the start of the binary */
sscreen->shader_cache_size += *(uint32_t*)hw_binary;
sscreen->shader_cache_size += size;
}
if (sscreen->disk_shader_cache && insert_into_disk_cache) {
disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key, 20, key);
disk_cache_put(sscreen->disk_shader_cache, key, hw_binary, *((uint32_t *)hw_binary), NULL);
disk_cache_put(sscreen->disk_shader_cache, key, hw_binary, size, NULL);
}
if (memory_cache_full)
@ -382,10 +430,17 @@ bool si_shader_cache_load_shader(struct si_screen *sscreen, unsigned char ir_sha
unsigned char sha1[CACHE_KEY_SIZE];
disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key, 20, sha1);
size_t binary_size;
uint8_t *buffer = (uint8_t*)disk_cache_get(sscreen->disk_shader_cache, sha1, &binary_size);
size_t total_size;
uint32_t *buffer = (uint32_t*)disk_cache_get(sscreen->disk_shader_cache, sha1, &total_size);
if (buffer) {
if (binary_size >= sizeof(uint32_t) && *((uint32_t *)buffer) == binary_size) {
unsigned size = *buffer;
unsigned gs_copy_binary_size = 0;
/* The GS copy shader binary is after the GS binary. */
if (shader->selector->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg)
gs_copy_binary_size = buffer[size / 4];
if (total_size >= sizeof(uint32_t) && size + gs_copy_binary_size == total_size) {
if (si_load_shader_binary(shader, buffer)) {
free(buffer);
si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, shader, false);
@ -997,7 +1052,7 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0);
/* Copy over fields from the GS copy shader to make them easily accessible from GS. */
shader->pa_cl_vs_out_cntl = sel->gs_copy_shader->pa_cl_vs_out_cntl;
shader->pa_cl_vs_out_cntl = shader->gs_copy_shader->pa_cl_vs_out_cntl;
va = shader->bo->gpu_address;
@ -1906,10 +1961,13 @@ static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader
si_shader_vs(sscreen, shader, NULL);
break;
case MESA_SHADER_GEOMETRY:
if (shader->key.ge.as_ngg)
if (shader->key.ge.as_ngg) {
gfx10_shader_ngg(sscreen, shader);
else
} else {
/* VS must be initialized first because GS uses its fields. */
si_shader_vs(sscreen, shader->gs_copy_shader, shader->selector);
si_shader_gs(sscreen, shader);
}
break;
case MESA_SHADER_FRAGMENT:
si_shader_ps(sscreen, shader);
@ -2790,19 +2848,6 @@ static void si_init_shader_selector_async(void *job, void *gdata, int thread_ind
if (!compiler->passes)
si_init_compiler(sscreen, compiler);
/* The GS copy shader is always pre-compiled. */
if (sel->info.stage == MESA_SHADER_GEOMETRY &&
(!sscreen->use_ngg || !sscreen->use_ngg_streamout || /* also for PRIMITIVES_GENERATED */
sel->tess_turns_off_ngg)) {
sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug);
if (!sel->gs_copy_shader) {
fprintf(stderr, "radeonsi: can't create GS copy shader\n");
return;
}
si_shader_vs(sscreen, sel->gs_copy_shader, sel);
}
/* Serialize NIR to save memory. Monolithic shader variants
* have to deserialize NIR before compilation.
*/
@ -3664,6 +3709,9 @@ static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
default:;
}
if (shader->gs_copy_shader)
si_delete_shader(sctx, shader->gs_copy_shader);
si_shader_selector_reference(sctx, &shader->previous_stage_sel, NULL);
si_shader_destroy(shader);
si_pm4_free_state(sctx, &shader->pm4, state_index);
@ -3697,8 +3745,6 @@ static void si_destroy_shader_selector(struct pipe_context *ctx, void *cso)
si_delete_shader(sctx, sel->main_shader_part_es);
if (sel->main_shader_part_ngg)
si_delete_shader(sctx, sel->main_shader_part_ngg);
if (sel->gs_copy_shader)
si_delete_shader(sctx, sel->gs_copy_shader);
util_queue_fence_destroy(&sel->ready);
simple_mtx_destroy(&sel->mutex);