From d4a1766a5a932a0fc69c4f9e1eaf1923966dcbe5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 19 Dec 2021 21:04:56 -0500 Subject: [PATCH] radeonsi: move the GS copy shader into shader variants This will allow further optimizations for shader variants that change GS outputs (affecting the copy shader), and this is mainly about sharing optimizations with NGG instead of having a totally separate codepath for legacy GS. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_shader.c | 34 +++++- src/gallium/drivers/radeonsi/si_shader.h | 3 +- .../drivers/radeonsi/si_state_draw.cpp | 4 +- .../drivers/radeonsi/si_state_shaders.cpp | 104 +++++++++++++----- 4 files changed, 111 insertions(+), 34 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 9db4c318735..c96e8c33f1a 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1510,6 +1510,15 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir)) return false; + /* The GS copy shader is compiled next. */ + if (sel->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) { + shader->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug); + if (!shader->gs_copy_shader) { + fprintf(stderr, "radeonsi: can't create GS copy shader\n"); + return false; + } + } + /* Compute vs_output_ps_input_cntl. */ if ((sel->info.stage == MESA_SHADER_VERTEX || sel->info.stage == MESA_SHADER_TESS_EVAL || @@ -1518,7 +1527,7 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi ubyte *vs_output_param_offset = shader->info.vs_output_param_offset; if (sel->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) - vs_output_param_offset = sel->gs_copy_shader->info.vs_output_param_offset; + vs_output_param_offset = shader->gs_copy_shader->info.vs_output_param_offset; /* VS and TES should also set primitive ID output if it's used. */ unsigned num_outputs_with_prim_id = sel->info.num_outputs + @@ -2110,6 +2119,29 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler case MESA_SHADER_GEOMETRY: if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug)) return false; + + /* Clone the GS copy shader for the shader variant. + * We can't just copy the pointer because we change the pm4 state and + * si_shader_selector::gs_copy_shader must be immutable because it's shared + * by multiple contexts. + */ + if (!shader->key.ge.as_ngg) { + assert(sel->main_shader_part == mainp); + assert(sel->main_shader_part->gs_copy_shader); + assert(sel->main_shader_part->gs_copy_shader->bo); + assert(!sel->main_shader_part->gs_copy_shader->previous_stage_sel); + assert(!sel->main_shader_part->gs_copy_shader->scratch_bo); + + shader->gs_copy_shader = CALLOC_STRUCT(si_shader); + memcpy(shader->gs_copy_shader, sel->main_shader_part->gs_copy_shader, + sizeof(*shader->gs_copy_shader)); + /* Increase the reference count. */ + pipe_reference(NULL, &shader->gs_copy_shader->bo->b.b.reference); + /* Initialize some fields differently. */ + shader->gs_copy_shader->shader_log = NULL; + shader->gs_copy_shader->is_binary_shared = true; + util_queue_fence_init(&shader->gs_copy_shader->ready); + } break; case MESA_SHADER_FRAGMENT: if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug)) diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 292edf3af7f..b08f59e44d6 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -454,8 +454,6 @@ struct si_shader_selector { struct si_shader *main_shader_part_ngg; /* as_ngg is set in the key */ struct si_shader *main_shader_part_ngg_es; /* for Wave32 TES before legacy GS */ - struct si_shader *gs_copy_shader; - struct nir_shader *nir; void *nir_binary; unsigned nir_size; @@ -816,6 +814,7 @@ struct si_shader { struct si_shader_part *prolog; struct si_shader *previous_stage; /* for GFX9 */ struct si_shader_part *epilog; + struct si_shader *gs_copy_shader; struct si_resource *bo; struct si_resource *scratch_bo; diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 36d8a10d2b1..e995ca49d86 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -177,7 +177,7 @@ static bool si_update_shaders(struct si_context *sctx) return false; si_pm4_bind_state(sctx, gs, sctx->shader.gs.current); if (!NGG) { - si_pm4_bind_state(sctx, vs, sctx->shader.gs.cso->gs_copy_shader); + si_pm4_bind_state(sctx, vs, sctx->shader.gs.current->gs_copy_shader); if (!si_update_gs_ring_buffers(sctx)) return false; @@ -241,7 +241,7 @@ static bool si_update_shaders(struct si_context *sctx) } else if (GFX_VERSION >= GFX10) { if (HAS_GS) { key.u.gs_wave32 = sctx->shader.gs.current->wave_size == 32; - key.u.vs_wave32 = sctx->shader.gs.cso->gs_copy_shader->wave_size == 32; + key.u.vs_wave32 = sctx->shader.gs.current->gs_copy_shader->wave_size == 32; } else { key.u.vs_wave32 = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->wave_size == 32; } diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 9593870ffc7..ede6ea22501 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -260,7 +260,7 @@ static uint32_t *read_chunk(uint32_t *ptr, void **data, unsigned *size) * Return the shader binary in a buffer. The first 4 bytes contain its size * as integer. */ -static void *si_get_shader_binary(struct si_shader *shader) +static uint32_t *si_get_shader_binary(struct si_shader *shader) { /* There is always a size of data followed by the data itself. */ unsigned llvm_ir_size = @@ -275,8 +275,8 @@ static void *si_get_shader_binary(struct si_shader *shader) 4 + /* CRC32 of the data below */ align(sizeof(shader->config), 4) + align(sizeof(shader->info), 4) + 4 + align(shader->binary.elf_size, 4) + 4 + align(llvm_ir_size, 4); - void *buffer = CALLOC(1, size); - uint32_t *ptr = (uint32_t *)buffer; + uint32_t *buffer = (uint32_t*)CALLOC(1, size); + uint32_t *ptr = buffer; if (!buffer) return NULL; @@ -291,7 +291,7 @@ static void *si_get_shader_binary(struct si_shader *shader) assert((char *)ptr - (char *)buffer == (ptrdiff_t)size); /* Compute CRC32. */ - ptr = (uint32_t *)buffer; + ptr = buffer; ptr++; *ptr = util_hash_crc32(ptr + 1, size - 8); @@ -317,6 +317,29 @@ static bool si_load_shader_binary(struct si_shader *shader, void *binary) shader->binary.elf_size = elf_size; ptr = read_chunk(ptr, (void **)&shader->binary.llvm_ir_string, &chunk_size); + if (!shader->is_gs_copy_shader && + shader->selector->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) { + shader->gs_copy_shader = CALLOC_STRUCT(si_shader); + if (!shader->gs_copy_shader) + return false; + + shader->gs_copy_shader->is_gs_copy_shader = true; + + if (!si_load_shader_binary(shader->gs_copy_shader, (uint8_t*)binary + size)) { + FREE(shader->gs_copy_shader); + shader->gs_copy_shader = NULL; + return false; + } + + util_queue_fence_init(&shader->gs_copy_shader->ready); + shader->gs_copy_shader->selector = shader->selector; + shader->gs_copy_shader->is_gs_copy_shader = true; + shader->gs_copy_shader->wave_size = + si_determine_wave_size(shader->selector->screen, shader->gs_copy_shader); + + si_shader_binary_upload(shader->selector->screen, shader->gs_copy_shader, 0); + } + return true; } @@ -327,7 +350,7 @@ static bool si_load_shader_binary(struct si_shader *shader, void *binary) void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20], struct si_shader *shader, bool insert_into_disk_cache) { - void *hw_binary; + uint32_t *hw_binary; struct hash_entry *entry; uint8_t key[CACHE_KEY_SIZE]; bool memory_cache_full = sscreen->shader_cache_size >= sscreen->shader_cache_max_size; @@ -343,6 +366,31 @@ void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_s if (!hw_binary) return; + unsigned size = *hw_binary; + + if (shader->selector->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) { + uint32_t *gs_copy_binary = si_get_shader_binary(shader->gs_copy_shader); + if (!gs_copy_binary) { + FREE(hw_binary); + return; + } + + /* Combine both binaries. */ + size += *gs_copy_binary; + uint32_t *combined_binary = (uint32_t*)MALLOC(size); + if (!combined_binary) { + FREE(hw_binary); + FREE(gs_copy_binary); + return; + } + + memcpy(combined_binary, hw_binary, *hw_binary); + memcpy(combined_binary + *hw_binary / 4, gs_copy_binary, *gs_copy_binary); + FREE(hw_binary); + FREE(gs_copy_binary); + hw_binary = combined_binary; + } + if (!memory_cache_full) { if (_mesa_hash_table_insert(sscreen->shader_cache, mem_dup(ir_sha1_cache_key, 20), @@ -350,13 +398,13 @@ void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_s FREE(hw_binary); return; } - /* The size is stored at the start of the binary */ - sscreen->shader_cache_size += *(uint32_t*)hw_binary; + + sscreen->shader_cache_size += size; } if (sscreen->disk_shader_cache && insert_into_disk_cache) { disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key, 20, key); - disk_cache_put(sscreen->disk_shader_cache, key, hw_binary, *((uint32_t *)hw_binary), NULL); + disk_cache_put(sscreen->disk_shader_cache, key, hw_binary, size, NULL); } if (memory_cache_full) @@ -382,10 +430,17 @@ bool si_shader_cache_load_shader(struct si_screen *sscreen, unsigned char ir_sha unsigned char sha1[CACHE_KEY_SIZE]; disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key, 20, sha1); - size_t binary_size; - uint8_t *buffer = (uint8_t*)disk_cache_get(sscreen->disk_shader_cache, sha1, &binary_size); + size_t total_size; + uint32_t *buffer = (uint32_t*)disk_cache_get(sscreen->disk_shader_cache, sha1, &total_size); if (buffer) { - if (binary_size >= sizeof(uint32_t) && *((uint32_t *)buffer) == binary_size) { + unsigned size = *buffer; + unsigned gs_copy_binary_size = 0; + + /* The GS copy shader binary is after the GS binary. */ + if (shader->selector->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) + gs_copy_binary_size = buffer[size / 4]; + + if (total_size >= sizeof(uint32_t) && size + gs_copy_binary_size == total_size) { if (si_load_shader_binary(shader, buffer)) { free(buffer); si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, shader, false); @@ -997,7 +1052,7 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0); /* Copy over fields from the GS copy shader to make them easily accessible from GS. */ - shader->pa_cl_vs_out_cntl = sel->gs_copy_shader->pa_cl_vs_out_cntl; + shader->pa_cl_vs_out_cntl = shader->gs_copy_shader->pa_cl_vs_out_cntl; va = shader->bo->gpu_address; @@ -1906,10 +1961,13 @@ static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader si_shader_vs(sscreen, shader, NULL); break; case MESA_SHADER_GEOMETRY: - if (shader->key.ge.as_ngg) + if (shader->key.ge.as_ngg) { gfx10_shader_ngg(sscreen, shader); - else + } else { + /* VS must be initialized first because GS uses its fields. */ + si_shader_vs(sscreen, shader->gs_copy_shader, shader->selector); si_shader_gs(sscreen, shader); + } break; case MESA_SHADER_FRAGMENT: si_shader_ps(sscreen, shader); @@ -2790,19 +2848,6 @@ static void si_init_shader_selector_async(void *job, void *gdata, int thread_ind if (!compiler->passes) si_init_compiler(sscreen, compiler); - /* The GS copy shader is always pre-compiled. */ - if (sel->info.stage == MESA_SHADER_GEOMETRY && - (!sscreen->use_ngg || !sscreen->use_ngg_streamout || /* also for PRIMITIVES_GENERATED */ - sel->tess_turns_off_ngg)) { - sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug); - if (!sel->gs_copy_shader) { - fprintf(stderr, "radeonsi: can't create GS copy shader\n"); - return; - } - - si_shader_vs(sscreen, sel->gs_copy_shader, sel); - } - /* Serialize NIR to save memory. Monolithic shader variants * have to deserialize NIR before compilation. */ @@ -3664,6 +3709,9 @@ static void si_delete_shader(struct si_context *sctx, struct si_shader *shader) default:; } + if (shader->gs_copy_shader) + si_delete_shader(sctx, shader->gs_copy_shader); + si_shader_selector_reference(sctx, &shader->previous_stage_sel, NULL); si_shader_destroy(shader); si_pm4_free_state(sctx, &shader->pm4, state_index); @@ -3697,8 +3745,6 @@ static void si_destroy_shader_selector(struct pipe_context *ctx, void *cso) si_delete_shader(sctx, sel->main_shader_part_es); if (sel->main_shader_part_ngg) si_delete_shader(sctx, sel->main_shader_part_ngg); - if (sel->gs_copy_shader) - si_delete_shader(sctx, sel->gs_copy_shader); util_queue_fence_destroy(&sel->ready); simple_mtx_destroy(&sel->mutex);