diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 1c4d6b3683b..8edf4ad7959 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -68,7 +68,7 @@ static void init_scratch_buffer(struct si_context *sctx, struct si_compute *prog unsigned scratch_bytes_needed; si_shader_binary_read_config(&program->shader, offset); - scratch_bytes_needed = program->shader.scratch_bytes_per_wave; + scratch_bytes_needed = program->shader.config.scratch_bytes_per_wave; scratch_bytes = MAX2(scratch_bytes, scratch_bytes_needed); } @@ -86,7 +86,7 @@ static void init_scratch_buffer(struct si_context *sctx, struct si_compute *prog * to the maximum bytes needed, so it can compute the stride * correctly. */ - program->shader.scratch_bytes_per_wave = scratch_bytes; + program->shader.config.scratch_bytes_per_wave = scratch_bytes; /* Patch the shader with the scratch buffer address. */ si_shader_apply_scratch_relocs(sctx, @@ -281,12 +281,12 @@ static void si_launch_grid( memcpy(kernel_args + (num_work_size_bytes / 4), input, program->input_size); - if (shader->scratch_bytes_per_wave > 0) { + if (shader->config.scratch_bytes_per_wave > 0) { COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u bytes; " "Total Scratch: %u bytes\n", num_waves_for_scratch, - shader->scratch_bytes_per_wave, - shader->scratch_bytes_per_wave * + shader->config.scratch_bytes_per_wave, + shader->config.scratch_bytes_per_wave * num_waves_for_scratch); radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, @@ -313,7 +313,7 @@ static void si_launch_grid( si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 8, scratch_buffer_va); si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0 + 12, S_008F04_BASE_ADDRESS_HI(scratch_buffer_va >> 32) - | S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64)); + | S_008F04_STRIDE(shader->config.scratch_bytes_per_wave / 64)); si_pm4_set_reg(pm4, R_00B810_COMPUTE_START_X, 0); si_pm4_set_reg(pm4, R_00B814_COMPUTE_START_Y, 0); @@ -361,9 +361,9 @@ static void si_launch_grid( si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, shader_va >> 8); si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40); - si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, shader->rsrc1); + si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, shader->config.rsrc1); - lds_blocks = shader->lds_size; + lds_blocks = shader->config.lds_size; /* XXX: We are over allocating LDS. For SI, the shader reports LDS in * blocks of 256 bytes, so if there are 4 bytes lds allocated in * the shader and 4 bytes allocated by the state tracker, then @@ -377,10 +377,10 @@ static void si_launch_grid( assert(lds_blocks <= 0xFF); - shader->rsrc2 &= C_00B84C_LDS_SIZE; - shader->rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks); + shader->config.rsrc2 &= C_00B84C_LDS_SIZE; + shader->config.rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks); - si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, shader->rsrc2); + si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, shader->config.rsrc2); si_pm4_set_reg(pm4, R_00B854_COMPUTE_RESOURCE_LIMITS, 0); si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, @@ -402,7 +402,7 @@ static void si_launch_grid( * COMPUTE_PGM_RSRC2.SCRATCH_EN is enabled. */ S_00B860_WAVES(num_waves_for_scratch) - | S_00B860_WAVESIZE(shader->scratch_bytes_per_wave >> 10)) + | S_00B860_WAVESIZE(shader->config.scratch_bytes_per_wave >> 10)) ; si_pm4_cmd_begin(pm4, PKT3_DISPATCH_DIRECT); diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index e40e7c18372..a92bedb2f7b 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -3727,25 +3727,25 @@ void si_shader_binary_read_config(struct si_shader *shader, case R_00B128_SPI_SHADER_PGM_RSRC1_VS: case R_00B228_SPI_SHADER_PGM_RSRC1_GS: case R_00B848_COMPUTE_PGM_RSRC1: - shader->num_sgprs = MAX2(shader->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8); - shader->num_vgprs = MAX2(shader->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4); - shader->float_mode = G_00B028_FLOAT_MODE(value); - shader->rsrc1 = value; + shader->config.num_sgprs = MAX2(shader->config.num_sgprs, (G_00B028_SGPRS(value) + 1) * 8); + shader->config.num_vgprs = MAX2(shader->config.num_vgprs, (G_00B028_VGPRS(value) + 1) * 4); + shader->config.float_mode = G_00B028_FLOAT_MODE(value); + shader->config.rsrc1 = value; break; case R_00B02C_SPI_SHADER_PGM_RSRC2_PS: - shader->lds_size = MAX2(shader->lds_size, G_00B02C_EXTRA_LDS_SIZE(value)); + shader->config.lds_size = MAX2(shader->config.lds_size, G_00B02C_EXTRA_LDS_SIZE(value)); break; case R_00B84C_COMPUTE_PGM_RSRC2: - shader->lds_size = MAX2(shader->lds_size, G_00B84C_LDS_SIZE(value)); - shader->rsrc2 = value; + shader->config.lds_size = MAX2(shader->config.lds_size, G_00B84C_LDS_SIZE(value)); + shader->config.rsrc2 = value; break; case R_0286CC_SPI_PS_INPUT_ENA: - shader->spi_ps_input_ena = value; + shader->config.spi_ps_input_ena = value; break; case R_0286E8_SPI_TMPRING_SIZE: case R_00B860_COMPUTE_TMPRING_SIZE: /* WAVESIZE is in units of 256 dwords. */ - shader->scratch_bytes_per_wave = + shader->config.scratch_bytes_per_wave = G_00B860_WAVESIZE(value) * 256 * 4 * 1; break; default: @@ -3764,7 +3764,7 @@ void si_shader_apply_scratch_relocs(struct si_context *sctx, uint32_t scratch_rsrc_dword0 = scratch_va; uint32_t scratch_rsrc_dword1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) - | S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64); + | S_008F04_STRIDE(shader->config.scratch_bytes_per_wave / 64); for (i = 0 ; i < shader->binary.reloc_count; i++) { const struct radeon_shader_reloc *reloc = @@ -3866,14 +3866,15 @@ void si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader, fprintf(stderr, "*** SHADER STATS ***\n" "SGPRS: %d\nVGPRS: %d\nCode Size: %d bytes\nLDS: %d blocks\n" "Scratch: %d bytes per wave\n********************\n", - shader->num_sgprs, shader->num_vgprs, binary->code_size, - shader->lds_size, shader->scratch_bytes_per_wave); + shader->config.num_sgprs, shader->config.num_vgprs, binary->code_size, + shader->config.lds_size, shader->config.scratch_bytes_per_wave); } pipe_debug_message(debug, SHADER_INFO, "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d LDS: %d Scratch: %d", - shader->num_sgprs, shader->num_vgprs, binary->code_size, - shader->lds_size, shader->scratch_bytes_per_wave); + shader->config.num_sgprs, shader->config.num_vgprs, + binary->code_size, shader->config.lds_size, + shader->config.scratch_bytes_per_wave); } int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader, @@ -3907,7 +3908,7 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader, FREE(shader->binary.config); FREE(shader->binary.rodata); FREE(shader->binary.global_symbol_offsets); - if (shader->scratch_bytes_per_wave == 0) { + if (shader->config.scratch_bytes_per_wave == 0) { FREE(shader->binary.code); FREE(shader->binary.relocs); memset(&shader->binary, 0, diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index b89d3b29e69..c892ca32803 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -262,6 +262,17 @@ union si_shader_key { } tes; /* tessellation evaluation shader */ }; +struct si_shader_config { + unsigned num_sgprs; + unsigned num_vgprs; + unsigned lds_size; + unsigned spi_ps_input_ena; + unsigned float_mode; + unsigned scratch_bytes_per_wave; + unsigned rsrc1; + unsigned rsrc2; +}; + struct si_shader { struct si_shader_selector *selector; struct si_shader *next_variant; @@ -270,14 +281,9 @@ struct si_shader { struct si_pm4_state *pm4; struct r600_resource *bo; struct r600_resource *scratch_bo; - struct radeon_shader_binary binary; - unsigned num_sgprs; - unsigned num_vgprs; - unsigned lds_size; - unsigned spi_ps_input_ena; - unsigned float_mode; - unsigned scratch_bytes_per_wave; union si_shader_key key; + struct radeon_shader_binary binary; + struct si_shader_config config; unsigned nparam; unsigned vs_output_param_offset[PIPE_MAX_SHADER_OUTPUTS]; @@ -288,9 +294,6 @@ struct si_shader { unsigned nr_param_exports; bool is_gs_copy_shader; bool dx10_clamp_mode; /* convert NaNs to 0 */ - - unsigned rsrc1; - unsigned rsrc2; }; static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx) diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 87a5afbbc97..91ccd073267 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -163,7 +163,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size; lds_size = output_patch0_offset + output_patch_size * *num_patches; - ls_rsrc2 = ls->current->rsrc2; + ls_rsrc2 = ls->current->config.rsrc2; if (sctx->b.chip_class >= CIK) { assert(lds_size <= 65536); @@ -178,7 +178,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII) radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2); radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2); - radeon_emit(cs, ls->current->rsrc1); + radeon_emit(cs, ls->current->config.rsrc1); radeon_emit(cs, ls_rsrc2); /* Compute userdata SGPRs. */ diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index af21f3e054c..64adf699604 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -111,7 +111,7 @@ static void si_shader_ls(struct si_shader *shader) vgpr_comp_cnt = shader->uses_instanceid ? 3 : 1; num_user_sgprs = SI_LS_NUM_USER_SGPR; - num_sgprs = shader->num_sgprs; + num_sgprs = shader->config.num_sgprs; if (num_user_sgprs > num_sgprs) { /* Last 2 reserved SGPRs are used for VCC */ num_sgprs = num_user_sgprs + 2; @@ -121,12 +121,12 @@ static void si_shader_ls(struct si_shader *shader) si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40); - shader->rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) | + shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B528_SGPRS((num_sgprs - 1) / 8) | S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B528_DX10_CLAMP(shader->dx10_clamp_mode); - shader->rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) | - S_00B52C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0); + shader->config.rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) | + S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); } static void si_shader_hs(struct si_shader *shader) @@ -143,7 +143,7 @@ static void si_shader_hs(struct si_shader *shader) si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER); num_user_sgprs = SI_TCS_NUM_USER_SGPR; - num_sgprs = shader->num_sgprs; + num_sgprs = shader->config.num_sgprs; /* One SGPR after user SGPRs is pre-loaded with tessellation factor * buffer offset. */ if ((num_user_sgprs + 1) > num_sgprs) { @@ -155,12 +155,12 @@ static void si_shader_hs(struct si_shader *shader) si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8); si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, va >> 40); si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS, - S_00B428_VGPRS((shader->num_vgprs - 1) / 4) | + S_00B428_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B428_SGPRS((num_sgprs - 1) / 8) | S_00B428_DX10_CLAMP(shader->dx10_clamp_mode)); si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, S_00B42C_USER_SGPR(num_user_sgprs) | - S_00B42C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); + S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); } static void si_shader_es(struct si_shader *shader) @@ -187,7 +187,7 @@ static void si_shader_es(struct si_shader *shader) } else unreachable("invalid shader selector type"); - num_sgprs = shader->num_sgprs; + num_sgprs = shader->config.num_sgprs; /* One SGPR after user SGPRs is pre-loaded with es2gs_offset */ if ((num_user_sgprs + 1) > num_sgprs) { /* Last 2 reserved SGPRs are used for VCC */ @@ -200,13 +200,13 @@ static void si_shader_es(struct si_shader *shader) si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40); si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES, - S_00B328_VGPRS((shader->num_vgprs - 1) / 4) | + S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B328_SGPRS((num_sgprs - 1) / 8) | S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B328_DX10_CLAMP(shader->dx10_clamp_mode)); si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES, S_00B32C_USER_SGPR(num_user_sgprs) | - S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); + S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); if (shader->selector->type == PIPE_SHADER_TESS_EVAL) si_set_tesseval_regs(shader, pm4); @@ -272,7 +272,7 @@ static void si_shader_gs(struct si_shader *shader) si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, va >> 40); num_user_sgprs = SI_GS_NUM_USER_SGPR; - num_sgprs = shader->num_sgprs; + num_sgprs = shader->config.num_sgprs; /* Two SGPRs after user SGPRs are pre-loaded with gs2vs_offset, gs_wave_id */ if ((num_user_sgprs + 2) > num_sgprs) { /* Last 2 reserved SGPRs are used for VCC */ @@ -281,12 +281,12 @@ static void si_shader_gs(struct si_shader *shader) assert(num_sgprs <= 104); si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, - S_00B228_VGPRS((shader->num_vgprs - 1) / 4) | + S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B228_SGPRS((num_sgprs - 1) / 8) | S_00B228_DX10_CLAMP(shader->dx10_clamp_mode)); si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, S_00B22C_USER_SGPR(num_user_sgprs) | - S_00B22C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); + S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); } static void si_shader_vs(struct si_shader *shader) @@ -329,7 +329,7 @@ static void si_shader_vs(struct si_shader *shader) } else unreachable("invalid shader selector type"); - num_sgprs = shader->num_sgprs; + num_sgprs = shader->config.num_sgprs; if (num_user_sgprs > num_sgprs) { /* Last 2 reserved SGPRs are used for VCC */ num_sgprs = num_user_sgprs + 2; @@ -356,7 +356,7 @@ static void si_shader_vs(struct si_shader *shader) si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8); si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, va >> 40); si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS, - S_00B128_VGPRS((shader->num_vgprs - 1) / 4) | + S_00B128_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B128_SGPRS((num_sgprs - 1) / 8) | S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B128_DX10_CLAMP(shader->dx10_clamp_mode)); @@ -367,7 +367,7 @@ static void si_shader_vs(struct si_shader *shader) S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) | S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) | S_00B12C_SO_EN(!!shader->selector->so.num_outputs) | - S_00B12C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); + S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); if (window_space) si_pm4_set_reg(pm4, R_028818_PA_CL_VTE_CNTL, S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1)); @@ -443,8 +443,8 @@ static void si_shader_ps(struct si_shader *shader) } /* Set interpolation controls. */ - has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->spi_ps_input_ena) || - G_0286CC_LINEAR_CENTROID_ENA(shader->spi_ps_input_ena); + has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena); spi_ps_in_control = S_0286D8_NUM_INTERP(shader->nparam) | S_0286D8_BC_OPTIMIZE_DISABLE(has_centroid); @@ -468,7 +468,7 @@ static void si_shader_ps(struct si_shader *shader) si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, va >> 40); num_user_sgprs = SI_PS_NUM_USER_SGPR; - num_sgprs = shader->num_sgprs; + num_sgprs = shader->config.num_sgprs; /* One SGPR after user SGPRs is pre-loaded with {prim_mask, lds_offset} */ if ((num_user_sgprs + 1) > num_sgprs) { /* Last 2 reserved SGPRs are used for VCC */ @@ -477,13 +477,13 @@ static void si_shader_ps(struct si_shader *shader) assert(num_sgprs <= 104); si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS, - S_00B028_VGPRS((shader->num_vgprs - 1) / 4) | + S_00B028_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B028_SGPRS((num_sgprs - 1) / 8) | S_00B028_DX10_CLAMP(shader->dx10_clamp_mode)); si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS, - S_00B02C_EXTRA_LDS_SIZE(shader->lds_size) | + S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) | S_00B02C_USER_SGPR(num_user_sgprs) | - S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); + S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); } static void si_shader_init_pm4_state(struct si_shader *shader) @@ -1040,7 +1040,7 @@ static void si_emit_spi_ps_input(struct si_context *sctx, struct r600_atom *atom if (!ps) return; - input_ena = ps->spi_ps_input_ena; + input_ena = ps->config.spi_ps_input_ena; /* we need to enable at least one of them, otherwise we hang the GPU */ assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) || @@ -1269,7 +1269,7 @@ static int si_update_scratch_buffer(struct si_context *sctx, return 0; /* This shader doesn't need a scratch buffer */ - if (shader->scratch_bytes_per_wave == 0) + if (shader->config.scratch_bytes_per_wave == 0) return 0; /* This shader is already configured to use the current @@ -1301,7 +1301,7 @@ static unsigned si_get_current_scratch_buffer_size(struct si_context *sctx) static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader *shader) { - return shader ? shader->scratch_bytes_per_wave : 0; + return shader ? shader->config.scratch_bytes_per_wave : 0; } static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx)