From 6c901f067526a878657874806641c1cfd79c940f Mon Sep 17 00:00:00 2001
From: Sonny Jiang
Date: Fri, 29 Nov 2019 18:04:54 -0500
Subject: [PATCH] radeonsi: use compute shader for clear 12-byte buffer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Sonny Jiang
Reviewed-by: Marek Olšák
---
Reviewer notes (this section is ignored when the patch is applied):
 - The patch had been collapsed onto a few very long lines; the line
   structure was restored. Indentation (tabs) is reconstructed and should
   be checked against the target tree before applying.
 - si_compute_clear_12bytes_buffer() now releases saved_sb.buffer and
   saved_cb.buffer after re-binding them, so a 12-byte clear no longer
   leaks a reference on the previously bound buffers.
 - "= {}" initializers were changed to "= {0}" to match the rest of the
   function and avoid relying on a compiler extension.
 - Header comments were added to the two new functions. Hunk line counts
   and the diffstat were updated accordingly; the post-image blob ids in
   the "index" lines of si_compute_blit.c and si_shaderlib_tgsi.c are stale.

 .../drivers/radeonsi/si_compute_blit.c        | 97 ++++++++++++++++---
 src/gallium/drivers/radeonsi/si_pipe.c        |  2 +
 src/gallium/drivers/radeonsi/si_pipe.h        |  2 +
 .../drivers/radeonsi/si_shaderlib_tgsi.c      | 36 +++++++
 4 files changed, 127 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index ff573c131f4..8d4f3bab5d7 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -73,6 +73,92 @@ static void si_compute_internal_end(struct si_context *sctx)
 	sctx->render_cond_force_off = false;
 }
 
+/**
+ * Clear a buffer range with a 12-byte (three-dword) value using a compute
+ * shader.
+ *
+ * dst_offset and size must be multiples of 4. One thread is launched per
+ * 12-byte element (size is rounded up to whole elements), in blocks of 64
+ * threads. Shader buffer slot 0, constant buffer slot 0 and the bound
+ * compute shader are saved first and restored after the dispatch.
+ */
+static void si_compute_clear_12bytes_buffer(struct si_context *sctx,
+					struct pipe_resource *dst,
+					unsigned dst_offset,
+					unsigned size,
+					const uint32_t *clear_value,
+					enum si_coherency coher)
+{
+	struct pipe_context *ctx = &sctx->b;
+
+	assert(dst_offset % 4 == 0);
+	assert(size % 4 == 0);
+	unsigned size_12 = DIV_ROUND_UP(size, 12);
+
+	/* The shader reads CONST[0][0] as a vec4: pad the value with a zero dword. */
+	unsigned data[4] = {0};
+	memcpy(data, clear_value, 12);
+
+	si_compute_internal_begin(sctx);
+
+	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+		       SI_CONTEXT_CS_PARTIAL_FLUSH |
+		       si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+
+	struct pipe_shader_buffer saved_sb = {0};
+	si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb);
+
+	unsigned saved_writable_mask = 0;
+	if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
+	    (1u << si_get_shaderbuf_slot(0)))
+		saved_writable_mask = 1;
+
+	struct pipe_constant_buffer saved_cb = {0};
+	si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+
+	void *saved_cs = sctx->cs_shader_state.program;
+
+	struct pipe_constant_buffer cb = {0};
+	cb.buffer_size = sizeof(data);
+	cb.user_buffer = data;
+	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
+
+	struct pipe_shader_buffer sb = {0};
+	sb.buffer = dst;
+	sb.buffer_offset = dst_offset;
+	sb.buffer_size = size;
+
+	ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1);
+
+	struct pipe_grid_info info = {0};
+
+	if (!sctx->cs_clear_12bytes_buffer)
+		sctx->cs_clear_12bytes_buffer =
+			si_clear_12bytes_buffer_shader(ctx);
+	ctx->bind_compute_state(ctx, sctx->cs_clear_12bytes_buffer);
+	info.block[0] = 64;
+	info.last_block[0] = size_12 % 64;
+	info.block[1] = 1;
+	info.block[2] = 1;
+	info.grid[0] = DIV_ROUND_UP(size_12, 64);
+	info.grid[1] = 1;
+	info.grid[2] = 1;
+
+	ctx->launch_grid(ctx, &info);
+
+	ctx->bind_compute_state(ctx, saved_cs);
+	ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask);
+	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+
+	si_compute_internal_end(sctx);
+
+	/* Drop the references that si_get_shader_buffers() and
+	 * si_get_pipe_constant_buffer() took on the saved bindings.
+	 * NOTE(review): confirm against the getters in si_descriptors.c. */
+	pipe_resource_reference(&saved_sb.buffer, NULL);
+	pipe_resource_reference(&saved_cb.buffer, NULL);
+}
+
 static void si_compute_do_clear_or_copy(struct si_context *sctx,
 					struct pipe_resource *dst,
 					unsigned dst_offset,
@@ -231,17 +317,8 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 		clear_value_size = 4;
 	}
 
-	/* Use transform feedback for 12-byte clears. */
-	/* TODO: Use compute. */
 	if (clear_value_size == 12) {
-		union pipe_color_union streamout_clear_value;
-
-		memcpy(&streamout_clear_value, clear_value, clear_value_size);
-		si_blitter_begin(sctx, SI_DISABLE_RENDER_COND);
-		util_blitter_clear_buffer(sctx->blitter, dst, offset,
-					  size, clear_value_size / 4,
-					  &streamout_clear_value);
-		si_blitter_end(sctx);
+		si_compute_clear_12bytes_buffer(sctx, dst, offset, size, clear_value, coher);
 		return;
 	}
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index aa627279ed3..0ac70a5fdf5 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -232,6 +232,8 @@ static void si_destroy_context(struct pipe_context *context)
 		sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target);
 	if (sctx->cs_clear_render_target_1d_array)
 		sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target_1d_array);
+	if (sctx->cs_clear_12bytes_buffer)
+		sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_12bytes_buffer);
 	if (sctx->cs_dcc_retile)
 		sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile);
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index f313e565d5f..b4e06609582 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -894,6 +894,7 @@ struct si_context {
 	void				*cs_copy_image_1d_array;
 	void				*cs_clear_render_target;
 	void				*cs_clear_render_target_1d_array;
+	void				*cs_clear_12bytes_buffer;
 	void				*cs_dcc_retile;
 	void				*cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */
 	struct si_screen		*screen;
@@ -1450,6 +1451,7 @@ void *si_create_copy_image_compute_shader(struct pipe_context *ctx);
 void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx);
 void *si_clear_render_target_shader(struct pipe_context *ctx);
 void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx);
+void *si_clear_12bytes_buffer_shader(struct pipe_context *ctx);
 void *si_create_dcc_retile_cs(struct pipe_context *ctx);
 void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples,
 				bool is_array);
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
index 0cf0cd95a8b..90eb39e3506 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
@@ -665,6 +665,42 @@ void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx)
 	return ctx->create_compute_state(ctx, &state);
 }
 
+/* Create the compute shader for 12-byte buffer clears. Each thread stores
+ * CONST[0][0].xyz to BUFFER[0] at byte offset
+ * (BLOCK_ID.x * 64 + THREAD_ID.x) * 12. Returns NULL if translation fails. */
+void *si_clear_12bytes_buffer_shader(struct pipe_context *ctx)
+{
+	static const char text[] =
+		"COMP\n"
+		"PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
+		"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+		"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+		"DCL SV[0], THREAD_ID\n"
+		"DCL SV[1], BLOCK_ID\n"
+		"DCL BUFFER[0]\n"
+		"DCL CONST[0][0..0]\n" // 0:xyzw
+		"DCL TEMP[0..0]\n"
+		"IMM[0] UINT32 {64, 1, 12, 0}\n"
+		"UMAD TEMP[0].x, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
+		"UMUL TEMP[0].x, TEMP[0].xyzz, IMM[0].zzzz\n" //12 bytes
+		"STORE BUFFER[0].xyz, TEMP[0].xxxx, CONST[0][0].xyzw\n"
+		"END\n";
+
+	struct tgsi_token tokens[1024];
+	struct pipe_compute_state state = {0};
+
+	if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+		assert(false);
+		return NULL;
+	}
+
+	state.ir_type = PIPE_SHADER_IR_TGSI;
+	state.prog = tokens;
+
+	return ctx->create_compute_state(ctx, &state);
+}
+
+
 /* Load samples from the image, and copy them to the same image. This looks like
  * a no-op, but it's not. Loads use FMASK, while stores don't, so samples are
  * reordered to match expanded FMASK.