diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 16be11247e4..bb8d1cbd12d 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -902,6 +902,7 @@ void si_resource_copy_region(struct pipe_context *ctx,
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_texture *ssrc = (struct si_texture*)src;
+	struct si_texture *sdst = (struct si_texture*)dst;
 	struct pipe_surface *dst_view, dst_templ;
 	struct pipe_sampler_view src_templ, *src_view;
 	unsigned dst_width, dst_height, src_width0, src_height0;
@@ -914,6 +915,17 @@ void si_resource_copy_region(struct pipe_context *ctx,
 		return;
 	}
 
+	if (!util_format_is_compressed(src->format) &&
+	    !util_format_is_compressed(dst->format) &&
+	    !util_format_is_depth_or_stencil(src->format) &&
+	    src->nr_samples <= 1 &&
+	    !sdst->dcc_offset &&
+	    !(dst->target != src->target &&
+	      (src->target == PIPE_TEXTURE_1D_ARRAY || dst->target == PIPE_TEXTURE_1D_ARRAY))) {
+		si_compute_copy_image(sctx, dst, dst_level, src, src_level, dstx, dsty, dstz, src_box);
+		return;
+	}
+
 	assert(u_max_sample(dst) == u_max_sample(src));
 
 	/* The driver doesn't decompress resources automatically while
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index dfa77a98804..31f5261ad8d 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -24,6 +24,7 @@
  */
 
 #include "si_pipe.h"
+#include "util/u_format.h"
 
 /* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
  * and L2_STREAM for src.
@@ -292,6 +293,126 @@ void si_copy_buffer(struct si_context *sctx,
 	}
 }
 
+/* Copy the box src_box of (src, src_level) to (dst, dst_level) at
+ * dstx/dsty/dstz with a compute shader. Compute constant buffer 0,
+ * compute image slots 0-1 and the bound compute shader are saved on
+ * entry and rebound before returning.
+ */
+void si_compute_copy_image(struct si_context *sctx,
+			   struct pipe_resource *dst,
+			   unsigned dst_level,
+			   struct pipe_resource *src,
+			   unsigned src_level,
+			   unsigned dstx, unsigned dsty, unsigned dstz,
+			   const struct pipe_box *src_box)
+{
+	struct pipe_context *ctx = &sctx->b;
+	unsigned width = src_box->width;
+	unsigned height = src_box->height;
+	unsigned depth = src_box->depth;
+
+	unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, dstx, dsty, dstz, 0};
+
+	if (width == 0 || height == 0)
+		return;
+
+	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
+		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
+	si_make_CB_shader_coherent(sctx, dst->nr_samples, true);
+
+	struct pipe_constant_buffer saved_cb = {};
+	si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+
+	struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
+	struct pipe_image_view saved_image[2] = {0};
+	util_copy_image_view(&saved_image[0], &images->views[0]);
+	util_copy_image_view(&saved_image[1], &images->views[1]);
+
+	void *saved_cs = sctx->cs_shader_state.program;
+
+	struct pipe_constant_buffer cb = {};
+	cb.buffer_size = sizeof(data);
+	cb.user_buffer = data;
+	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
+
+	struct pipe_image_view image[2] = {0};
+	image[0].resource = src;
+	image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
+	image[0].format = util_format_linear(src->format);
+	image[0].u.tex.level = src_level;
+	image[0].u.tex.first_layer = 0;
+	image[0].u.tex.last_layer =
+		src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1
+						: (unsigned)(src->array_size - 1);
+	image[1].resource = dst;
+	image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
+	image[1].format = util_format_linear(dst->format);
+	image[1].u.tex.level = dst_level;
+	image[1].u.tex.first_layer = 0;
+	image[1].u.tex.last_layer =
+		dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1
+						: (unsigned)(dst->array_size - 1);
+
+	if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT)
+		image[0].format = image[1].format = PIPE_FORMAT_R32_UINT;
+
+	/* SNORM8 blitting has precision issues on some chips. Use the SINT
+	 * equivalent instead, which doesn't force DCC decompression.
+	 * Note that some chips avoid this issue by using SDMA.
+	 */
+	if (util_format_is_snorm8(dst->format)) {
+		image[0].format = image[1].format =
+			util_format_snorm8_to_sint8(dst->format);
+	}
+
+	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image);
+
+	struct pipe_grid_info info = {0};
+
+	if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) {
+		if (!sctx->cs_copy_image_1d_array)
+			sctx->cs_copy_image_1d_array =
+				si_create_copy_image_compute_shader_1d_array(ctx);
+		ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array);
+		info.block[0] = 64;
+		sctx->compute_last_block[0] = width % 64;
+		info.block[1] = 1;
+		info.block[2] = 1;
+		info.grid[0] = DIV_ROUND_UP(width, 64);
+		info.grid[1] = depth;
+		info.grid[2] = 1;
+	} else {
+		if (!sctx->cs_copy_image)
+			sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx);
+		ctx->bind_compute_state(ctx, sctx->cs_copy_image);
+		info.block[0] = 8;
+		sctx->compute_last_block[0] = width % 8;
+		info.block[1] = 8;
+		sctx->compute_last_block[1] = height % 8;
+		info.block[2] = 1;
+		info.grid[0] = DIV_ROUND_UP(width, 8);
+		info.grid[1] = DIV_ROUND_UP(height, 8);
+		info.grid[2] = depth;
+	}
+
+	ctx->launch_grid(ctx, &info);
+
+	sctx->compute_last_block[0] = 0;
+	sctx->compute_last_block[1] = 0;
+
+	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
+		       (sctx->chip_class <= VI ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) |
+		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
+	ctx->bind_compute_state(ctx, saved_cs);
+	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
+	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+
+	/* Saving the state took resource references; drop them after rebinding. */
+	for (unsigned i = 0; i < ARRAY_SIZE(saved_image); i++)
+		pipe_resource_reference(&saved_image[i].resource, NULL);
+	pipe_resource_reference(&saved_cb.buffer, NULL);
+}
+
 void si_init_compute_blit_functions(struct si_context *sctx)
 {
 	sctx->b.clear_buffer = si_pipe_clear_buffer;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 4edb25494ea..3bb8e04e4ad 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -201,6 +201,10 @@ static void si_destroy_context(struct pipe_context *context)
 		sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer);
 	if (sctx->cs_copy_buffer)
 		sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
+	if (sctx->cs_copy_image)
+		sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image);
+	if (sctx->cs_copy_image_1d_array)
+		sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image_1d_array);
 
 	if (sctx->blitter)
 		util_blitter_destroy(sctx->blitter);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 37eb15f539e..23052aa7192 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -800,6 +800,8 @@ struct si_context {
 	void				*vs_blit_texcoord;
 	void				*cs_clear_buffer;
 	void				*cs_copy_buffer;
+	void				*cs_copy_image;
+	void				*cs_copy_image_1d_array;
 	struct si_screen		*screen;
 	struct pipe_debug_callback	debug;
 	struct ac_llvm_compiler		compiler; /* only non-threaded compilation */
@@ -1170,6 +1172,13 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 void si_copy_buffer(struct si_context *sctx,
 		    struct pipe_resource *dst, struct pipe_resource *src,
 		    uint64_t dst_offset, uint64_t src_offset, unsigned size);
+void si_compute_copy_image(struct si_context *sctx,
+			   struct pipe_resource *dst,
+			   unsigned dst_level,
+			   struct pipe_resource *src,
+			   unsigned src_level,
+			   unsigned dstx, unsigned dsty, unsigned dstz,
+			   const struct pipe_box *src_box);
 void si_init_compute_blit_functions(struct si_context *sctx);
 
 /* si_cp_dma.c */
@@ -1283,6 +1292,8 @@ void *si_create_fixed_func_tcs(struct si_context *sctx);
 void *si_create_dma_compute_shader(struct pipe_context *ctx,
 				   unsigned num_dwords_per_thread,
 				   bool dst_stream_cache_policy, bool is_copy);
+void *si_create_copy_image_compute_shader(struct pipe_context *ctx);
+void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx);
 void *si_create_query_result_cs(struct si_context *sctx);
 
 /* si_test_dma.c */
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
index da55c81dd68..55f96b3a25e 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
@@ -439,3 +439,83 @@ void *si_create_query_result_cs(struct si_context *sctx)
 
 	return sctx->b.create_compute_state(&sctx->b, &state);
 }
+
+/* Create a compute shader implementing copy_image.
+ * Luckily, this works with all texture targets except 1D_ARRAY.
+ */
+void *si_create_copy_image_compute_shader(struct pipe_context *ctx)
+{
+	static const char text[] =
+		"COMP\n"
+		"PROPERTY CS_FIXED_BLOCK_WIDTH 8\n"
+		"PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n"
+		"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+		"DCL SV[0], THREAD_ID\n"
+		"DCL SV[1], BLOCK_ID\n"
+		"DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+		"DCL IMAGE[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+		"DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
+		"DCL TEMP[0..4], LOCAL\n"
+		"IMM[0] UINT32 {8, 1, 0, 0}\n"
+		"MOV TEMP[0].xyz, CONST[0][0].xyzw\n"
+		"UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n"
+		"UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n"
+		"LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+		"MOV TEMP[4].xyz, CONST[0][1].xyzw\n"
+		"UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[4].xyzx\n"
+		"STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+		"END\n";
+
+	struct tgsi_token tokens[1024];
+	struct pipe_compute_state state = {0};
+
+	if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+		assert(false);
+		return NULL;
+	}
+
+	state.ir_type = PIPE_SHADER_IR_TGSI;
+	state.prog = tokens;
+
+	return ctx->create_compute_state(ctx, &state);
+}
+
+/* Create the copy_image compute shader variant for 1D_ARRAY images.
+ * It uses a 64x1x1 block and takes the layer from the .z constants.
+ */
+void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx)
+{
+	static const char text[] =
+		"COMP\n"
+		"PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
+		"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+		"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+		"DCL SV[0], THREAD_ID\n"
+		"DCL SV[1], BLOCK_ID\n"
+		"DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+		"DCL IMAGE[1], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+		"DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
+		"DCL TEMP[0..4], LOCAL\n"
+		"IMM[0] UINT32 {64, 1, 0, 0}\n"
+		"MOV TEMP[0].xy, CONST[0][0].xzzw\n"
+		"UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
+		"UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n"
+		"LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+		"MOV TEMP[4].xy, CONST[0][1].xzzw\n"
+		"UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[4].xyzx\n"
+		"STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+		"END\n";
+
+	struct tgsi_token tokens[1024];
+	struct pipe_compute_state state = {0};
+
+	if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+		assert(false);
+		return NULL;
+	}
+
+	state.ir_type = PIPE_SHADER_IR_TGSI;
+	state.prog = tokens;
+
+	return ctx->create_compute_state(ctx, &state);
+}