radeonsi: use compute for resource_copy_region when possible
v2: marek: fix snorm8 blits Signed-off-by: Sonny Jiang <sonny.jiang@amd.com> Signed-off-by: Marek Olšák <marek.olsak@amd.com>
This commit is contained in:
parent
8daf5bb209
commit
1b25d340b7
|
@ -902,6 +902,7 @@ void si_resource_copy_region(struct pipe_context *ctx,
|
|||
{
|
||||
struct si_context *sctx = (struct si_context *)ctx;
|
||||
struct si_texture *ssrc = (struct si_texture*)src;
|
||||
struct si_texture *sdst = (struct si_texture*)dst;
|
||||
struct pipe_surface *dst_view, dst_templ;
|
||||
struct pipe_sampler_view src_templ, *src_view;
|
||||
unsigned dst_width, dst_height, src_width0, src_height0;
|
||||
|
@ -914,6 +915,17 @@ void si_resource_copy_region(struct pipe_context *ctx,
|
|||
return;
|
||||
}
|
||||
|
||||
if (!util_format_is_compressed(src->format) &&
|
||||
!util_format_is_compressed(dst->format) &&
|
||||
!util_format_is_depth_or_stencil(src->format) &&
|
||||
src->nr_samples <= 1 &&
|
||||
!sdst->dcc_offset &&
|
||||
!(dst->target != src->target &&
|
||||
(src->target == PIPE_TEXTURE_1D_ARRAY || dst->target == PIPE_TEXTURE_1D_ARRAY))) {
|
||||
si_compute_copy_image(sctx, dst, dst_level, src, src_level, dstx, dsty, dstz, src_box);
|
||||
return;
|
||||
}
|
||||
|
||||
assert(u_max_sample(dst) == u_max_sample(src));
|
||||
|
||||
/* The driver doesn't decompress resources automatically while
|
||||
|
|
|
@ -24,6 +24,7 @@
|
|||
*/
|
||||
|
||||
#include "si_pipe.h"
|
||||
#include "util/u_format.h"
|
||||
|
||||
/* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
|
||||
* and L2_STREAM for src.
|
||||
|
@ -292,6 +293,116 @@ void si_copy_buffer(struct si_context *sctx,
|
|||
}
|
||||
}
|
||||
|
||||
void si_compute_copy_image(struct si_context *sctx,
|
||||
struct pipe_resource *dst,
|
||||
unsigned dst_level,
|
||||
struct pipe_resource *src,
|
||||
unsigned src_level,
|
||||
unsigned dstx, unsigned dsty, unsigned dstz,
|
||||
const struct pipe_box *src_box)
|
||||
{
|
||||
struct pipe_context *ctx = &sctx->b;
|
||||
unsigned width = src_box->width;
|
||||
unsigned height = src_box->height;
|
||||
unsigned depth = src_box->depth;
|
||||
|
||||
unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, dstx, dsty, dstz, 0};
|
||||
|
||||
if (width == 0 || height == 0)
|
||||
return;
|
||||
|
||||
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
|
||||
si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
|
||||
si_make_CB_shader_coherent(sctx, dst->nr_samples, true);
|
||||
|
||||
struct pipe_constant_buffer saved_cb = {};
|
||||
si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
|
||||
|
||||
struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
|
||||
struct pipe_image_view saved_image[2] = {0};
|
||||
util_copy_image_view(&saved_image[0], &images->views[0]);
|
||||
util_copy_image_view(&saved_image[1], &images->views[1]);
|
||||
|
||||
void *saved_cs = sctx->cs_shader_state.program;
|
||||
|
||||
struct pipe_constant_buffer cb = {};
|
||||
cb.buffer_size = sizeof(data);
|
||||
cb.user_buffer = data;
|
||||
ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
|
||||
|
||||
struct pipe_image_view image[2] = {0};
|
||||
image[0].resource = src;
|
||||
image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
|
||||
image[0].format = util_format_linear(src->format);
|
||||
image[0].u.tex.level = src_level;
|
||||
image[0].u.tex.first_layer = 0;
|
||||
image[0].u.tex.last_layer =
|
||||
src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1
|
||||
: (unsigned)(src->array_size - 1);
|
||||
image[1].resource = dst;
|
||||
image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
|
||||
image[1].format = util_format_linear(dst->format);
|
||||
image[1].u.tex.level = dst_level;
|
||||
image[1].u.tex.first_layer = 0;
|
||||
image[1].u.tex.last_layer =
|
||||
dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1
|
||||
: (unsigned)(dst->array_size - 1);
|
||||
|
||||
if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT)
|
||||
image[0].format = image[1].format = PIPE_FORMAT_R32_UINT;
|
||||
|
||||
/* SNORM8 blitting has precision issues on some chips. Use the SINT
|
||||
* equivalent instead, which doesn't force DCC decompression.
|
||||
* Note that some chips avoid this issue by using SDMA.
|
||||
*/
|
||||
if (util_format_is_snorm8(dst->format)) {
|
||||
image[0].format = image[1].format =
|
||||
util_format_snorm8_to_sint8(dst->format);
|
||||
}
|
||||
|
||||
ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image);
|
||||
|
||||
struct pipe_grid_info info = {0};
|
||||
|
||||
if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) {
|
||||
if (!sctx->cs_copy_image_1d_array)
|
||||
sctx->cs_copy_image_1d_array =
|
||||
si_create_copy_image_compute_shader_1d_array(ctx);
|
||||
ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array);
|
||||
info.block[0] = 64;
|
||||
sctx->compute_last_block[0] = width % 64;
|
||||
info.block[1] = 1;
|
||||
info.block[2] = 1;
|
||||
info.grid[0] = DIV_ROUND_UP(width, 64);
|
||||
info.grid[1] = depth;
|
||||
info.grid[2] = 1;
|
||||
} else {
|
||||
if (!sctx->cs_copy_image)
|
||||
sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx);
|
||||
ctx->bind_compute_state(ctx, sctx->cs_copy_image);
|
||||
info.block[0] = 8;
|
||||
sctx->compute_last_block[0] = width % 8;
|
||||
info.block[1] = 8;
|
||||
sctx->compute_last_block[1] = height % 8;
|
||||
info.block[2] = 1;
|
||||
info.grid[0] = DIV_ROUND_UP(width, 8);
|
||||
info.grid[1] = DIV_ROUND_UP(height, 8);
|
||||
info.grid[2] = depth;
|
||||
}
|
||||
|
||||
ctx->launch_grid(ctx, &info);
|
||||
|
||||
sctx->compute_last_block[0] = 0;
|
||||
sctx->compute_last_block[1] = 0;
|
||||
|
||||
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
|
||||
(sctx->chip_class <= VI ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) |
|
||||
si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
|
||||
ctx->bind_compute_state(ctx, saved_cs);
|
||||
ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
|
||||
ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
|
||||
}
|
||||
|
||||
void si_init_compute_blit_functions(struct si_context *sctx)
|
||||
{
|
||||
sctx->b.clear_buffer = si_pipe_clear_buffer;
|
||||
|
|
|
@ -201,6 +201,10 @@ static void si_destroy_context(struct pipe_context *context)
|
|||
sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer);
|
||||
if (sctx->cs_copy_buffer)
|
||||
sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
|
||||
if (sctx->cs_copy_image)
|
||||
sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image);
|
||||
if (sctx->cs_copy_image_1d_array)
|
||||
sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image_1d_array);
|
||||
|
||||
if (sctx->blitter)
|
||||
util_blitter_destroy(sctx->blitter);
|
||||
|
|
|
@ -800,6 +800,8 @@ struct si_context {
|
|||
void *vs_blit_texcoord;
|
||||
void *cs_clear_buffer;
|
||||
void *cs_copy_buffer;
|
||||
void *cs_copy_image;
|
||||
void *cs_copy_image_1d_array;
|
||||
struct si_screen *screen;
|
||||
struct pipe_debug_callback debug;
|
||||
struct ac_llvm_compiler compiler; /* only non-threaded compilation */
|
||||
|
@ -1170,6 +1172,13 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
|
|||
void si_copy_buffer(struct si_context *sctx,
|
||||
struct pipe_resource *dst, struct pipe_resource *src,
|
||||
uint64_t dst_offset, uint64_t src_offset, unsigned size);
|
||||
void si_compute_copy_image(struct si_context *sctx,
|
||||
struct pipe_resource *dst,
|
||||
unsigned dst_level,
|
||||
struct pipe_resource *src,
|
||||
unsigned src_level,
|
||||
unsigned dstx, unsigned dsty, unsigned dstz,
|
||||
const struct pipe_box *src_box);
|
||||
void si_init_compute_blit_functions(struct si_context *sctx);
|
||||
|
||||
/* si_cp_dma.c */
|
||||
|
@ -1283,6 +1292,8 @@ void *si_create_fixed_func_tcs(struct si_context *sctx);
|
|||
void *si_create_dma_compute_shader(struct pipe_context *ctx,
|
||||
unsigned num_dwords_per_thread,
|
||||
bool dst_stream_cache_policy, bool is_copy);
|
||||
void *si_create_copy_image_compute_shader(struct pipe_context *ctx);
|
||||
void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx);
|
||||
void *si_create_query_result_cs(struct si_context *sctx);
|
||||
|
||||
/* si_test_dma.c */
|
||||
|
|
|
@ -439,3 +439,80 @@ void *si_create_query_result_cs(struct si_context *sctx)
|
|||
|
||||
return sctx->b.create_compute_state(&sctx->b, &state);
|
||||
}
|
||||
|
||||
/* Create a compute shader implementing copy_image.
|
||||
* Luckily, this works with all texture targets except 1D_ARRAY.
|
||||
*/
|
||||
void *si_create_copy_image_compute_shader(struct pipe_context *ctx)
|
||||
{
|
||||
static const char text[] =
|
||||
"COMP\n"
|
||||
"PROPERTY CS_FIXED_BLOCK_WIDTH 8\n"
|
||||
"PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n"
|
||||
"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
|
||||
"DCL SV[0], THREAD_ID\n"
|
||||
"DCL SV[1], BLOCK_ID\n"
|
||||
"DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
|
||||
"DCL IMAGE[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
|
||||
"DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
|
||||
"DCL TEMP[0..4], LOCAL\n"
|
||||
"IMM[0] UINT32 {8, 1, 0, 0}\n"
|
||||
"MOV TEMP[0].xyz, CONST[0][0].xyzw\n"
|
||||
"UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n"
|
||||
"UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n"
|
||||
"LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
|
||||
"MOV TEMP[4].xyz, CONST[0][1].xyzw\n"
|
||||
"UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[4].xyzx\n"
|
||||
"STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
|
||||
"END\n";
|
||||
|
||||
struct tgsi_token tokens[1024];
|
||||
struct pipe_compute_state state = {0};
|
||||
|
||||
if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
|
||||
assert(false);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
state.ir_type = PIPE_SHADER_IR_TGSI;
|
||||
state.prog = tokens;
|
||||
|
||||
return ctx->create_compute_state(ctx, &state);
|
||||
}
|
||||
|
||||
void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx)
|
||||
{
|
||||
static const char text[] =
|
||||
"COMP\n"
|
||||
"PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
|
||||
"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
|
||||
"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
|
||||
"DCL SV[0], THREAD_ID\n"
|
||||
"DCL SV[1], BLOCK_ID\n"
|
||||
"DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
|
||||
"DCL IMAGE[1], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
|
||||
"DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
|
||||
"DCL TEMP[0..4], LOCAL\n"
|
||||
"IMM[0] UINT32 {64, 1, 0, 0}\n"
|
||||
"MOV TEMP[0].xy, CONST[0][0].xzzw\n"
|
||||
"UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
|
||||
"UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n"
|
||||
"LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
|
||||
"MOV TEMP[4].xy, CONST[0][1].xzzw\n"
|
||||
"UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[4].xyzx\n"
|
||||
"STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
|
||||
"END\n";
|
||||
|
||||
struct tgsi_token tokens[1024];
|
||||
struct pipe_compute_state state = {0};
|
||||
|
||||
if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
|
||||
assert(false);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
state.ir_type = PIPE_SHADER_IR_TGSI;
|
||||
state.prog = tokens;
|
||||
|
||||
return ctx->create_compute_state(ctx, &state);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue