radeonsi/gfx10: fix overflow and primitive queries

This aligns the offsets to match the memory layout of the query buffer
defined by gfx10_sh_query_buffer_mem and calls si_launch_grid_internal
to flush caches and wait for completion of shaders prior to retrieving
results.
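
For reference, the per-query slot layout these offsets follow looks roughly like the sketch below. It is reconstructed from the offsets used in the patch; field names are illustrative, not the verbatim Mesa definition (see gfx10_sh_query_buffer_mem in gfx10_query.c for the authoritative one).

    #include <stdint.h>

    /* Sketch of one query slot, inferred from the offsets used in this patch;
     * field names are illustrative, not the verbatim Mesa definition. */
    struct query_slot_sketch {
       struct {
          uint64_t start[2];   /* first two u64s per stream; not read by the result path */
          uint64_t generated;  /* at 2 * sizeof(uint64_t) within the stream entry */
          uint64_t emitted;    /* at 3 * sizeof(uint64_t) within the stream entry */
       } stream[4];            /* 32 bytes per stream, selected by 4 * sizeof(uint64_t) * stream */
       uint32_t fence;         /* at byte 128 == sizeof(stream), polled before results are read */
       uint32_t pad[31];       /* slot stride: 256 bytes */
    };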

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7181>
Author:    Indrajit Kumar Das
Date:      2020-10-16 10:09:02 +05:30
Committed: Marge Bot
Parent:    fd4016f978
Commit:    5d14562da8
4 changed files with 28 additions and 23 deletions


@@ -360,11 +360,11 @@ static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct s
if (index >= 0) {
switch (query->b.type) {
case PIPE_QUERY_PRIMITIVES_GENERATED:
-consts.offset = sizeof(uint32_t) * query->stream;
+consts.offset = 4 * sizeof(uint64_t) * query->stream + 2 * sizeof(uint64_t);
consts.config = 0;
break;
case PIPE_QUERY_PRIMITIVES_EMITTED:
-consts.offset = sizeof(uint32_t) * (4 + query->stream);
+consts.offset = 4 * sizeof(uint64_t) * query->stream + 3 * sizeof(uint64_t);
consts.config = 0;
break;
case PIPE_QUERY_SO_STATISTICS:
@@ -372,7 +372,7 @@ static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct s
consts.config = 0;
break;
case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-consts.offset = sizeof(uint32_t) * query->stream;
+consts.offset = 4 * sizeof(uint64_t) * query->stream;
consts.config = 2;
break;
case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
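
As a quick sanity check of the new expressions (illustrative arithmetic only, not code from the patch): each stream entry holds four 64-bit counters, so for stream 1 the byte offsets work out as below.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
       unsigned stream = 1; /* example stream index */
       /* PIPE_QUERY_PRIMITIVES_GENERATED: stream base plus two u64 counters */
       printf("generated: %zu\n", 4 * sizeof(uint64_t) * stream + 2 * sizeof(uint64_t)); /* 48 */
       /* PIPE_QUERY_PRIMITIVES_EMITTED: one u64 further */
       printf("emitted:   %zu\n", 4 * sizeof(uint64_t) * stream + 3 * sizeof(uint64_t)); /* 56 */
       /* PIPE_QUERY_SO_OVERFLOW_PREDICATE: start of the whole stream entry */
       printf("overflow:  %zu\n", 4 * sizeof(uint64_t) * stream);                        /* 32 */
       return 0;
    }
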
@@ -454,8 +454,9 @@ static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct s
si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
}
-sctx->b.launch_grid(&sctx->b, &grid);
-sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+void *saved_cs = sctx->cs_shader_state.program;
+si_launch_grid_internal((struct si_context *)&sctx->b, &grid, saved_cs,
+   SI_CS_WAIT_FOR_IDLE | SI_CS_PARTIAL_FLUSH_DISABLE);
if (qbuf == query->last)
break;


@@ -60,15 +60,13 @@ unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
}
}
-#define SI_CS_IMAGE_OP (1 << 0)
-#define SI_CS_WAIT_FOR_IDLE (1 << 1)
-#define SI_CS_RENDER_COND_ENABLE (1 << 2)
-static void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info,
+void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info,
void *restore_cs, unsigned flags)
{
/* Wait for previous shaders to finish. */
-sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH;
+sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+if (!(flags & SI_CS_PARTIAL_FLUSH_DISABLE))
+   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
/* Invalidate L0-L1 caches. */
/* sL0 is never invalidated, because src resources don't use it. */
sctx->flags |= SI_CONTEXT_INV_VCACHE;
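
In other words, the helper still issues the PS partial flush and cache invalidation unconditionally, but callers can now skip the CS partial flush before the dispatch. A minimal sketch of how the query path above combines the flags (flag values copied from the si_pipe.h hunk below; the rationale is an interpretation, not quoted driver code):

    /* Flag values as declared in the si_pipe.h hunk below. */
    #define SI_CS_IMAGE_OP              (1 << 0)
    #define SI_CS_WAIT_FOR_IDLE         (1 << 1)
    #define SI_CS_RENDER_COND_ENABLE    (1 << 2)
    #define SI_CS_PARTIAL_FLUSH_DISABLE (1 << 3)

    /* The query result path asks the helper to wait for the dispatched shader
     * and flush caches before results are read back, but skips the pre-dispatch
     * CS partial flush, since it already waited on the query fence via
     * si_cp_wait_mem() (interpretation, not quoted driver code). */
    unsigned query_flags = SI_CS_WAIT_FOR_IDLE | SI_CS_PARTIAL_FLUSH_DISABLE;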


@@ -1339,8 +1339,15 @@ bool vi_dcc_clear_level(struct si_context *sctx, struct si_texture *tex, unsigne
void si_init_clear_functions(struct si_context *sctx);
/* si_compute_blit.c */
+#define SI_CS_IMAGE_OP (1 << 0)
+#define SI_CS_WAIT_FOR_IDLE (1 << 1)
+#define SI_CS_RENDER_COND_ENABLE (1 << 2)
+#define SI_CS_PARTIAL_FLUSH_DISABLE (1 << 3)
unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
enum si_cache_policy cache_policy);
+void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info,
+   void *restore_cs, unsigned flags);
void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
uint64_t size, uint32_t *clear_value, uint32_t clear_value_size,
enum si_coherency coher, bool force_cpdma);


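The immediates in the updated compute shader below double as layout constants for the 256-byte query slot; the values used by the loads and address increments map to byte offsets roughly as follows (an annotated reading of the TGSI, with hypothetical names):

    /* Hypothetical names for the immediates used by the shader below. */
    enum {
       GENERATED_OFFSET  = 16,  /* IMM[2].x: 2 * sizeof(uint64_t) into a stream entry */
       STREAM_STRIDE     = 32,  /* IMM[2].y: sizeof(stream[0]), added per stream iteration */
       FENCE_OFFSET      = 128, /* IMM[2].w: sizeof(stream), where the fence is read */
       QUERY_SLOT_STRIDE = 256, /* IMM[0].z: sizeof(gfx10_sh_query_buffer_mem), per-slot step */
    };
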
@@ -816,7 +816,7 @@ void *gfx10_create_sh_query_result_cs(struct si_context *sctx)
"DCL BUFFER[2]\n"
"DCL CONST[0][0..0]\n"
"DCL TEMP[0..5]\n"
-"IMM[0] UINT32 {0, 7, 0, 4294967295}\n"
+"IMM[0] UINT32 {0, 7, 256, 4294967295}\n"
"IMM[1] UINT32 {1, 2, 4, 8}\n"
"IMM[2] UINT32 {16, 32, 64, 128}\n"
@@ -855,13 +855,13 @@ void *gfx10_create_sh_query_result_cs(struct si_context *sctx)
"UADD TEMP[1].x, TEMP[1].xxxx, IMM[0].wwww\n"
/*
-fence = buffer[0]@(base_offset + 32);
+fence = buffer[0]@(base_offset + sizeof(gfx10_sh_query_buffer_mem.stream));
if (!fence) {
acc_missing = ~0u;
break;
}
*/
-"UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].yyyy\n"
+"UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].wwww\n"
"LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
"USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
"UIF TEMP[5]\n"
@@ -897,22 +897,21 @@ void *gfx10_create_sh_query_result_cs(struct si_context *sctx)
/*
do {
-generated = buffer[0]@stream_offset;
-emitted = buffer[0]@(stream_offset + 16);
+generated = buffer[0]@(stream_offset + 2 * sizeof(uint64_t));
+emitted = buffer[0]@(stream_offset + 3 * sizeof(uint64_t));
if (generated != emitted) {
acc_result = 1;
result_remaining = 0;
break;
}
-stream_offset += 4;
+stream_offset += sizeof(gfx10_sh_query_buffer_mem.stream[0]);
} while (--count);
*/
"BGNLOOP\n"
"UADD TEMP[5].x, TEMP[2].xxxx, IMM[2].xxxx\n"
-"LOAD TEMP[4].x, BUFFER[0], TEMP[2].xxxx\n"
-"LOAD TEMP[4].y, BUFFER[0], TEMP[5].xxxx\n"
-"USNE TEMP[5], TEMP[4].xxxx, TEMP[4].yyyy\n"
+"LOAD TEMP[4].xyzw, BUFFER[0], TEMP[5].xxxx\n"
+"USNE TEMP[5], TEMP[4].xyxy, TEMP[4].zwzw\n"
"UIF TEMP[5]\n"
"MOV TEMP[0].x, IMM[1].xxxx\n"
"MOV TEMP[1].y, IMM[0].xxxx\n"
@@ -924,15 +923,15 @@ void *gfx10_create_sh_query_result_cs(struct si_context *sctx)
"UIF TEMP[5]\n"
"BRK\n"
"ENDIF\n"
-"UADD TEMP[2].x, TEMP[2].xxxx, IMM[1].zzzz\n"
+"UADD TEMP[2].x, TEMP[2].xxxx, IMM[2].yyyy\n"
"ENDLOOP\n"
"ENDIF\n"
/*
-base_offset += 64;
+base_offset += sizeof(gfx10_sh_query_buffer_mem);
} // end outer loop
*/
-"UADD TEMP[1].y, TEMP[1].yyyy, IMM[2].zzzz\n"
+"UADD TEMP[1].y, TEMP[1].yyyy, IMM[0].zzzz\n"
"ENDLOOP\n"
/*