freedreno/a5xx: indirect grids

Signed-off-by: Rob Clark <robdclark@gmail.com>
This commit is contained in:
Rob Clark 2017-11-10 12:53:13 -05:00
parent 471aa1b6d0
commit e7b2719f69
3 changed files with 86 additions and 20 deletions

View File

@ -26,6 +26,8 @@
#include "pipe/p_state.h"
#include "freedreno_resource.h"
#include "fd5_compute.h"
#include "fd5_context.h"
#include "fd5_emit.h"
@ -110,9 +112,9 @@ cs_program_emit(struct fd_ringbuffer *ring, struct ir3_shader_variant *v)
OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CNTL_0, 2);
OUT_RING(ring, A5XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
A5XX_HLSQ_CS_CNTL_0_UNK0(regid(63, 0)) |
A5XX_HLSQ_CS_CNTL_0_UNK1(regid(63, 0)) |
A5XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
A5XX_HLSQ_CS_CNTL_0_UNK0(regid(63, 0)) |
A5XX_HLSQ_CS_CNTL_0_UNK1(regid(63, 0)) |
A5XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
OUT_RING(ring, 0x1); /* HLSQ_CS_CNTL_1 */
fd5_emit_shader(ring, v);
@ -126,9 +128,6 @@ fd5_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info)
struct ir3_shader_variant *v;
struct fd_ringbuffer *ring = ctx->batch->draw;
if (info->indirect)
return; // TODO
v = ir3_shader_variant(so->shader, key, &ctx->debug);
if (ctx->dirty_shader[PIPE_SHADER_COMPUTE] & FD_DIRTY_SHADER_PROG)
@ -158,11 +157,29 @@ fd5_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info)
OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Y */
OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Z */
OUT_PKT7(ring, CP_EXEC_CS, 4);
OUT_RING(ring, 0x00000000);
OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(info->grid[0]));
OUT_RING(ring, CP_EXEC_CS_2_NGROUPS_Y(info->grid[1]));
OUT_RING(ring, CP_EXEC_CS_3_NGROUPS_Z(info->grid[2]));
if (info->indirect) {
struct fd_resource *rsc = fd_resource(info->indirect);
OUT_PKT7(ring, CP_EVENT_WRITE, 4);
OUT_RING(ring, CACHE_FLUSH_TS);
OUT_RELOCW(ring, fd5_context(ctx)->blit_mem, 0, 0, 0); /* ADDR_LO/HI */
OUT_RING(ring, 0x00000000);
OUT_WFI5(ring);
OUT_PKT7(ring, CP_EXEC_CS_INDIRECT, 4);
OUT_RING(ring, 0x00000000);
OUT_RELOC(ring, rsc->bo, info->indirect_offset, 0, 0); /* ADDR_LO/HI */
OUT_RING(ring, CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
} else {
OUT_PKT7(ring, CP_EXEC_CS, 4);
OUT_RING(ring, 0x00000000);
OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(info->grid[0]));
OUT_RING(ring, CP_EXEC_CS_2_NGROUPS_Y(info->grid[1]));
OUT_RING(ring, CP_EXEC_CS_3_NGROUPS_Z(info->grid[2]));
}
}
void

View File

@ -859,15 +859,59 @@ ir3_emit_cs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
/* emit compute-shader driver-params: */
uint32_t offset = v->constbase.driver_param;
if (v->constlen > offset) {
uint32_t compute_params[IR3_DP_CS_COUNT] = {
[IR3_DP_NUM_WORK_GROUPS_X] = info->grid[0],
[IR3_DP_NUM_WORK_GROUPS_Y] = info->grid[1],
[IR3_DP_NUM_WORK_GROUPS_Z] = info->grid[2],
/* do we need work-group-size? */
};
fd_wfi(ctx->batch, ring);
ctx->emit_const(ring, SHADER_COMPUTE, offset * 4, 0,
ARRAY_SIZE(compute_params), compute_params, NULL);
if (info->indirect) {
struct pipe_resource *indirect = NULL;
unsigned indirect_offset;
/* This is a bit awkward, but CP_LOAD_STATE.EXT_SRC_ADDR needs
* to be aligned more strongly than 4 bytes. So in this case
* we need a temporary buffer to copy NumWorkGroups.xyz to.
*
* TODO if previous compute job is writing to info->indirect,
* we might need a WFI.. but since we currently flush for each
* compute job, we are probably ok for now.
*/
if (info->indirect_offset & 0xf) {
indirect = pipe_buffer_create(&ctx->screen->base,
PIPE_BIND_COMMAND_ARGS_BUFFER, PIPE_USAGE_STREAM,
0x1000);
indirect_offset = 0;
if (is_a5xx(ctx->screen)) {
struct fd_bo *src = fd_resource(info->indirect)->bo;
struct fd_bo *dst = fd_resource(indirect)->bo;
for (unsigned i = 0; i < 3; i++) {
unsigned dst_off = i * 4;
unsigned src_off = (i * 4) + info->indirect_offset;
OUT_PKT7(ring, CP_MEM_TO_MEM, 5);
OUT_RING(ring, 0x00000000);
OUT_RELOCW(ring, dst, dst_off, 0, 0);
OUT_RELOC (ring, src, src_off, 0, 0);
}
} else {
assert(0);
}
} else {
pipe_resource_reference(&indirect, info->indirect);
indirect_offset = info->indirect_offset;
}
ctx->emit_const(ring, SHADER_COMPUTE, offset * 4,
indirect_offset, 4, NULL, indirect);
pipe_resource_reference(&indirect, NULL);
} else {
uint32_t compute_params[IR3_DP_CS_COUNT] = {
[IR3_DP_NUM_WORK_GROUPS_X] = info->grid[0],
[IR3_DP_NUM_WORK_GROUPS_Y] = info->grid[1],
[IR3_DP_NUM_WORK_GROUPS_Z] = info->grid[2],
/* do we need work-group-size? */
};
ctx->emit_const(ring, SHADER_COMPUTE, offset * 4, 0,
ARRAY_SIZE(compute_params), compute_params, NULL);
}
}
}

View File

@ -44,6 +44,11 @@ enum ir3_driver_param {
IR3_DP_NUM_WORK_GROUPS_X = 0,
IR3_DP_NUM_WORK_GROUPS_Y = 1,
IR3_DP_NUM_WORK_GROUPS_Z = 2,
/* NOTE: gl_NumWorkGroups should be vec4 aligned because
* glDispatchComputeIndirect() needs to load these from
* the info->indirect buffer. Keep that in mind when/if
* adding any additional CS driver params.
*/
IR3_DP_CS_COUNT = 4, /* must be aligned to vec4 */
/* vertex shader driver params: */