iris: implement scratch space!

we borrow the approach from anv rather than i965, as it works better
with pre-baked state that needs to contain scratch BO addresses

fixes a bunch of varying packing tests
This commit is contained in:
Kenneth Graunke 2018-11-07 22:05:14 -08:00
parent 9511b89ef9
commit 4d219b0eb3
4 changed files with 111 additions and 32 deletions

View File

@ -328,7 +328,7 @@ struct iris_vtable {
uint64_t imm); uint64_t imm);
unsigned (*derived_program_state_size)(enum iris_program_cache_id id); unsigned (*derived_program_state_size)(enum iris_program_cache_id id);
void (*store_derived_program_state)(const struct gen_device_info *devinfo, void (*store_derived_program_state)(struct iris_context *ice,
enum iris_program_cache_id cache_id, enum iris_program_cache_id cache_id,
struct iris_compiled_shader *shader); struct iris_compiled_shader *shader);
uint32_t *(*create_so_decl_list)(const struct pipe_stream_output_info *sol, uint32_t *(*create_so_decl_list)(const struct pipe_stream_output_info *sol,
@ -394,6 +394,14 @@ struct iris_context {
struct hash_table *cache; struct hash_table *cache;
unsigned urb_size; unsigned urb_size;
/**
* Scratch buffers for various sizes and stages.
*
* Indexed by the "Per-Thread Scratch Space" field's 4-bit encoding,
* and shader stage.
*/
struct iris_bo *scratch_bos[1 << 4][MESA_SHADER_STAGES];
} shaders; } shaders;
struct { struct {
@ -552,7 +560,9 @@ const struct shader_info *iris_get_shader_info(const struct iris_context *ice,
gl_shader_stage stage); gl_shader_stage stage);
unsigned iris_get_shader_num_ubos(const struct iris_context *ice, unsigned iris_get_shader_num_ubos(const struct iris_context *ice,
gl_shader_stage stage); gl_shader_stage stage);
/**
 * Get (allocating on first use) the scratch buffer for the given per-thread
 * scratch size and shader stage, and return its 32-bit address for use as a
 * "Scratch Space Base Pointer" value.
 */
uint32_t iris_get_scratch_space(struct iris_context *ice,
unsigned per_thread_scratch,
gl_shader_stage stage);
/* iris_program_cache.c */ /* iris_program_cache.c */

View File

@ -1072,6 +1072,56 @@ iris_fill_cs_push_const_buffer(struct brw_cs_prog_data *cs_prog_data,
dst[8 * t] = t; dst[8 * t] = t;
} }
/**
 * Allocate scratch BOs as needed for the given per-thread size and stage.
 *
 * BOs are cached in ice->shaders.scratch_bos, indexed by the encoded
 * per-thread size and the shader stage, so each (size, stage) pair is
 * allocated at most once per context.
 *
 * \param ice                 context owning the scratch BO cache
 * \param per_thread_scratch  per-thread scratch size in bytes; must be
 *                            non-zero (callers only call this when the
 *                            shader's total_scratch is non-zero)
 * \param stage               shader stage the scratch space is for
 *
 * Returns the 32-bit "Scratch Space Base Pointer" value.
 */
uint32_t
iris_get_scratch_space(struct iris_context *ice,
                       unsigned per_thread_scratch,
                       gl_shader_stage stage)
{
   struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen;
   struct iris_bufmgr *bufmgr = screen->bufmgr;
   const struct gen_device_info *devinfo = &screen->devinfo;

   /* Same encoding that callers program into "Per-Thread Scratch Space". */
   unsigned encoded_size = ffs(per_thread_scratch) - 11;

   /* scratch_bos has 1 << 4 rows (the field is a 4-bit encoding), so the
    * index must be checked against that, not a larger bound.  This also
    * catches per_thread_scratch == 0 (or anything below 1KB), where the
    * subtraction above wraps around to a huge unsigned value.
    */
   assert(encoded_size < (1 << 4));

   struct iris_bo **bop = &ice->shaders.scratch_bos[encoded_size][stage];

   /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
    *
    *    "Scratch Space per slice is computed based on 4 sub-slices.  SW must
    *     allocate scratch space enough so that each slice has 4 slices
    *     allowed."
    *
    * According to the other driver team, this applies to compute shaders
    * as well.  This is not currently documented at all.
    */
   unsigned subslice_total = 4 * devinfo->num_slices;
   assert(subslice_total >= screen->subslice_total);

   if (!*bop) {
      unsigned scratch_ids_per_subslice = devinfo->max_cs_threads;

      /* Worst-case number of simultaneously running threads per stage. */
      uint32_t max_threads[] = {
         [MESA_SHADER_VERTEX]    = devinfo->max_vs_threads,
         [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
         [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
         [MESA_SHADER_GEOMETRY]  = devinfo->max_gs_threads,
         [MESA_SHADER_FRAGMENT]  = devinfo->max_wm_threads,
         [MESA_SHADER_COMPUTE]   = scratch_ids_per_subslice * subslice_total,
      };

      uint32_t size = per_thread_scratch * max_threads[stage];

      /* NOTE(review): the result of iris_bo_alloc() is dereferenced below
       * without a NULL check -- confirm whether allocation failure needs
       * to be handled here.
       */
      *bop = iris_bo_alloc(bufmgr, "scratch", size, IRIS_MEMZONE_SHADER);
   }

   return (*bop)->gtt_offset;
}
void void
iris_init_program_functions(struct pipe_context *ctx) iris_init_program_functions(struct pipe_context *ctx)
{ {

View File

@ -241,8 +241,6 @@ iris_upload_shader(struct iris_context *ice,
struct brw_stage_prog_data *prog_data, struct brw_stage_prog_data *prog_data,
uint32_t *streamout) uint32_t *streamout)
{ {
struct iris_screen *screen = (void *) ice->ctx.screen;
struct gen_device_info *devinfo = &screen->devinfo;
struct hash_table *cache = ice->shaders.cache; struct hash_table *cache = ice->shaders.cache;
struct iris_compiled_shader *shader = struct iris_compiled_shader *shader =
rzalloc_size(cache, sizeof(struct iris_compiled_shader) + rzalloc_size(cache, sizeof(struct iris_compiled_shader) +
@ -277,7 +275,7 @@ iris_upload_shader(struct iris_context *ice,
ralloc_steal(shader, shader->streamout); ralloc_steal(shader, shader->streamout);
/* Store the 3DSTATE shader packets and other derived state. */ /* Store the 3DSTATE shader packets and other derived state. */
ice->vtbl.store_derived_program_state(devinfo, cache_id, shader); ice->vtbl.store_derived_program_state(ice, cache_id, shader);
struct keybox *keybox = make_keybox(cache, cache_id, key, key_size); struct keybox *keybox = make_keybox(cache, cache_id, key, key_size);
_mesa_hash_table_insert(ice->shaders.cache, keybox, shader); _mesa_hash_table_insert(ice->shaders.cache, keybox, shader);

View File

@ -2981,8 +2981,6 @@ iris_populate_cs_key(const struct iris_context *ice,
// XXX: these need to go in INIT_THREAD_DISPATCH_FIELDS // XXX: these need to go in INIT_THREAD_DISPATCH_FIELDS
pkt.SamplerCount = \ pkt.SamplerCount = \
DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4); \ DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4); \
pkt.PerThreadScratchSpace = prog_data->total_scratch == 0 ? 0 : \
ffs(stage_state->per_thread_scratch) - 11; \
#endif #endif
@ -2997,7 +2995,7 @@ KSP(const struct iris_compiled_shader *shader)
// prefetching of binding tables in A0 and B0 steppings. XXX: Revisit // prefetching of binding tables in A0 and B0 steppings. XXX: Revisit
// this WA on C0 stepping. // this WA on C0 stepping.
#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \ #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage) \
pkt.KernelStartPointer = KSP(shader); \ pkt.KernelStartPointer = KSP(shader); \
pkt.BindingTableEntryCount = GEN_GEN == 11 ? 0 : \ pkt.BindingTableEntryCount = GEN_GEN == 11 ? 0 : \
prog_data->binding_table.size_bytes / 4; \ prog_data->binding_table.size_bytes / 4; \
@ -3009,20 +3007,28 @@ KSP(const struct iris_compiled_shader *shader)
pkt.prefix##URBEntryReadOffset = 0; \ pkt.prefix##URBEntryReadOffset = 0; \
\ \
pkt.StatisticsEnable = true; \ pkt.StatisticsEnable = true; \
pkt.Enable = true; pkt.Enable = true; \
\
if (prog_data->total_scratch) { \
uint32_t scratch_addr = \
iris_get_scratch_space(ice, prog_data->total_scratch, stage); \
pkt.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11; \
pkt.ScratchSpaceBasePointer = rw_bo(NULL, scratch_addr); \
}
/** /**
* Encode most of 3DSTATE_VS based on the compiled shader. * Encode most of 3DSTATE_VS based on the compiled shader.
*/ */
static void static void
iris_store_vs_state(const struct gen_device_info *devinfo, iris_store_vs_state(struct iris_context *ice,
const struct gen_device_info *devinfo,
struct iris_compiled_shader *shader) struct iris_compiled_shader *shader)
{ {
struct brw_stage_prog_data *prog_data = shader->prog_data; struct brw_stage_prog_data *prog_data = shader->prog_data;
struct brw_vue_prog_data *vue_prog_data = (void *) prog_data; struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
iris_pack_command(GENX(3DSTATE_VS), shader->derived_data, vs) { iris_pack_command(GENX(3DSTATE_VS), shader->derived_data, vs) {
INIT_THREAD_DISPATCH_FIELDS(vs, Vertex); INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);
vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1; vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
vs.SIMD8DispatchEnable = true; vs.SIMD8DispatchEnable = true;
vs.UserClipDistanceCullTestEnableBitmask = vs.UserClipDistanceCullTestEnableBitmask =
@ -3034,7 +3040,8 @@ iris_store_vs_state(const struct gen_device_info *devinfo,
* Encode most of 3DSTATE_HS based on the compiled shader. * Encode most of 3DSTATE_HS based on the compiled shader.
*/ */
static void static void
iris_store_tcs_state(const struct gen_device_info *devinfo, iris_store_tcs_state(struct iris_context *ice,
const struct gen_device_info *devinfo,
struct iris_compiled_shader *shader) struct iris_compiled_shader *shader)
{ {
struct brw_stage_prog_data *prog_data = shader->prog_data; struct brw_stage_prog_data *prog_data = shader->prog_data;
@ -3042,7 +3049,7 @@ iris_store_tcs_state(const struct gen_device_info *devinfo,
struct brw_tcs_prog_data *tcs_prog_data = (void *) prog_data; struct brw_tcs_prog_data *tcs_prog_data = (void *) prog_data;
iris_pack_command(GENX(3DSTATE_HS), shader->derived_data, hs) { iris_pack_command(GENX(3DSTATE_HS), shader->derived_data, hs) {
INIT_THREAD_DISPATCH_FIELDS(hs, Vertex); INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);
hs.InstanceCount = tcs_prog_data->instances - 1; hs.InstanceCount = tcs_prog_data->instances - 1;
hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1; hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
@ -3054,7 +3061,8 @@ iris_store_tcs_state(const struct gen_device_info *devinfo,
* Encode 3DSTATE_TE and most of 3DSTATE_DS based on the compiled shader. * Encode 3DSTATE_TE and most of 3DSTATE_DS based on the compiled shader.
*/ */
static void static void
iris_store_tes_state(const struct gen_device_info *devinfo, iris_store_tes_state(struct iris_context *ice,
const struct gen_device_info *devinfo,
struct iris_compiled_shader *shader) struct iris_compiled_shader *shader)
{ {
struct brw_stage_prog_data *prog_data = shader->prog_data; struct brw_stage_prog_data *prog_data = shader->prog_data;
@ -3074,7 +3082,7 @@ iris_store_tes_state(const struct gen_device_info *devinfo,
} }
iris_pack_command(GENX(3DSTATE_DS), ds_state, ds) { iris_pack_command(GENX(3DSTATE_DS), ds_state, ds) {
INIT_THREAD_DISPATCH_FIELDS(ds, Patch); INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);
ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH; ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1; ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
@ -3091,7 +3099,8 @@ iris_store_tes_state(const struct gen_device_info *devinfo,
* Encode most of 3DSTATE_GS based on the compiled shader. * Encode most of 3DSTATE_GS based on the compiled shader.
*/ */
static void static void
iris_store_gs_state(const struct gen_device_info *devinfo, iris_store_gs_state(struct iris_context *ice,
const struct gen_device_info *devinfo,
struct iris_compiled_shader *shader) struct iris_compiled_shader *shader)
{ {
struct brw_stage_prog_data *prog_data = shader->prog_data; struct brw_stage_prog_data *prog_data = shader->prog_data;
@ -3099,7 +3108,7 @@ iris_store_gs_state(const struct gen_device_info *devinfo,
struct brw_gs_prog_data *gs_prog_data = (void *) prog_data; struct brw_gs_prog_data *gs_prog_data = (void *) prog_data;
iris_pack_command(GENX(3DSTATE_GS), shader->derived_data, gs) { iris_pack_command(GENX(3DSTATE_GS), shader->derived_data, gs) {
INIT_THREAD_DISPATCH_FIELDS(gs, Vertex); INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);
gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1; gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
gs.OutputTopology = gs_prog_data->output_topology; gs.OutputTopology = gs_prog_data->output_topology;
@ -3138,7 +3147,8 @@ iris_store_gs_state(const struct gen_device_info *devinfo,
* Encode most of 3DSTATE_PS and 3DSTATE_PS_EXTRA based on the shader. * Encode most of 3DSTATE_PS and 3DSTATE_PS_EXTRA based on the shader.
*/ */
static void static void
iris_store_fs_state(const struct gen_device_info *devinfo, iris_store_fs_state(struct iris_context *ice,
const struct gen_device_info *devinfo,
struct iris_compiled_shader *shader) struct iris_compiled_shader *shader)
{ {
struct brw_stage_prog_data *prog_data = shader->prog_data; struct brw_stage_prog_data *prog_data = shader->prog_data;
@ -3193,6 +3203,14 @@ iris_store_fs_state(const struct gen_device_info *devinfo,
KSP(shader) + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1); KSP(shader) + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
ps.KernelStartPointer2 = ps.KernelStartPointer2 =
KSP(shader) + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2); KSP(shader) + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
if (prog_data->total_scratch) {
uint32_t scratch_addr =
iris_get_scratch_space(ice, prog_data->total_scratch,
MESA_SHADER_FRAGMENT);
ps.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;
ps.ScratchSpaceBasePointer = rw_bo(NULL, scratch_addr);
}
} }
iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) { iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) {
@ -3226,7 +3244,8 @@ iris_store_fs_state(const struct gen_device_info *devinfo,
* This must match the data written by the iris_store_xs_state() functions. * This must match the data written by the iris_store_xs_state() functions.
*/ */
static void static void
iris_store_cs_state(const struct gen_device_info *devinfo, iris_store_cs_state(struct iris_context *ice,
const struct gen_device_info *devinfo,
struct iris_compiled_shader *shader) struct iris_compiled_shader *shader)
{ {
struct brw_stage_prog_data *prog_data = shader->prog_data; struct brw_stage_prog_data *prog_data = shader->prog_data;
@ -3271,28 +3290,31 @@ iris_derived_program_state_size(enum iris_program_cache_id cache_id)
* get most of the state packet without having to reconstruct it. * get most of the state packet without having to reconstruct it.
*/ */
static void static void
iris_store_derived_program_state(const struct gen_device_info *devinfo, iris_store_derived_program_state(struct iris_context *ice,
enum iris_program_cache_id cache_id, enum iris_program_cache_id cache_id,
struct iris_compiled_shader *shader) struct iris_compiled_shader *shader)
{ {
struct iris_screen *screen = (void *) ice->ctx.screen;
const struct gen_device_info *devinfo = &screen->devinfo;
switch (cache_id) { switch (cache_id) {
case IRIS_CACHE_VS: case IRIS_CACHE_VS:
iris_store_vs_state(devinfo, shader); iris_store_vs_state(ice, devinfo, shader);
break; break;
case IRIS_CACHE_TCS: case IRIS_CACHE_TCS:
iris_store_tcs_state(devinfo, shader); iris_store_tcs_state(ice, devinfo, shader);
break; break;
case IRIS_CACHE_TES: case IRIS_CACHE_TES:
iris_store_tes_state(devinfo, shader); iris_store_tes_state(ice, devinfo, shader);
break; break;
case IRIS_CACHE_GS: case IRIS_CACHE_GS:
iris_store_gs_state(devinfo, shader); iris_store_gs_state(ice, devinfo, shader);
break; break;
case IRIS_CACHE_FS: case IRIS_CACHE_FS:
iris_store_fs_state(devinfo, shader); iris_store_fs_state(ice, devinfo, shader);
break; break;
case IRIS_CACHE_CS: case IRIS_CACHE_CS:
iris_store_cs_state(devinfo, shader); iris_store_cs_state(ice, devinfo, shader);
case IRIS_CACHE_BLORP: case IRIS_CACHE_BLORP:
break; break;
default: default:
@ -4401,12 +4423,11 @@ iris_upload_compute_state(struct iris_context *ice,
iris_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) { iris_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
if (prog_data->total_scratch) { if (prog_data->total_scratch) {
/* Per Thread Scratch Space is in the range [0, 11] where uint32_t scratch_addr =
* 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M. iris_get_scratch_space(ice, prog_data->total_scratch,
*/ MESA_SHADER_COMPUTE);
// XXX: vfe.ScratchSpaceBasePointer vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;
//vfe.PerThreadScratchSpace = vfe.ScratchSpaceBasePointer = rw_bo(NULL, scratch_addr);
//ffs(stage_state->per_thread_scratch) - 11;
} }
vfe.MaximumNumberofThreads = vfe.MaximumNumberofThreads =