iris: implement scratch space!

we borrow the approach from anv rather than i965, as it works better
with pre-baked state that needs to contain scratch BO addresses

fixes a bunch of varying packing tests
This commit is contained in:
Kenneth Graunke 2018-11-07 22:05:14 -08:00
parent 9511b89ef9
commit 4d219b0eb3
4 changed files with 111 additions and 32 deletions

View File

@ -328,7 +328,7 @@ struct iris_vtable {
uint64_t imm);
unsigned (*derived_program_state_size)(enum iris_program_cache_id id);
void (*store_derived_program_state)(const struct gen_device_info *devinfo,
void (*store_derived_program_state)(struct iris_context *ice,
enum iris_program_cache_id cache_id,
struct iris_compiled_shader *shader);
uint32_t *(*create_so_decl_list)(const struct pipe_stream_output_info *sol,
@ -394,6 +394,14 @@ struct iris_context {
struct hash_table *cache;
unsigned urb_size;
/**
* Scratch buffers for various sizes and stages.
*
* Indexed by the "Per-Thread Scratch Space" field's 4-bit encoding,
* and shader stage.
*/
struct iris_bo *scratch_bos[1 << 4][MESA_SHADER_STAGES];
} shaders;
struct {
@ -552,7 +560,9 @@ const struct shader_info *iris_get_shader_info(const struct iris_context *ice,
gl_shader_stage stage);
unsigned iris_get_shader_num_ubos(const struct iris_context *ice,
gl_shader_stage stage);
uint32_t iris_get_scratch_space(struct iris_context *ice,
unsigned per_thread_scratch,
gl_shader_stage stage);
/* iris_program_cache.c */

View File

@ -1072,6 +1072,56 @@ iris_fill_cs_push_const_buffer(struct brw_cs_prog_data *cs_prog_data,
dst[8 * t] = t;
}
/**
 * Allocate scratch BOs as needed for the given per-thread size and stage.
 *
 * One BO is cached per (encoded size, shader stage) pair in
 * ice->shaders.scratch_bos; it is allocated the first time that pair is
 * requested and reused on every later call.
 *
 * per_thread_scratch is converted to the "Per-Thread Scratch Space"
 * encoding with ffs(per_thread_scratch) - 11, so 1024 encodes as 0, 2048
 * as 1, and so on.  Callers are expected to pass a non-zero value of at
 * least 1024; anything smaller wraps the unsigned encoding around and is
 * caught by the bounds assertion below.
 *
 * Returns the 32-bit "Scratch Space Base Pointer" value.
 */
uint32_t
iris_get_scratch_space(struct iris_context *ice,
                       unsigned per_thread_scratch,
                       gl_shader_stage stage)
{
   struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen;
   struct iris_bufmgr *bufmgr = screen->bufmgr;
   const struct gen_device_info *devinfo = &screen->devinfo;

   unsigned encoded_size = ffs(per_thread_scratch) - 11;

   /* The encoded size indexes the first dimension of scratch_bos, so the
    * bound must be that dimension's element count.  Deriving it from the
    * array keeps the check correct if the array is ever resized.
    */
   assert(encoded_size < sizeof(ice->shaders.scratch_bos) /
                         sizeof(ice->shaders.scratch_bos[0]));

   struct iris_bo **bop = &ice->shaders.scratch_bos[encoded_size][stage];

   /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
    *
    *    "Scratch Space per slice is computed based on 4 sub-slices.  SW must
    *     allocate scratch space enough so that each slice has 4 slices
    *     allowed."
    *
    * According to the other driver team, this applies to compute shaders
    * as well.  This is not currently documented at all.
    */
   unsigned subslice_total = 4 * devinfo->num_slices;
   assert(subslice_total >= screen->subslice_total);

   if (!*bop) {
      unsigned scratch_ids_per_subslice = devinfo->max_cs_threads;

      /* Worst-case number of simultaneously running threads per stage. */
      uint32_t max_threads[] = {
         [MESA_SHADER_VERTEX]    = devinfo->max_vs_threads,
         [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
         [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
         [MESA_SHADER_GEOMETRY]  = devinfo->max_gs_threads,
         [MESA_SHADER_FRAGMENT]  = devinfo->max_wm_threads,
         [MESA_SHADER_COMPUTE]   = scratch_ids_per_subslice * subslice_total,
      };

      uint32_t size = per_thread_scratch * max_threads[stage];

      /* NOTE(review): the allocation result is dereferenced below without
       * a NULL check - confirm iris_bo_alloc() cannot fail here, or decide
       * how a failure should be reported to callers.
       */
      *bop = iris_bo_alloc(bufmgr, "scratch", size, IRIS_MEMZONE_SHADER);
   }

   /* NOTE(review): the offset is narrowed to the 32-bit return type;
    * presumably IRIS_MEMZONE_SHADER addresses always fit - confirm.
    */
   return (*bop)->gtt_offset;
}
void
iris_init_program_functions(struct pipe_context *ctx)
{

View File

@ -241,8 +241,6 @@ iris_upload_shader(struct iris_context *ice,
struct brw_stage_prog_data *prog_data,
uint32_t *streamout)
{
struct iris_screen *screen = (void *) ice->ctx.screen;
struct gen_device_info *devinfo = &screen->devinfo;
struct hash_table *cache = ice->shaders.cache;
struct iris_compiled_shader *shader =
rzalloc_size(cache, sizeof(struct iris_compiled_shader) +
@ -277,7 +275,7 @@ iris_upload_shader(struct iris_context *ice,
ralloc_steal(shader, shader->streamout);
/* Store the 3DSTATE shader packets and other derived state. */
ice->vtbl.store_derived_program_state(devinfo, cache_id, shader);
ice->vtbl.store_derived_program_state(ice, cache_id, shader);
struct keybox *keybox = make_keybox(cache, cache_id, key, key_size);
_mesa_hash_table_insert(ice->shaders.cache, keybox, shader);

View File

@ -2981,8 +2981,6 @@ iris_populate_cs_key(const struct iris_context *ice,
// XXX: these need to go in INIT_THREAD_DISPATCH_FIELDS
pkt.SamplerCount = \
DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4); \
pkt.PerThreadScratchSpace = prog_data->total_scratch == 0 ? 0 : \
ffs(stage_state->per_thread_scratch) - 11; \
#endif
@ -2997,7 +2995,7 @@ KSP(const struct iris_compiled_shader *shader)
// prefetching of binding tables in A0 and B0 steppings. XXX: Revisit
// this WA on C0 stepping.
#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage) \
pkt.KernelStartPointer = KSP(shader); \
pkt.BindingTableEntryCount = GEN_GEN == 11 ? 0 : \
prog_data->binding_table.size_bytes / 4; \
@ -3009,20 +3007,28 @@ KSP(const struct iris_compiled_shader *shader)
pkt.prefix##URBEntryReadOffset = 0; \
\
pkt.StatisticsEnable = true; \
pkt.Enable = true;
pkt.Enable = true; \
\
if (prog_data->total_scratch) { \
uint32_t scratch_addr = \
iris_get_scratch_space(ice, prog_data->total_scratch, stage); \
pkt.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11; \
pkt.ScratchSpaceBasePointer = rw_bo(NULL, scratch_addr); \
}
/**
* Encode most of 3DSTATE_VS based on the compiled shader.
*/
static void
iris_store_vs_state(const struct gen_device_info *devinfo,
iris_store_vs_state(struct iris_context *ice,
const struct gen_device_info *devinfo,
struct iris_compiled_shader *shader)
{
struct brw_stage_prog_data *prog_data = shader->prog_data;
struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
iris_pack_command(GENX(3DSTATE_VS), shader->derived_data, vs) {
INIT_THREAD_DISPATCH_FIELDS(vs, Vertex);
INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);
vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
vs.SIMD8DispatchEnable = true;
vs.UserClipDistanceCullTestEnableBitmask =
@ -3034,7 +3040,8 @@ iris_store_vs_state(const struct gen_device_info *devinfo,
* Encode most of 3DSTATE_HS based on the compiled shader.
*/
static void
iris_store_tcs_state(const struct gen_device_info *devinfo,
iris_store_tcs_state(struct iris_context *ice,
const struct gen_device_info *devinfo,
struct iris_compiled_shader *shader)
{
struct brw_stage_prog_data *prog_data = shader->prog_data;
@ -3042,7 +3049,7 @@ iris_store_tcs_state(const struct gen_device_info *devinfo,
struct brw_tcs_prog_data *tcs_prog_data = (void *) prog_data;
iris_pack_command(GENX(3DSTATE_HS), shader->derived_data, hs) {
INIT_THREAD_DISPATCH_FIELDS(hs, Vertex);
INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);
hs.InstanceCount = tcs_prog_data->instances - 1;
hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
@ -3054,7 +3061,8 @@ iris_store_tcs_state(const struct gen_device_info *devinfo,
* Encode 3DSTATE_TE and most of 3DSTATE_DS based on the compiled shader.
*/
static void
iris_store_tes_state(const struct gen_device_info *devinfo,
iris_store_tes_state(struct iris_context *ice,
const struct gen_device_info *devinfo,
struct iris_compiled_shader *shader)
{
struct brw_stage_prog_data *prog_data = shader->prog_data;
@ -3074,7 +3082,7 @@ iris_store_tes_state(const struct gen_device_info *devinfo,
}
iris_pack_command(GENX(3DSTATE_DS), ds_state, ds) {
INIT_THREAD_DISPATCH_FIELDS(ds, Patch);
INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);
ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
@ -3091,7 +3099,8 @@ iris_store_tes_state(const struct gen_device_info *devinfo,
* Encode most of 3DSTATE_GS based on the compiled shader.
*/
static void
iris_store_gs_state(const struct gen_device_info *devinfo,
iris_store_gs_state(struct iris_context *ice,
const struct gen_device_info *devinfo,
struct iris_compiled_shader *shader)
{
struct brw_stage_prog_data *prog_data = shader->prog_data;
@ -3099,7 +3108,7 @@ iris_store_gs_state(const struct gen_device_info *devinfo,
struct brw_gs_prog_data *gs_prog_data = (void *) prog_data;
iris_pack_command(GENX(3DSTATE_GS), shader->derived_data, gs) {
INIT_THREAD_DISPATCH_FIELDS(gs, Vertex);
INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);
gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
gs.OutputTopology = gs_prog_data->output_topology;
@ -3138,7 +3147,8 @@ iris_store_gs_state(const struct gen_device_info *devinfo,
* Encode most of 3DSTATE_PS and 3DSTATE_PS_EXTRA based on the shader.
*/
static void
iris_store_fs_state(const struct gen_device_info *devinfo,
iris_store_fs_state(struct iris_context *ice,
const struct gen_device_info *devinfo,
struct iris_compiled_shader *shader)
{
struct brw_stage_prog_data *prog_data = shader->prog_data;
@ -3193,6 +3203,14 @@ iris_store_fs_state(const struct gen_device_info *devinfo,
KSP(shader) + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
ps.KernelStartPointer2 =
KSP(shader) + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
if (prog_data->total_scratch) {
uint32_t scratch_addr =
iris_get_scratch_space(ice, prog_data->total_scratch,
MESA_SHADER_FRAGMENT);
ps.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;
ps.ScratchSpaceBasePointer = rw_bo(NULL, scratch_addr);
}
}
iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) {
@ -3226,7 +3244,8 @@ iris_store_fs_state(const struct gen_device_info *devinfo,
* This must match the data written by the iris_store_xs_state() functions.
*/
static void
iris_store_cs_state(const struct gen_device_info *devinfo,
iris_store_cs_state(struct iris_context *ice,
const struct gen_device_info *devinfo,
struct iris_compiled_shader *shader)
{
struct brw_stage_prog_data *prog_data = shader->prog_data;
@ -3271,28 +3290,31 @@ iris_derived_program_state_size(enum iris_program_cache_id cache_id)
* get most of the state packet without having to reconstruct it.
*/
static void
iris_store_derived_program_state(const struct gen_device_info *devinfo,
iris_store_derived_program_state(struct iris_context *ice,
enum iris_program_cache_id cache_id,
struct iris_compiled_shader *shader)
{
struct iris_screen *screen = (void *) ice->ctx.screen;
const struct gen_device_info *devinfo = &screen->devinfo;
switch (cache_id) {
case IRIS_CACHE_VS:
iris_store_vs_state(devinfo, shader);
iris_store_vs_state(ice, devinfo, shader);
break;
case IRIS_CACHE_TCS:
iris_store_tcs_state(devinfo, shader);
iris_store_tcs_state(ice, devinfo, shader);
break;
case IRIS_CACHE_TES:
iris_store_tes_state(devinfo, shader);
iris_store_tes_state(ice, devinfo, shader);
break;
case IRIS_CACHE_GS:
iris_store_gs_state(devinfo, shader);
iris_store_gs_state(ice, devinfo, shader);
break;
case IRIS_CACHE_FS:
iris_store_fs_state(devinfo, shader);
iris_store_fs_state(ice, devinfo, shader);
break;
case IRIS_CACHE_CS:
iris_store_cs_state(devinfo, shader);
iris_store_cs_state(ice, devinfo, shader);
case IRIS_CACHE_BLORP:
break;
default:
@ -4401,12 +4423,11 @@ iris_upload_compute_state(struct iris_context *ice,
iris_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
if (prog_data->total_scratch) {
/* Per Thread Scratch Space is in the range [0, 11] where
* 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
*/
// XXX: vfe.ScratchSpaceBasePointer
//vfe.PerThreadScratchSpace =
//ffs(stage_state->per_thread_scratch) - 11;
uint32_t scratch_addr =
iris_get_scratch_space(ice, prog_data->total_scratch,
MESA_SHADER_COMPUTE);
vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;
vfe.ScratchSpaceBasePointer = rw_bo(NULL, scratch_addr);
}
vfe.MaximumNumberofThreads =