diff --git a/src/gallium/drivers/panfrost/pan_assemble.c b/src/gallium/drivers/panfrost/pan_assemble.c index 66549add540..4d90850f520 100644 --- a/src/gallium/drivers/panfrost/pan_assemble.c +++ b/src/gallium/drivers/panfrost/pan_assemble.c @@ -40,35 +40,30 @@ #include "tgsi/tgsi_dump.h" static void -pan_prepare_midgard_props(struct panfrost_shader_state *state, - panfrost_program *program, - gl_shader_stage stage) +pan_prepare_midgard_props(struct panfrost_shader_state *state) { pan_prepare(&state->properties, RENDERER_PROPERTIES); - state->properties.uniform_buffer_count = state->ubo_count; - state->properties.midgard.uniform_count = program->uniform_cutoff; - state->properties.midgard.shader_has_side_effects = state->writes_global; + state->properties.uniform_buffer_count = state->info.ubo_count; + state->properties.midgard.uniform_count = state->info.midgard.uniform_cutoff; + state->properties.midgard.shader_has_side_effects = state->info.writes_global; state->properties.midgard.fp_mode = MALI_FP_MODE_GL_INF_NAN_ALLOWED; /* For fragment shaders, work register count, early-z, reads at draw-time */ - if (stage != MESA_SHADER_FRAGMENT) - state->properties.midgard.work_register_count = state->work_reg_count; + if (state->info.stage != MESA_SHADER_FRAGMENT) + state->properties.midgard.work_register_count = state->info.work_reg_count; } static void -pan_prepare_bifrost_props(struct panfrost_shader_state *state, - panfrost_program *program, - gl_shader_stage stage, - shader_info *info) +pan_prepare_bifrost_props(struct panfrost_shader_state *state) { - unsigned fau_count = DIV_ROUND_UP(program->push.count, 2); + unsigned fau_count = DIV_ROUND_UP(state->info.push.count, 2); - switch (stage) { + switch (state->info.stage) { case MESA_SHADER_VERTEX: pan_prepare(&state->properties, RENDERER_PROPERTIES); state->properties.bifrost.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY; - state->properties.uniform_buffer_count = state->ubo_count; + state->properties.uniform_buffer_count = state->info.ubo_count; pan_prepare(&state->preload, PRELOAD); state->preload.uniform_count = fau_count; @@ -78,39 +73,39 @@ pan_prepare_bifrost_props(struct panfrost_shader_state *state, case MESA_SHADER_FRAGMENT: pan_prepare(&state->properties, RENDERER_PROPERTIES); /* Early-Z set at draw-time */ - if (state->writes_depth || state->writes_stencil) { + if (state->info.fs.writes_depth || state->info.fs.writes_stencil) { state->properties.bifrost.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE; state->properties.bifrost.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_LATE; - } else if (state->can_discard) { + } else if (state->info.fs.can_discard) { state->properties.bifrost.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE; state->properties.bifrost.pixel_kill_operation = MALI_PIXEL_KILL_WEAK_EARLY; } else { state->properties.bifrost.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY; state->properties.bifrost.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY; } - state->properties.uniform_buffer_count = state->ubo_count; - state->properties.bifrost.shader_modifies_coverage = state->can_discard; - state->properties.bifrost.shader_wait_dependency_6 = program->wait_6; - state->properties.bifrost.shader_wait_dependency_7 = program->wait_7; + state->properties.uniform_buffer_count = state->info.ubo_count; + state->properties.bifrost.shader_modifies_coverage = state->info.fs.can_discard; + state->properties.bifrost.shader_wait_dependency_6 = state->info.bifrost.wait_6; + state->properties.bifrost.shader_wait_dependency_7 = state->info.bifrost.wait_7; pan_prepare(&state->preload, PRELOAD); state->preload.uniform_count = fau_count; - state->preload.fragment.fragment_position = state->reads_frag_coord; + state->preload.fragment.fragment_position = state->info.fs.reads_frag_coord; state->preload.fragment.coverage = true; - state->preload.fragment.primitive_flags = state->reads_face; + state->preload.fragment.primitive_flags = state->info.fs.reads_face; /* Contains sample ID and sample mask. Sample position and * helper invocation are expressed in terms of the above, so * preload for those too */ state->preload.fragment.sample_mask_id = - BITSET_TEST(info->system_values_read, SYSTEM_VALUE_SAMPLE_ID) || - BITSET_TEST(info->system_values_read, SYSTEM_VALUE_SAMPLE_POS) || - BITSET_TEST(info->system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN) || - BITSET_TEST(info->system_values_read, SYSTEM_VALUE_HELPER_INVOCATION); + state->info.fs.reads_sample_id | + state->info.fs.reads_sample_pos | + state->info.fs.reads_sample_mask_in | + state->info.fs.reads_helper_invocation; break; case MESA_SHADER_COMPUTE: pan_prepare(&state->properties, RENDERER_PROPERTIES); - state->properties.uniform_buffer_count = state->ubo_count; + state->properties.uniform_buffer_count = state->info.ubo_count; pan_prepare(&state->preload, PRELOAD); state->preload.uniform_count = fau_count; @@ -152,112 +147,12 @@ pan_upload_shader_descriptor(struct panfrost_context *ctx, u_upload_unmap(ctx->state_uploader); } -static unsigned -pan_format_from_nir_base(nir_alu_type base) -{ - switch (base) { - case nir_type_int: - return MALI_FORMAT_SINT; - case nir_type_uint: - case nir_type_bool: - return MALI_FORMAT_UINT; - case nir_type_float: - return MALI_CHANNEL_FLOAT; - default: - unreachable("Invalid base"); - } -} - -static unsigned -pan_format_from_nir_size(nir_alu_type base, unsigned size) -{ - if (base == nir_type_float) { - switch (size) { - case 16: return MALI_FORMAT_SINT; - case 32: return MALI_FORMAT_UNORM; - default: - unreachable("Invalid float size for format"); - } - } else { - switch (size) { - case 1: - case 8: return MALI_CHANNEL_8; - case 16: return MALI_CHANNEL_16; - case 32: return MALI_CHANNEL_32; - default: - unreachable("Invalid int size for format"); - } - } -} - -static enum mali_format -pan_format_from_glsl(const struct glsl_type *type, unsigned precision, unsigned frac) -{ - const struct glsl_type *column = glsl_without_array_or_matrix(type); - enum glsl_base_type glsl_base = glsl_get_base_type(column); - nir_alu_type t = nir_get_nir_type_for_glsl_base_type(glsl_base); - unsigned chan = glsl_get_components(column); - - /* If we have a fractional location added, we need to increase the size - * so it will fit, i.e. a vec3 in YZW requires us to allocate a vec4. - * We could do better but this is an edge case as it is, normally - * packed varyings will be aligned. */ - chan += frac; - - assert(chan >= 1 && chan <= 4); - - unsigned base = nir_alu_type_get_base_type(t); - unsigned size = nir_alu_type_get_type_size(t); - - /* Demote to fp16 where possible. int16 varyings are TODO as the hw - * will saturate instead of wrap which is not conformant, so we need to - * insert i2i16/u2u16 instructions before the st_vary_32i/32u to get - * the intended behaviour */ - - bool is_16 = (precision == GLSL_PRECISION_MEDIUM) - || (precision == GLSL_PRECISION_LOW); - - if (is_16 && base == nir_type_float) - size = 16; - else - size = 32; - - return pan_format_from_nir_base(base) | - pan_format_from_nir_size(base, size) | - MALI_NR_CHANNELS(chan); -} - -static enum mali_bifrost_register_file_format -bifrost_blend_type_from_nir(nir_alu_type nir_type) -{ - switch(nir_type) { - case 0: /* Render target not in use */ - return 0; - case nir_type_float16: - return MALI_BIFROST_REGISTER_FILE_FORMAT_F16; - case nir_type_float32: - return MALI_BIFROST_REGISTER_FILE_FORMAT_F32; - case nir_type_int32: - return MALI_BIFROST_REGISTER_FILE_FORMAT_I32; - case nir_type_uint32: - return MALI_BIFROST_REGISTER_FILE_FORMAT_U32; - case nir_type_int16: - return MALI_BIFROST_REGISTER_FILE_FORMAT_I16; - case nir_type_uint16: - return MALI_BIFROST_REGISTER_FILE_FORMAT_U16; - default: - unreachable("Unsupported blend shader type for NIR alu type"); - return 0; - } -} - void panfrost_shader_compile(struct panfrost_context *ctx, enum pipe_shader_ir ir_type, const void *ir, gl_shader_stage stage, - struct panfrost_shader_state *state, - uint64_t *outputs_written) + struct panfrost_shader_state *state) { struct panfrost_device *dev = pan_device(ctx->base.screen); @@ -280,169 +175,62 @@ panfrost_shader_compile(struct panfrost_context *ctx, memcpy(inputs.rt_formats, state->rt_formats, sizeof(inputs.rt_formats)); - panfrost_program *program; + struct util_dynarray binary; - program = pan_shader_compile(dev, NULL, s, &inputs); + util_dynarray_init(&binary, NULL); + pan_shader_compile(dev, s, &inputs, &binary, &state->info); /* Prepare the compiled binary for upload */ mali_ptr shader = 0; - unsigned attribute_count = 0, varying_count = 0; - int size = program->compiled.size; + int size = binary.size; if (size) { state->bo = panfrost_bo_create(dev, size, PAN_BO_EXECUTE); - memcpy(state->bo->ptr.cpu, program->compiled.data, size); + memcpy(state->bo->ptr.cpu, binary.data, size); shader = state->bo->ptr.gpu; } /* Midgard needs the first tag on the bottom nibble */ - if (!pan_is_bifrost(dev)) { - /* If size = 0, we tag as "end-of-shader" */ - - if (size) - shader |= program->first_tag; - else - shader = 0x1; - } - - state->sysval_count = program->sysval_count; - memcpy(state->sysval, program->sysvals, sizeof(state->sysval[0]) * state->sysval_count); - memcpy(&state->push, &program->push, sizeof(program->push)); - - bool vertex_id = BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_VERTEX_ID); - bool instance_id = BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID); - - state->writes_global = s->info.writes_memory; - - switch (stage) { - case MESA_SHADER_VERTEX: - attribute_count = util_bitcount64(s->info.inputs_read) + - util_bitcount(s->info.images_used); - varying_count = util_bitcount64(s->info.outputs_written); - - if (vertex_id) - attribute_count = MAX2(attribute_count, PAN_VERTEX_ID + 1); - - if (instance_id) - attribute_count = MAX2(attribute_count, PAN_INSTANCE_ID + 1); - - break; - case MESA_SHADER_FRAGMENT: - for (unsigned i = 0; i < ARRAY_SIZE(state->blend_ret_addrs); i++) { - if (!program->blend_ret_offsets[i]) - continue; - - state->blend_ret_addrs[i] = (state->bo->ptr.gpu & UINT32_MAX) + - program->blend_ret_offsets[i]; - assert(!(state->blend_ret_addrs[i] & 0x7)); - } - attribute_count = util_bitcount(s->info.images_used); - varying_count = util_bitcount64(s->info.inputs_read); - if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) - state->writes_depth = true; - if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) - state->writes_stencil = true; - - uint64_t outputs_read = s->info.outputs_read; - if (outputs_read & BITFIELD64_BIT(FRAG_RESULT_COLOR)) - outputs_read |= BITFIELD64_BIT(FRAG_RESULT_DATA0); - - state->outputs_read = outputs_read >> FRAG_RESULT_DATA0; - - /* EXT_shader_framebuffer_fetch requires per-sample */ - state->sample_shading = s->info.fs.uses_sample_shading || - outputs_read; - - /* List of reasons we need to execute frag shaders when things - * are masked off */ - - state->fs_sidefx = - s->info.writes_memory || - s->info.fs.uses_discard || - s->info.fs.uses_demote; - - state->can_discard = s->info.fs.uses_discard; - break; - case MESA_SHADER_COMPUTE: - attribute_count = util_bitcount(s->info.images_used); - state->shared_size = s->info.cs.shared_size; - break; - default: - unreachable("Unknown shader state"); - } - - state->stack_size = program->tls_size; - state->reads_frag_coord = (s->info.inputs_read & (1 << VARYING_SLOT_POS)) || - BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD); - state->reads_point_coord = s->info.inputs_read & (1 << VARYING_SLOT_PNTC); - state->reads_face = (s->info.inputs_read & (1 << VARYING_SLOT_FACE)) || - BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRONT_FACE); - state->writes_point_size = s->info.outputs_written & (1 << VARYING_SLOT_PSIZ); - - if (outputs_written) - *outputs_written = s->info.outputs_written; - - state->work_reg_count = program->work_register_count; - - if (pan_is_bifrost(dev)) - for (unsigned i = 0; i < ARRAY_SIZE(state->blend_types); i++) - state->blend_types[i] = bifrost_blend_type_from_nir(program->blend_types[i]); - - /* Record the varying mapping for the command stream's bookkeeping */ - - nir_variable_mode varying_mode = - stage == MESA_SHADER_VERTEX ? nir_var_shader_out : nir_var_shader_in; - - nir_foreach_variable_with_modes(var, s, varying_mode) { - unsigned loc = var->data.driver_location; - unsigned sz = glsl_count_attribute_slots(var->type, FALSE); - - for (int c = 0; c < sz; ++c) { - state->varyings_loc[loc + c] = var->data.location + c; - state->varyings[loc + c] = pan_format_from_glsl(var->type, - var->data.precision, var->data.location_frac); - } - } - - /* Needed for linkage */ - state->attribute_count = attribute_count; - state->varying_count = varying_count; - - /* Sysvals have dedicated UBO */ - state->ubo_count = s->info.num_ubos + (state->sysval_count ? 1 : 0); + if (!pan_is_bifrost(dev)) + shader |= state->info.midgard.first_tag; /* Prepare the descriptors at compile-time */ state->shader.shader = shader; - state->shader.attribute_count = attribute_count; - state->shader.varying_count = varying_count; - state->shader.texture_count = s->info.num_textures; - state->shader.sampler_count = s->info.num_textures; + state->shader.attribute_count = state->info.attribute_count; + state->shader.varying_count = state->info.varyings.input_count + + state->info.varyings.output_count; + state->shader.texture_count = state->info.texture_count; + state->shader.sampler_count = state->info.texture_count; if (pan_is_bifrost(dev)) - pan_prepare_bifrost_props(state, program, stage, &s->info); + pan_prepare_bifrost_props(state); else - pan_prepare_midgard_props(state, program, stage); + pan_prepare_midgard_props(state); state->properties.shader_contains_barrier = - s->info.uses_memory_barrier | - s->info.uses_control_barrier; + state->info.contains_barrier; /* Ordering gaurantees are the same */ if (stage == MESA_SHADER_FRAGMENT) { state->properties.shader_contains_barrier |= - s->info.fs.needs_quad_helper_invocations; + state->info.fs.helper_invocations; + state->properties.stencil_from_shader = + state->info.fs.writes_stencil; + state->properties.depth_source = + state->info.fs.writes_depth ? + MALI_DEPTH_SOURCE_SHADER : + MALI_DEPTH_SOURCE_FIXED_FUNCTION; + } else { + state->properties.depth_source = + MALI_DEPTH_SOURCE_FIXED_FUNCTION; } - state->properties.stencil_from_shader = state->writes_stencil; - state->properties.depth_source = state->writes_depth ? - MALI_DEPTH_SOURCE_SHADER : - MALI_DEPTH_SOURCE_FIXED_FUNCTION; if (stage != MESA_SHADER_FRAGMENT) pan_upload_shader_descriptor(ctx, state); - ralloc_free(program); + util_dynarray_fini(&binary); /* In both clone and tgsi_to_nir paths, the shader is ralloc'd against * a NULL context */ diff --git a/src/gallium/drivers/panfrost/pan_blend_shaders.c b/src/gallium/drivers/panfrost/pan_blend_shaders.c index 16c8b0937cd..21a4a2720da 100644 --- a/src/gallium/drivers/panfrost/pan_blend_shaders.c +++ b/src/gallium/drivers/panfrost/pan_blend_shaders.c @@ -295,21 +295,23 @@ panfrost_compile_blend_shader(struct panfrost_blend_shader *shader, if (constants) memcpy(inputs.blend.constants, constants, sizeof(inputs.blend.constants)); - panfrost_program *program; - if (pan_is_bifrost(dev)) { inputs.blend.bifrost_blend_desc = bifrost_get_blend_desc(dev, shader->key.format, shader->key.rt); } - program = pan_shader_compile(dev, NULL, shader->nir, &inputs); + struct pan_shader_info info; + struct util_dynarray binary; + + util_dynarray_init(&binary, NULL); + pan_shader_compile(dev, shader->nir, &inputs, &binary, &info); /* Allow us to patch later */ - shader->first_tag = program->first_tag; - shader->size = program->compiled.size; + shader->first_tag = pan_is_bifrost(dev) ? 0 : info.midgard.first_tag; + shader->size = binary.size; shader->buffer = reralloc_size(shader, shader->buffer, shader->size); - memcpy(shader->buffer, program->compiled.data, shader->size); - shader->work_count = program->work_register_count; + memcpy(shader->buffer, binary.data, shader->size); + shader->work_count = info.work_reg_count; - ralloc_free(program); + util_dynarray_fini(&binary); } diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index 62975cef5a7..bda57198155 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -241,7 +241,7 @@ panfrost_fs_required( unsigned rt_count) { /* If we generally have side effects */ - if (fs->fs_sidefx) + if (fs->info.fs.sidefx) return true; /* If colour is written we need to execute */ @@ -252,7 +252,31 @@ panfrost_fs_required( /* If depth is written and not implied we need to execute. * TODO: Predicate on Z/S writes being enabled */ - return (fs->writes_depth || fs->writes_stencil); + return (fs->info.fs.writes_depth || fs->info.fs.writes_stencil); +} + +static enum mali_bifrost_register_file_format +bifrost_blend_type_from_nir(nir_alu_type nir_type) +{ + switch(nir_type) { + case 0: /* Render target not in use */ + return 0; + case nir_type_float16: + return MALI_BIFROST_REGISTER_FILE_FORMAT_F16; + case nir_type_float32: + return MALI_BIFROST_REGISTER_FILE_FORMAT_F32; + case nir_type_int32: + return MALI_BIFROST_REGISTER_FILE_FORMAT_I32; + case nir_type_uint32: + return MALI_BIFROST_REGISTER_FILE_FORMAT_U32; + case nir_type_int16: + return MALI_BIFROST_REGISTER_FILE_FORMAT_I16; + case nir_type_uint16: + return MALI_BIFROST_REGISTER_FILE_FORMAT_U16; + default: + unreachable("Unsupported blend shader type for NIR alu type"); + return 0; + } } static void @@ -292,8 +316,12 @@ panfrost_emit_bifrost_blend(struct panfrost_batch *batch, assert((blend[i].shader.gpu & (0xffffffffull << 32)) == (fs->bo->ptr.gpu & (0xffffffffull << 32))); cfg.bifrost.internal.shader.pc = (u32)blend[i].shader.gpu; - assert(!(fs->blend_ret_addrs[i] & 0x7)); - cfg.bifrost.internal.shader.return_value = fs->blend_ret_addrs[i]; + unsigned ret_offset = fs->info.bifrost.blend[i].return_offset; + if (ret_offset) { + assert(!(ret_offset & 0x7)); + cfg.bifrost.internal.shader.return_value = + fs->bo->ptr.gpu + ret_offset; + } cfg.bifrost.internal.mode = MALI_BIFROST_BLEND_MODE_SHADER; } else { enum pipe_format format = batch->key.cbufs[i]->format; @@ -324,7 +352,7 @@ panfrost_emit_bifrost_blend(struct panfrost_batch *batch, cfg.bifrost.internal.fixed_function.conversion.memory_format = panfrost_format_to_bifrost_blend(dev, format_desc, true); cfg.bifrost.internal.fixed_function.conversion.register_format = - fs->blend_types[i]; + bifrost_blend_type_from_nir(fs->info.bifrost.blend[i].type); cfg.bifrost.internal.fixed_function.rt = i; } } @@ -412,7 +440,9 @@ panfrost_prepare_bifrost_fs_state(struct panfrost_context *ctx, state->properties = fs->properties; state->properties.bifrost.allow_forward_pixel_to_kill = - !fs->can_discard && !fs->writes_depth && no_blend; + !fs->info.fs.can_discard && + !fs->info.fs.writes_depth && + no_blend; state->shader = fs->shader; state->preload = fs->preload; } @@ -436,8 +466,8 @@ panfrost_prepare_midgard_fs_state(struct panfrost_context *ctx, state->properties.midgard.force_early_z = true; } else { /* Reasons to disable early-Z from a shader perspective */ - bool late_z = fs->can_discard || fs->writes_global || - fs->writes_depth || fs->writes_stencil; + bool late_z = fs->info.fs.can_discard || fs->info.writes_global || + fs->info.fs.writes_depth || fs->info.fs.writes_stencil; /* If either depth or stencil is enabled, discard matters */ bool zs_enabled = @@ -452,9 +482,9 @@ panfrost_prepare_midgard_fs_state(struct panfrost_context *ctx, /* TODO: Reduce this limit? */ state->properties = fs->properties; if (has_blend_shader) - state->properties.midgard.work_register_count = MAX2(fs->work_reg_count, 8); + state->properties.midgard.work_register_count = MAX2(fs->info.work_reg_count, 8); else - state->properties.midgard.work_register_count = fs->work_reg_count; + state->properties.midgard.work_register_count = fs->info.work_reg_count; state->properties.midgard.force_early_z = !(late_z || alpha_to_coverage); @@ -463,8 +493,10 @@ panfrost_prepare_midgard_fs_state(struct panfrost_context *ctx, * lying to the hardware about the discard and setting the * reads tilebuffer? flag to compensate */ state->properties.midgard.shader_reads_tilebuffer = - fs->outputs_read || (!zs_enabled && fs->can_discard); - state->properties.midgard.shader_contains_discard = zs_enabled && fs->can_discard; + fs->info.fs.outputs_read || + (!zs_enabled && fs->info.fs.can_discard); + state->properties.midgard.shader_contains_discard = + zs_enabled && fs->info.fs.can_discard; state->shader = fs->shader; } @@ -528,7 +560,7 @@ panfrost_prepare_fs_state(struct panfrost_context *ctx, state->multisample_misc.sample_mask = (msaa ? ctx->sample_mask : ~0) & 0xFFFF; state->multisample_misc.evaluate_per_sample = - msaa && (ctx->min_samples > 1 || fs->sample_shading); + msaa && (ctx->min_samples > 1 || fs->info.fs.sample_shading); state->multisample_misc.depth_function = zsa->base.depth_enabled ? panfrost_translate_compare_func(zsa->base.depth_func) : @@ -930,8 +962,8 @@ panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf, { struct sysval_uniform *uniforms = (void *)buf; - for (unsigned i = 0; i < ss->sysval_count; ++i) { - int sysval = ss->sysval[i]; + for (unsigned i = 0; i < ss->info.sysvals.sysval_count; ++i) { + int sysval = ss->info.sysvals.sysvals[i]; switch (PAN_SYSVAL_TYPE(sysval)) { case PAN_SYSVAL_VIEWPORT_SCALE: @@ -1023,7 +1055,7 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, struct panfrost_shader_state *ss = &all->variants[all->active_variant]; /* Allocate room for the sysval and the uniforms */ - size_t sys_size = sizeof(float) * 4 * ss->sysval_count; + size_t sys_size = sizeof(float) * 4 * ss->info.sysvals.sysval_count; struct panfrost_ptr transfer = panfrost_pool_alloc_aligned(&batch->pool, sys_size, 16); @@ -1032,7 +1064,7 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */ struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, stage); - unsigned ubo_count = shader->ubo_count - (sys_size ? 1 : 0); + unsigned ubo_count = shader->info.ubo_count - (sys_size ? 1 : 0); unsigned sysval_ubo = sys_size ? ubo_count : ~0; size_t sz = MALI_UNIFORM_BUFFER_LENGTH * (ubo_count + 1); @@ -1076,13 +1108,14 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, /* Copy push constants required by the shader */ struct panfrost_ptr push_transfer = - panfrost_pool_alloc_aligned(&batch->pool, ss->push.count * 4, 16); + panfrost_pool_alloc_aligned(&batch->pool, + ss->info.push.count * 4, 16); uint32_t *push_cpu = (uint32_t *) push_transfer.cpu; *push_constants = push_transfer.gpu; - for (unsigned i = 0; i < ss->push.count; ++i) { - struct panfrost_ubo_word src = ss->push.words[i]; + for (unsigned i = 0; i < ss->info.push.count; ++i) { + struct panfrost_ubo_word src = ss->info.push.words[i]; /* Map the UBO, this should be cheap. However this is reading * from write-combine memory which is _very_ slow. It might pay @@ -1108,7 +1141,7 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch, struct panfrost_device *dev = pan_device(ctx->base.screen); struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE]; struct panfrost_shader_state *ss = &all->variants[all->active_variant]; - unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size, + unsigned single_size = util_next_power_of_two(MAX2(ss->info.wls_size, 128)); unsigned instances = @@ -1130,12 +1163,12 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch, ls.wls_instances = instances; ls.wls_size_scale = util_logbase2(single_size) + 1; - if (ss->stack_size) { + if (ss->info.tls_size) { unsigned shift = - panfrost_get_stack_shift(ss->stack_size); + panfrost_get_stack_shift(ss->info.tls_size); struct panfrost_bo *bo = panfrost_batch_get_scratchpad(batch, - ss->stack_size, + ss->info.tls_size, dev->thread_tls_alloc, dev->core_count); @@ -1366,7 +1399,7 @@ panfrost_emit_image_attribs(struct panfrost_batch *batch, struct panfrost_context *ctx = batch->ctx; struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, type); - if (!shader->attribute_count) { + if (!shader->info.attribute_count) { *buffers = 0; return 0; } @@ -1375,11 +1408,11 @@ panfrost_emit_image_attribs(struct panfrost_batch *batch, unsigned attrib_buf_size = MALI_ATTRIBUTE_BUFFER_LENGTH + MALI_ATTRIBUTE_BUFFER_CONTINUATION_3D_LENGTH; unsigned bytes_per_image_desc = MALI_ATTRIBUTE_LENGTH + attrib_buf_size; - unsigned attribs_offset = attrib_buf_size * shader->attribute_count; + unsigned attribs_offset = attrib_buf_size * shader->info.attribute_count; struct panfrost_ptr ptr = panfrost_pool_alloc_aligned(&batch->pool, - bytes_per_image_desc * shader->attribute_count, + bytes_per_image_desc * shader->info.attribute_count, util_next_power_of_two(bytes_per_image_desc)); emit_image_attribs(batch, type, ptr.cpu + attribs_offset, ptr.cpu, 0); @@ -1404,7 +1437,7 @@ panfrost_emit_vertex_data(struct panfrost_batch *batch, * Also, we allocate more memory than what's needed here if either instancing * is enabled or images are present, this can be improved. */ unsigned bufs_per_attrib = (ctx->instance_count > 1 || nr_images > 0) ? 2 : 1; - unsigned nr_bufs = (vs->attribute_count * bufs_per_attrib) + + unsigned nr_bufs = (vs->info.attribute_count * bufs_per_attrib) + (pan_is_bifrost(dev) ? 1 : 0); if (!nr_bufs) { @@ -1417,7 +1450,7 @@ panfrost_emit_vertex_data(struct panfrost_batch *batch, MALI_ATTRIBUTE_BUFFER_LENGTH * 2); struct panfrost_ptr T = panfrost_pool_alloc_aligned(&batch->pool, - MALI_ATTRIBUTE_LENGTH * vs->attribute_count, + MALI_ATTRIBUTE_LENGTH * vs->info.attribute_count, MALI_ATTRIBUTE_LENGTH); struct mali_attribute_buffer_packed *bufs = @@ -1525,7 +1558,7 @@ panfrost_emit_vertex_data(struct panfrost_batch *batch, /* Add special gl_VertexID/gl_InstanceID buffers */ - if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) { + if (unlikely(vs->info.attribute_count >= PAN_VERTEX_ID)) { panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1); pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) { @@ -1742,22 +1775,22 @@ pan_varying_present(const struct panfrost_device *dev, /* Enable special buffers by the shader info */ - if (vs->writes_point_size) + if (vs->info.vs.writes_point_size) present |= (1 << PAN_VARY_PSIZ); - if (fs->reads_point_coord) + if (fs->info.fs.reads_point_coord) present |= (1 << PAN_VARY_PNTCOORD); - if (fs->reads_face) + if (fs->info.fs.reads_face) present |= (1 << PAN_VARY_FACE); - if (fs->reads_frag_coord && !pan_is_bifrost(dev)) + if (fs->info.fs.reads_frag_coord && !pan_is_bifrost(dev)) present |= (1 << PAN_VARY_FRAGCOORD); /* Also, if we have a point sprite, we need a point coord buffer */ - for (unsigned i = 0; i < fs->varying_count; i++) { - gl_varying_slot loc = fs->varyings_loc[i]; + for (unsigned i = 0; i < fs->info.varyings.input_count; i++) { + gl_varying_slot loc = fs->info.varyings.input[i].location; if (util_varying_is_point_coord(loc, point_coord_mask)) present |= (1 << PAN_VARY_PNTCOORD); @@ -1886,10 +1919,18 @@ pan_emit_general_varying(const struct panfrost_device *dev, bool should_alloc) { /* Check if we're linked */ + unsigned other_varying_count = + other->info.stage == MESA_SHADER_FRAGMENT ? + other->info.varyings.input_count : + other->info.varyings.output_count; + const struct pan_shader_varying *other_varyings = + other->info.stage == MESA_SHADER_FRAGMENT ? + other->info.varyings.input : + other->info.varyings.output; signed other_idx = -1; - for (unsigned j = 0; j < other->varying_count; ++j) { - if (other->varyings_loc[j] == loc) { + for (unsigned j = 0; j < other_varying_count; ++j) { + if (other_varyings[j].location == loc) { other_idx = j; break; } @@ -1904,7 +1945,8 @@ pan_emit_general_varying(const struct panfrost_device *dev, if (should_alloc) { /* We're linked, so allocate a space via a watermark allocation */ - enum mali_format alt = other->varyings[other_idx]; + enum mali_format alt = + dev->formats[other_varyings[other_idx].format].hw >> 12; /* Do interpolation at minimum precision */ unsigned size_main = pan_varying_size(format); @@ -1953,8 +1995,14 @@ panfrost_emit_varying(const struct panfrost_device *dev, bool should_alloc, bool is_fragment) { - gl_varying_slot loc = stage->varyings_loc[idx]; - enum mali_format format = stage->varyings[idx]; + gl_varying_slot loc = + stage->info.stage == MESA_SHADER_FRAGMENT ? + stage->info.varyings.input[idx].location : + stage->info.varyings.output[idx].location; + enum mali_format format = + stage->info.stage == MESA_SHADER_FRAGMENT ? + dev->formats[stage->info.varyings.input[idx].format].hw >> 12 : + dev->formats[stage->info.varyings.output[idx].format].hw >> 12; /* Override format to match linkage */ if (!should_alloc && gen_formats[idx]) @@ -2018,8 +2066,8 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch, vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX); fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT); - vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count; - fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count; + vs_size = MALI_ATTRIBUTE_LENGTH * vs->info.varyings.output_count; + fs_size = MALI_ATTRIBUTE_LENGTH * fs->info.varyings.input_count; struct panfrost_ptr trans = panfrost_pool_alloc_aligned( &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH); @@ -2044,8 +2092,8 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch, memset(gen_formats, 0, sizeof(gen_formats)); unsigned gen_stride = 0; - assert(vs->varying_count < ARRAY_SIZE(gen_offsets)); - assert(fs->varying_count < ARRAY_SIZE(gen_offsets)); + assert(vs->info.varyings.output_count < ARRAY_SIZE(gen_offsets)); + assert(fs->info.varyings.input_count < ARRAY_SIZE(gen_offsets)); unsigned streamout_offsets[32]; @@ -2056,16 +2104,16 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch, } struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu; - struct mali_attribute_packed *ofs = ovs + vs->varying_count; + struct mali_attribute_packed *ofs = ovs + vs->info.varyings.output_count; - for (unsigned i = 0; i < vs->varying_count; i++) { + for (unsigned i = 0; i < vs->info.varyings.output_count; i++) { panfrost_emit_varying(dev, ovs + i, vs, fs, vs, present, 0, ctx->streamout.num_targets, streamout_offsets, gen_offsets, gen_formats, &gen_stride, i, true, false); } - for (unsigned i = 0; i < fs->varying_count; i++) { + for (unsigned i = 0; i < fs->info.varyings.input_count; i++) { panfrost_emit_varying(dev, ofs + i, fs, vs, vs, present, point_coord_mask, ctx->streamout.num_targets, streamout_offsets, gen_offsets, gen_formats, &gen_stride, i, @@ -2114,8 +2162,8 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch, pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD); *buffers = T.gpu; - *vs_attribs = vs->varying_count ? trans.gpu : 0; - *fs_attribs = fs->varying_count ? trans.gpu + vs_size : 0; + *vs_attribs = vs->info.varyings.output_count ? trans.gpu : 0; + *fs_attribs = fs->info.varyings.input_count ? trans.gpu + vs_size : 0; } void diff --git a/src/gallium/drivers/panfrost/pan_compute.c b/src/gallium/drivers/panfrost/pan_compute.c index af9bafb81e5..049e4ff21d9 100644 --- a/src/gallium/drivers/panfrost/pan_compute.c +++ b/src/gallium/drivers/panfrost/pan_compute.c @@ -71,7 +71,7 @@ panfrost_create_compute_state( } panfrost_shader_compile(ctx, so->cbase.ir_type, so->cbase.prog, - MESA_SHADER_COMPUTE, v, NULL); + MESA_SHADER_COMPUTE, v); return so; } diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c index f7ce64896aa..d324dd79377 100644 --- a/src/gallium/drivers/panfrost/pan_context.c +++ b/src/gallium/drivers/panfrost/pan_context.c @@ -150,7 +150,7 @@ panfrost_writes_point_size(struct panfrost_context *ctx) assert(ctx->shader[PIPE_SHADER_VERTEX]); struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX); - return vs->writes_point_size && ctx->active_prim == PIPE_PRIM_POINTS; + return vs->info.vs.writes_point_size && ctx->active_prim == PIPE_PRIM_POINTS; } /* The entire frame is in memory -- send it off to the kernel! */ @@ -739,12 +739,11 @@ panfrost_create_shader_state( struct panfrost_context *ctx = pan_context(pctx); struct panfrost_shader_state state = { 0 }; - uint64_t outputs_written; panfrost_shader_compile(ctx, PIPE_SHADER_IR_NIR, so->base.ir.nir, tgsi_processor_to_shader_stage(stage), - &state, &outputs_written); + &state); } return so; @@ -821,11 +820,12 @@ panfrost_variant_matches( { struct panfrost_device *dev = pan_device(ctx->base.screen); - if (variant->outputs_read) { + if (variant->info.stage == MESA_SHADER_FRAGMENT && + variant->info.fs.outputs_read) { struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer; unsigned i; - BITSET_FOREACH_SET(i, &variant->outputs_read, 8) { + BITSET_FOREACH_SET(i, &variant->info.fs.outputs_read, 8) { enum pipe_format fmt = PIPE_FORMAT_R8G8B8A8_UNORM; if ((fb->nr_cbufs > i) && fb->cbufs[i]) @@ -963,15 +963,12 @@ panfrost_bind_shader_state( /* We finally have a variant, so compile it */ if (!shader_state->compiled) { - uint64_t outputs_written = 0; - panfrost_shader_compile(ctx, variants->base.type, variants->base.type == PIPE_SHADER_IR_NIR ? variants->base.ir.nir : variants->base.tokens, tgsi_processor_to_shader_stage(type), - shader_state, - &outputs_written); + shader_state); shader_state->compiled = true; @@ -980,7 +977,8 @@ panfrost_bind_shader_state( shader_state->stream_output = variants->base.stream_output; shader_state->so_mask = - update_so_info(&shader_state->stream_output, outputs_written); + update_so_info(&shader_state->stream_output, + shader_state->info.outputs_written); } } @@ -1251,7 +1249,8 @@ panfrost_set_framebuffer_state(struct pipe_context *pctx, * keyed to the framebuffer format (due to EXT_framebuffer_fetch) */ struct panfrost_shader_variants *fs = ctx->shader[PIPE_SHADER_FRAGMENT]; - if (fs && fs->variant_count && fs->variants[fs->active_variant].outputs_read) + if (fs && fs->variant_count && + fs->variants[fs->active_variant].info.fs.outputs_read) ctx->base.bind_fs_state(&ctx->base, fs); } diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h index 5dea026d4b6..153060b13f3 100644 --- a/src/gallium/drivers/panfrost/pan_context.h +++ b/src/gallium/drivers/panfrost/pan_context.h @@ -214,46 +214,15 @@ struct panfrost_shader_state { struct MALI_RENDERER_PROPERTIES properties; struct MALI_PRELOAD preload; - /* Non-descript information */ - unsigned work_reg_count; - bool sample_shading; - bool can_discard; - bool writes_point_size; - bool writes_depth; - bool writes_stencil; - bool reads_point_coord; - bool reads_face; - bool reads_frag_coord; - bool writes_global; - unsigned stack_size; - unsigned shared_size; + struct pan_shader_info info; - /* Does the fragment shader have side effects? In particular, if output - * is masked out, is it legal to skip shader execution? */ - bool fs_sidefx; - - /* For Bifrost - output type for each RT */ - enum mali_bifrost_register_file_format blend_types[MALI_BIFROST_BLEND_MAX_RT]; - - unsigned attribute_count, varying_count, ubo_count; - enum mali_format varyings[PIPE_MAX_ATTRIBS]; - gl_varying_slot varyings_loc[PIPE_MAX_ATTRIBS]; struct pipe_stream_output_info stream_output; uint64_t so_mask; - unsigned sysval_count; - unsigned sysval[MAX_SYSVAL_COUNT]; - - struct panfrost_ubo_push push; - /* GPU-executable memory */ struct panfrost_bo *bo; - BITSET_WORD outputs_read; enum pipe_format rt_formats[8]; - - /* Blend return addresses */ - uint32_t blend_ret_addrs[8]; }; /* A collection of varyings (the CSO) */ @@ -374,8 +343,7 @@ panfrost_shader_compile(struct panfrost_context *ctx, enum pipe_shader_ir ir_type, const void *ir, gl_shader_stage stage, - struct panfrost_shader_state *state, - uint64_t *outputs_written); + struct panfrost_shader_state *state); void panfrost_create_sampler_view_bo(struct panfrost_sampler_view *so, diff --git a/src/gallium/drivers/panfrost/pan_job.c b/src/gallium/drivers/panfrost/pan_job.c index 3eac60d7296..25f664a6e4d 100644 --- a/src/gallium/drivers/panfrost/pan_job.c +++ b/src/gallium/drivers/panfrost/pan_job.c @@ -1238,7 +1238,7 @@ panfrost_batch_adjust_stack_size(struct panfrost_batch *batch) if (!ss) continue; - batch->stack_size = MAX2(batch->stack_size, ss->stack_size); + batch->stack_size = MAX2(batch->stack_size, ss->info.tls_size); } } diff --git a/src/panfrost/bifrost/bi_opt_push_ubo.c b/src/panfrost/bifrost/bi_opt_push_ubo.c index bdd6ddf59a2..9442315b213 100644 --- a/src/panfrost/bifrost/bi_opt_push_ubo.c +++ b/src/panfrost/bifrost/bi_opt_push_ubo.c @@ -119,10 +119,10 @@ void bi_opt_push_ubo(bi_context *ctx) { /* This pass only runs once */ - assert(ctx->push->count == 0); + assert(ctx->info->push.count == 0); struct bi_ubo_analysis analysis = bi_analyze_ranges(ctx); - bi_pick_ubo(ctx->push, &analysis); + bi_pick_ubo(&ctx->info->push, &analysis); bi_foreach_instr_global_safe(ctx, ins) { if (!bi_is_direct_aligned_ubo(ins)) continue; @@ -141,8 +141,9 @@ bi_opt_push_ubo(bi_context *ctx) for (unsigned w = 0; w < channels; ++w) { /* FAU is grouped in pairs (2 x 4-byte) */ - unsigned base = pan_lookup_pushed_ubo(ctx->push, ubo, - (offset + 4 * w)); + unsigned base = + pan_lookup_pushed_ubo(&ctx->info->push, ubo, + (offset + 4 * w)); unsigned fau_idx = (base >> 1); unsigned fau_hi = (base & 1); diff --git a/src/panfrost/bifrost/bi_pack.c b/src/panfrost/bifrost/bi_pack.c index aaed48094b7..b43e5a8aae5 100644 --- a/src/panfrost/bifrost/bi_pack.c +++ b/src/panfrost/bifrost/bi_pack.c @@ -713,11 +713,11 @@ bi_collect_blend_ret_addr(bi_context *ctx, struct util_dynarray *emission, unsigned loc = tuple->regs.fau_idx - BIR_FAU_BLEND_0; - assert(loc < ARRAY_SIZE(ctx->blend_ret_offsets)); - assert(!ctx->blend_ret_offsets[loc]); - ctx->blend_ret_offsets[loc] = + assert(loc < ARRAY_SIZE(ctx->info->bifrost.blend)); + assert(!ctx->info->bifrost.blend[loc].return_offset); + ctx->info->bifrost.blend[loc].return_offset = util_dynarray_num_elements(emission, uint8_t); - assert(!(ctx->blend_ret_offsets[loc] & 0x7)); + assert(!(ctx->info->bifrost.blend[loc].return_offset & 0x7)); } unsigned diff --git a/src/panfrost/bifrost/bi_ra.c b/src/panfrost/bifrost/bi_ra.c index f927b3eeb2c..fb3f38f99f6 100644 --- a/src/panfrost/bifrost/bi_ra.c +++ b/src/panfrost/bifrost/bi_ra.c @@ -388,7 +388,7 @@ bi_register_allocate(bi_context *ctx) unsigned iter_count = 1000; /* max iterations */ /* Number of bytes of memory we've spilled into */ - unsigned spill_count = ctx->tls_size; + unsigned spill_count = ctx->info->tls_size; do { if (l) { @@ -410,7 +410,7 @@ bi_register_allocate(bi_context *ctx) assert(success); - ctx->tls_size = spill_count; + ctx->info->tls_size = spill_count; bi_install_registers(ctx, l); lcra_free(l); diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c index 6cbc593c27d..68a4a928f3e 100644 --- a/src/panfrost/bifrost/bifrost_compile.c +++ b/src/panfrost/bifrost/bifrost_compile.c @@ -297,7 +297,8 @@ bi_load_sysval_to(bi_builder *b, bi_index dest, int sysval, unsigned nr_components, unsigned offset) { unsigned uniform = - pan_lookup_sysval(b->shader->sysval_to_id, &b->shader->sysvals, + pan_lookup_sysval(b->shader->sysval_to_id, + &b->shader->info->sysvals, sysval); unsigned idx = (uniform * 16) + offset; @@ -368,8 +369,7 @@ bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, unsigned rt) } assert(rt < 8); - assert(b->shader->blend_types); - b->shader->blend_types[rt] = T; + b->shader->info->bifrost.blend[rt].type = T; } /* Blend shaders do not need to run ATEST since they are dependent on a @@ -2511,23 +2511,23 @@ bi_lower_branch(bi_block *block) } } -panfrost_program * -bifrost_compile_shader_nir(void *mem_ctx, nir_shader *nir, - const struct panfrost_compile_inputs *inputs) +void +bifrost_compile_shader_nir(nir_shader *nir, + const struct panfrost_compile_inputs *inputs, + struct util_dynarray *binary, + struct pan_shader_info *info) { - panfrost_program *program = rzalloc(mem_ctx, panfrost_program); - bifrost_debug = debug_get_option_bifrost_debug(); bi_context *ctx = rzalloc(NULL, bi_context); - ctx->sysval_to_id = panfrost_init_sysvals(&ctx->sysvals, ctx); + ctx->sysval_to_id = panfrost_init_sysvals(&info->sysvals, ctx); ctx->inputs = inputs; ctx->nir = nir; + ctx->info = info; ctx->stage = nir->info.stage; ctx->quirks = bifrost_get_quirks(inputs->gpu_id); ctx->arch = inputs->gpu_id >> 12; - ctx->push = &program->push; list_inithead(&ctx->blocks); /* Lower gl_Position pre-optimisation, but after lowering vars to ssa @@ -2565,8 +2565,7 @@ bifrost_compile_shader_nir(void *mem_ctx, nir_shader *nir, nir_print_shader(nir, stdout); } - ctx->blend_types = program->blend_types; - ctx->tls_size = nir->scratch_size; + info->tls_size = nir->scratch_size; nir_foreach_function(func, nir) { if (!func->impl) @@ -2614,8 +2613,7 @@ bifrost_compile_shader_nir(void *mem_ctx, nir_shader *nir, if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) bi_print_shader(ctx, stdout); - util_dynarray_init(&program->compiled, NULL); - unsigned final_clause = bi_pack(ctx, &program->compiled); + unsigned final_clause = bi_pack(ctx, binary); /* If we need to wait for ATEST or BLEND in the first clause, pass the * corresponding bits through to the renderer state descriptor */ @@ -2623,17 +2621,12 @@ bifrost_compile_shader_nir(void *mem_ctx, nir_shader *nir, bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL); unsigned first_deps = first_clause ? first_clause->dependencies : 0; - program->wait_6 = (first_deps & (1 << 6)); - program->wait_7 = (first_deps & (1 << 7)); - - memcpy(program->blend_ret_offsets, ctx->blend_ret_offsets, sizeof(program->blend_ret_offsets)); - program->sysval_count = ctx->sysvals.sysval_count; - memcpy(program->sysvals, ctx->sysvals.sysvals, sizeof(ctx->sysvals.sysvals[0]) * ctx->sysvals.sysval_count); + info->bifrost.wait_6 = (first_deps & (1 << 6)); + info->bifrost.wait_7 = (first_deps & (1 << 7)); if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) { - disassemble_bifrost(stdout, program->compiled.data, - program->compiled.size, - bifrost_debug & BIFROST_DBG_VERBOSE); + disassemble_bifrost(stdout, binary->data, binary->size, + bifrost_debug & BIFROST_DBG_VERBOSE); } /* Pad the shader with enough zero bytes to trick the prefetcher, @@ -2641,19 +2634,15 @@ bifrost_compile_shader_nir(void *mem_ctx, nir_shader *nir, * so the size remains 0) */ unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause; - if (program->compiled.size) { - memset(util_dynarray_grow(&program->compiled, uint8_t, prefetch_size), + if (binary->size) { + memset(util_dynarray_grow(binary, uint8_t, prefetch_size), 0, prefetch_size); } - program->tls_size = ctx->tls_size; - if ((bifrost_debug & BIFROST_DBG_SHADERDB || inputs->shaderdb) && !skip_internal) { - bi_print_stats(ctx, program->compiled.size, stderr); + bi_print_stats(ctx, binary->size, stderr); } ralloc_free(ctx); - - return program; } diff --git a/src/panfrost/bifrost/bifrost_compile.h b/src/panfrost/bifrost/bifrost_compile.h index b64721d3f41..4fe468c1e12 100644 --- a/src/panfrost/bifrost/bifrost_compile.h +++ b/src/panfrost/bifrost/bifrost_compile.h @@ -28,9 +28,11 @@ #include "util/u_dynarray.h" #include "panfrost/util/pan_ir.h" -panfrost_program * -bifrost_compile_shader_nir(void *mem_ctx, nir_shader *nir, - const struct panfrost_compile_inputs *inputs); +void +bifrost_compile_shader_nir(nir_shader *nir, + const struct panfrost_compile_inputs *inputs, + struct util_dynarray *binary, + struct pan_shader_info *info); static const nir_shader_compiler_options bifrost_nir_options = { .lower_scmp = true, diff --git a/src/panfrost/bifrost/cmdline.c b/src/panfrost/bifrost/cmdline.c index d9515e17465..0792e9c2980 100644 --- a/src/panfrost/bifrost/cmdline.c +++ b/src/panfrost/bifrost/cmdline.c @@ -32,7 +32,7 @@ #include "util/u_dynarray.h" #include "bifrost_compile.h" -static panfrost_program * +static void compile_shader(char **argv, bool vertex_only) { struct gl_shader_program *prog; @@ -53,7 +53,10 @@ compile_shader(char **argv, bool vertex_only) prog = standalone_compile_shader(&options, 2, argv, &local_ctx); prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program->info.stage = MESA_SHADER_FRAGMENT; - panfrost_program *compiled; + struct util_dynarray binary; + + util_dynarray_init(&binary, NULL); + for (unsigned i = 0; i < 2; ++i) { nir[i] = glsl_to_nir(&local_ctx, prog, shader_types[i], &bifrost_nir_options); NIR_PASS_V(nir[i], nir_lower_global_vars_to_local); @@ -70,14 +73,16 @@ compile_shader(char **argv, bool vertex_only) struct panfrost_compile_inputs inputs = { .gpu_id = 0x7212, /* Mali G52 */ }; + struct pan_shader_info info; - compiled = bifrost_compile_shader_nir(NULL, nir[i], &inputs); + util_dynarray_clear(&binary); + bifrost_compile_shader_nir(nir[i], &inputs, &binary, &info); if (vertex_only) - return compiled; + break; } - return compiled; + util_dynarray_fini(&binary); } #define BI_FOURCC(ch0, ch1, ch2, ch3) ( \ diff --git a/src/panfrost/bifrost/compiler.h b/src/panfrost/bifrost/compiler.h index 37de7ccdb27..0c35920cd7e 100644 --- a/src/panfrost/bifrost/compiler.h +++ b/src/panfrost/bifrost/compiler.h @@ -496,17 +496,12 @@ typedef struct bi_block { typedef struct { const struct panfrost_compile_inputs *inputs; nir_shader *nir; + struct pan_shader_info *info; gl_shader_stage stage; struct list_head blocks; /* list of bi_block */ - struct panfrost_sysvals sysvals; struct hash_table_u64 *sysval_to_id; - struct panfrost_ubo_push *push; uint32_t quirks; unsigned arch; - unsigned tls_size; - - /* Blend return offsets */ - uint32_t blend_ret_offsets[8]; /* During NIR->BIR */ bi_block *current_block; @@ -514,7 +509,6 @@ typedef struct { bi_block *break_block; bi_block *continue_block; bool emitted_atest; - nir_alu_type *blend_types; /* For creating temporaries */ unsigned ssa_alloc; diff --git a/src/panfrost/lib/pan_blit.c b/src/panfrost/lib/pan_blit.c index ac2ea8f1455..99fed039e77 100644 --- a/src/panfrost/lib/pan_blit.c +++ b/src/panfrost/lib/pan_blit.c @@ -43,11 +43,13 @@ * This is primarily designed as a fallback for preloads but could be extended * for other clears/blits if needed in the future. */ -static panfrost_program * +static void panfrost_build_blit_shader(struct panfrost_device *dev, gl_frag_result loc, nir_alu_type T, - bool ms) + bool ms, + struct util_dynarray *binary, + struct pan_shader_info *info) { bool is_colour = loc >= FRAG_RESULT_DATA0; @@ -110,11 +112,9 @@ panfrost_build_blit_shader(struct panfrost_device *dev, .is_blit = true, }; - panfrost_program *program = - pan_shader_compile(dev, NULL, shader, &inputs); + pan_shader_compile(dev, shader, &inputs, binary, info); ralloc_free(shader); - return program; } /* Compile and upload all possible blit shaders ahead-of-time to reduce draw @@ -162,6 +162,9 @@ panfrost_init_blit_shaders(struct panfrost_device *dev) /* Don't bother generating multisampling variants if we don't actually * support multisampling */ bool has_ms = !(dev->quirks & MIDGARD_SFBD); + struct util_dynarray binary; + + util_dynarray_init(&binary, NULL); for (unsigned ms = 0; ms <= has_ms; ++ms) { for (unsigned i = 0; i < ARRAY_SIZE(shader_descs); ++i) { @@ -172,27 +175,38 @@ panfrost_init_blit_shaders(struct panfrost_device *dev) continue; struct pan_blit_shader *shader = &dev->blit_shaders.loads[loc][T][ms]; - panfrost_program *program = - panfrost_build_blit_shader(dev, loc, - nir_types[T], ms); + struct pan_shader_info info; - assert(offset + program->compiled.size < total_size); + util_dynarray_clear(&binary); + panfrost_build_blit_shader(dev, loc, + nir_types[T], ms, + &binary, &info); + + assert(offset + binary.size < total_size); memcpy(dev->blit_shaders.bo->ptr.cpu + offset, - program->compiled.data, program->compiled.size); + binary.data, binary.size); - shader->shader = (dev->blit_shaders.bo->ptr.gpu + offset) | - program->first_tag; + shader->shader = (dev->blit_shaders.bo->ptr.gpu + offset); + if (pan_is_bifrost(dev)) { + int rt = loc - FRAG_RESULT_DATA0; + if (rt >= 0 && rt < 8 && + info.bifrost.blend[rt].return_offset) { + shader->blend_ret_addr = + shader->shader + + info.bifrost.blend[rt].return_offset; + } + } else { + shader->shader |= info.midgard.first_tag; + } - int rt = loc - FRAG_RESULT_DATA0; - if (rt >= 0 && rt < 8 && program->blend_ret_offsets[rt]) - shader->blend_ret_addr = program->blend_ret_offsets[rt] + shader->shader; - offset += ALIGN_POT(program->compiled.size, + offset += ALIGN_POT(binary.size, pan_is_bifrost(dev) ? 128 : 64); - ralloc_free(program); } } } + + util_dynarray_fini(&binary); } static void diff --git a/src/panfrost/lib/pan_shader.c b/src/panfrost/lib/pan_shader.c index e3b13189db2..20e9d43817c 100644 --- a/src/panfrost/lib/pan_shader.c +++ b/src/panfrost/lib/pan_shader.c @@ -37,13 +37,196 @@ pan_shader_get_compiler_options(const struct panfrost_device *dev) return &midgard_nir_options; } -panfrost_program * -pan_shader_compile(const struct panfrost_device *dev, - void *mem_ctx, nir_shader *nir, - const struct panfrost_compile_inputs *inputs) +static enum pipe_format +varying_format(nir_alu_type t, unsigned ncomps) { - if (pan_is_bifrost(dev)) - return bifrost_compile_shader_nir(mem_ctx, nir, inputs); +#define VARYING_FORMAT(ntype, nsz, ptype, psz) \ + { \ + .type = nir_type_ ## ntype ## nsz, \ + .formats = { \ + PIPE_FORMAT_R ## psz ## _ ## ptype, \ + PIPE_FORMAT_R ## psz ## G ## psz ## _ ## ptype, \ + PIPE_FORMAT_R ## psz ## G ## psz ## B ## psz ## _ ## ptype, \ + PIPE_FORMAT_R ## psz ## G ## psz ## B ## psz ## A ## psz ## _ ## ptype, \ + } \ + } - return midgard_compile_shader_nir(mem_ctx, nir, inputs); + static const struct { + nir_alu_type type; + enum pipe_format formats[4]; + } conv[] = { + VARYING_FORMAT(float, 32, FLOAT, 32), + VARYING_FORMAT(int, 32, SINT, 32), + VARYING_FORMAT(uint, 32, UINT, 32), + VARYING_FORMAT(float, 16, FLOAT, 16), + VARYING_FORMAT(int, 16, SINT, 16), + VARYING_FORMAT(uint, 16, UINT, 16), + VARYING_FORMAT(int, 8, SINT, 8), + VARYING_FORMAT(uint, 8, UINT, 8), + VARYING_FORMAT(bool, 32, UINT, 32), + VARYING_FORMAT(bool, 16, UINT, 16), + VARYING_FORMAT(bool, 8, UINT, 8), + VARYING_FORMAT(bool, 1, UINT, 8), + }; +#undef VARYING_FORMAT + + assert(ncomps > 0 && ncomps <= ARRAY_SIZE(conv[0].formats)); + + for (unsigned i = 0; i < ARRAY_SIZE(conv); i++) { + if (conv[i].type == t) + return conv[i].formats[ncomps - 1]; + } + + return PIPE_FORMAT_NONE; +} + +static void +collect_varyings(nir_shader *s, nir_variable_mode varying_mode, + struct pan_shader_varying *varyings, + unsigned *varying_count) +{ + *varying_count = 0; + + nir_foreach_variable_with_modes(var, s, varying_mode) { + unsigned loc = var->data.driver_location; + unsigned sz = glsl_count_attribute_slots(var->type, FALSE); + const struct glsl_type *column = + glsl_without_array_or_matrix(var->type); + unsigned chan = glsl_get_components(column); + enum glsl_base_type base_type = glsl_get_base_type(column); + + /* If we have a fractional location added, we need to increase the size + * so it will fit, i.e. a vec3 in YZW requires us to allocate a vec4. + * We could do better but this is an edge case as it is, normally + * packed varyings will be aligned. + */ + chan += var->data.location_frac; + assert(chan >= 1 && chan <= 4); + + nir_alu_type type = nir_get_nir_type_for_glsl_base_type(base_type); + + type = nir_alu_type_get_base_type(type); + + /* Demote to fp16 where possible. int16 varyings are TODO as the hw + * will saturate instead of wrap which is not conformant, so we need to + * insert i2i16/u2u16 instructions before the st_vary_32i/32u to get + * the intended behaviour. + */ + if (type == nir_type_float && + (var->data.precision == GLSL_PRECISION_MEDIUM || + var->data.precision == GLSL_PRECISION_LOW)) { + type |= 16; + } else { + type |= 32; + } + + enum pipe_format format = varying_format(type, chan); + assert(format != PIPE_FORMAT_NONE); + + for (int c = 0; c < sz; ++c) { + varyings[loc + c].location = var->data.location + c; + varyings[loc + c].format = format; + } + + *varying_count = MAX2(*varying_count, loc + sz); + } +} + +void +pan_shader_compile(const struct panfrost_device *dev, + nir_shader *s, + const struct panfrost_compile_inputs *inputs, + struct util_dynarray *binary, + struct pan_shader_info *info) +{ + memset(info, 0, sizeof(*info)); + + if (pan_is_bifrost(dev)) + bifrost_compile_shader_nir(s, inputs, binary, info); + else + midgard_compile_shader_nir(s, inputs, binary, info); + + info->stage = s->info.stage; + info->contains_barrier = s->info.uses_memory_barrier || + s->info.uses_control_barrier; + + switch (info->stage) { + case MESA_SHADER_VERTEX: + info->attribute_count = util_bitcount64(s->info.inputs_read); + + bool vertex_id = BITSET_TEST(s->info.system_values_read, + SYSTEM_VALUE_VERTEX_ID); + if (vertex_id) + info->attribute_count = MAX2(info->attribute_count, PAN_VERTEX_ID + 1); + + bool instance_id = BITSET_TEST(s->info.system_values_read, + SYSTEM_VALUE_INSTANCE_ID); + if (instance_id) + info->attribute_count = MAX2(info->attribute_count, PAN_INSTANCE_ID + 1); + + info->vs.writes_point_size = + s->info.outputs_written & (1 << VARYING_SLOT_PSIZ); + collect_varyings(s, nir_var_shader_out, info->varyings.output, + &info->varyings.output_count); + break; + case MESA_SHADER_FRAGMENT: + if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) + info->fs.writes_depth = true; + if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) + info->fs.writes_stencil = true; + + uint64_t outputs_read = s->info.outputs_read; + if (outputs_read & BITFIELD64_BIT(FRAG_RESULT_COLOR)) + outputs_read |= BITFIELD64_BIT(FRAG_RESULT_DATA0); + + info->fs.outputs_read = outputs_read >> FRAG_RESULT_DATA0; + + /* EXT_shader_framebuffer_fetch requires per-sample */ + info->fs.sample_shading = s->info.fs.uses_sample_shading || + outputs_read; + + info->fs.can_discard = s->info.fs.uses_discard; + info->fs.helper_invocations = s->info.fs.needs_quad_helper_invocations; + + /* List of reasons we need to execute frag shaders when things + * are masked off */ + + info->fs.sidefx = s->info.writes_memory || + s->info.fs.uses_discard || + s->info.fs.uses_demote; + info->fs.reads_frag_coord = + (s->info.inputs_read & (1 << VARYING_SLOT_POS)) || + BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD); + info->fs.reads_point_coord = + s->info.inputs_read & (1 << VARYING_SLOT_PNTC); + info->fs.reads_face = + (s->info.inputs_read & (1 << VARYING_SLOT_FACE)) || + BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRONT_FACE); + info->fs.reads_sample_id = + BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_SAMPLE_ID); + info->fs.reads_sample_pos = + BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS); + info->fs.reads_sample_mask_in = + BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN); + info->fs.reads_helper_invocation = + BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_HELPER_INVOCATION); + collect_varyings(s, nir_var_shader_in, info->varyings.input, + &info->varyings.input_count); + break; + case MESA_SHADER_COMPUTE: + info->wls_size = s->info.cs.shared_size; + break; + default: + unreachable("Unknown shader state"); + } + + info->outputs_written = s->info.outputs_written; + + /* Sysvals have dedicated UBO */ + info->ubo_count = s->info.num_ubos + (info->sysvals.sysval_count ? 1 : 0); + + info->attribute_count += util_bitcount(s->info.images_used); + info->writes_global = s->info.writes_memory; + + info->texture_count = s->info.num_textures; } diff --git a/src/panfrost/lib/pan_shader.h b/src/panfrost/lib/pan_shader.h index 7b3501da444..b67d5a2e596 100644 --- a/src/panfrost/lib/pan_shader.h +++ b/src/panfrost/lib/pan_shader.h @@ -33,9 +33,11 @@ struct panfrost_device; const nir_shader_compiler_options * pan_shader_get_compiler_options(const struct panfrost_device *dev); -panfrost_program * +void pan_shader_compile(const struct panfrost_device *dev, - void *mem_ctx, nir_shader *nir, - const struct panfrost_compile_inputs *inputs); + nir_shader *nir, + const struct panfrost_compile_inputs *inputs, + struct util_dynarray *binary, + struct pan_shader_info *info); #endif diff --git a/src/panfrost/midgard/compiler.h b/src/panfrost/midgard/compiler.h index fdb8160d203..f804ee0f53c 100644 --- a/src/panfrost/midgard/compiler.h +++ b/src/panfrost/midgard/compiler.h @@ -238,6 +238,7 @@ enum midgard_rt_id { typedef struct compiler_context { const struct panfrost_compile_inputs *inputs; nir_shader *nir; + struct pan_shader_info *info; gl_shader_stage stage; /* Number of samples for a keyed blend shader. Depends on is_blend */ @@ -249,9 +250,6 @@ typedef struct compiler_context { /* Index to precolour to r2 for a dual-source blend colour */ unsigned blend_src1; - /* Number of bytes used for Thread Local Storage */ - unsigned tls_size; - /* Count of spills and fills for shaderdb */ unsigned spills; unsigned fills; @@ -291,10 +289,6 @@ typedef struct compiler_context { /* Set of NIR indices that were already emitted as outmods */ BITSET_WORD *already_emitted; - /* Just the count of the max register used. Higher count => higher - * register pressure */ - int work_registers; - /* The number of uniforms allowable for the fast path */ int uniform_cutoff; @@ -312,9 +306,7 @@ typedef struct compiler_context { /* Writeout instructions for each render target */ midgard_instruction *writeout_branch[MIDGARD_NUM_RTS][MIDGARD_MAX_SAMPLE_ITER]; - struct panfrost_sysvals sysvals; struct hash_table_u64 *sysval_to_id; - struct panfrost_ubo_push *push; } compiler_context; /* Per-block live_in/live_out */ diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c index 16617e0b8fc..7f6c18a26fc 100644 --- a/src/panfrost/midgard/midgard_compile.c +++ b/src/panfrost/midgard/midgard_compile.c @@ -1448,7 +1448,7 @@ emit_sysval_read(compiler_context *ctx, nir_instr *instr, int sysval = panfrost_sysval_for_instr(instr, &nir_dest); unsigned dest = nir_dest_index(&nir_dest); unsigned uniform = - pan_lookup_sysval(ctx->sysval_to_id, &ctx->sysvals, sysval); + pan_lookup_sysval(ctx->sysval_to_id, &ctx->info->sysvals, sysval); /* Emit the read itself -- this is never indirect */ midgard_instruction *ins = @@ -2978,24 +2978,22 @@ mir_add_writeout_loops(compiler_context *ctx) } } -panfrost_program * -midgard_compile_shader_nir(void *mem_ctx, nir_shader *nir, - const struct panfrost_compile_inputs *inputs) +void +midgard_compile_shader_nir(nir_shader *nir, + const struct panfrost_compile_inputs *inputs, + struct util_dynarray *binary, + struct pan_shader_info *info) { - panfrost_program *program = rzalloc(mem_ctx, panfrost_program); - - struct util_dynarray *compiled = &program->compiled; - midgard_debug = debug_get_option_midgard_debug(); /* TODO: Bound against what? */ compiler_context *ctx = rzalloc(NULL, compiler_context); - ctx->sysval_to_id = panfrost_init_sysvals(&ctx->sysvals, ctx); + ctx->sysval_to_id = panfrost_init_sysvals(&info->sysvals, ctx); ctx->inputs = inputs; ctx->nir = nir; + ctx->info = info; ctx->stage = nir->info.stage; - ctx->push = &program->push; if (inputs->is_blend) { unsigned nr_samples = MAX2(inputs->blend.nr_samples, 1); @@ -3013,7 +3011,7 @@ midgard_compile_shader_nir(void *mem_ctx, nir_shader *nir, /* Start off with a safe cutoff, allowing usage of all 16 work * registers. Later, we'll promote uniform reads to uniform registers * if we determine it is beneficial to do so */ - ctx->uniform_cutoff = 8; + info->midgard.uniform_cutoff = 8; /* Initialize at a global (not block) level hash tables */ @@ -3059,7 +3057,7 @@ midgard_compile_shader_nir(void *mem_ctx, nir_shader *nir, nir_print_shader(nir, stdout); } - ctx->tls_size = nir->scratch_size; + info->tls_size = nir->scratch_size; nir_foreach_function(func, nir) { if (!func->impl) @@ -3086,8 +3084,6 @@ midgard_compile_shader_nir(void *mem_ctx, nir_shader *nir, break; /* TODO: Multi-function shaders */ } - util_dynarray_init(compiled, program); - /* Per-block lowering before opts */ mir_foreach_block(ctx, _block) { @@ -3164,7 +3160,7 @@ midgard_compile_shader_nir(void *mem_ctx, nir_shader *nir, if (!bundle->last_writeout && (current_bundle + 1 < bundle_count)) lookahead = source_order_bundles[current_bundle + 1]->tag; - emit_binary_bundle(ctx, block, bundle, compiled, lookahead); + emit_binary_bundle(ctx, block, bundle, binary, lookahead); ++current_bundle; } @@ -3175,20 +3171,11 @@ midgard_compile_shader_nir(void *mem_ctx, nir_shader *nir, free(source_order_bundles); /* Report the very first tag executed */ - program->first_tag = midgard_get_first_tag_from_block(ctx, 0); - - /* Deal with off-by-one related to the fencepost problem */ - program->work_register_count = ctx->work_registers + 1; - program->uniform_cutoff = ctx->uniform_cutoff; - - program->tls_size = ctx->tls_size; - - program->sysval_count = ctx->sysvals.sysval_count; - memcpy(program->sysvals, ctx->sysvals.sysvals, sizeof(ctx->sysvals.sysvals[0]) * ctx->sysvals.sysval_count); + info->midgard.first_tag = midgard_get_first_tag_from_block(ctx, 0); if ((midgard_debug & MIDGARD_DBG_SHADERS) && !nir->info.internal) { - disassemble_midgard(stdout, program->compiled.data, - program->compiled.size, inputs->gpu_id); + disassemble_midgard(stdout, binary->data, + binary->size, inputs->gpu_id); } if ((midgard_debug & MIDGARD_DBG_SHADERDB || inputs->shaderdb) && @@ -3209,7 +3196,7 @@ midgard_compile_shader_nir(void *mem_ctx, nir_shader *nir, /* Calculate thread count. There are certain cutoffs by * register count for thread count */ - unsigned nr_registers = program->work_register_count; + unsigned nr_registers = info->work_reg_count; unsigned nr_threads = (nr_registers <= 4) ? 4 : @@ -3232,6 +3219,4 @@ midgard_compile_shader_nir(void *mem_ctx, nir_shader *nir, } ralloc_free(ctx); - - return program; } diff --git a/src/panfrost/midgard/midgard_compile.h b/src/panfrost/midgard/midgard_compile.h index 00d43a64e90..f049fbabb6b 100644 --- a/src/panfrost/midgard/midgard_compile.h +++ b/src/panfrost/midgard/midgard_compile.h @@ -29,9 +29,11 @@ #include "util/u_dynarray.h" #include "panfrost/util/pan_ir.h" -panfrost_program * -midgard_compile_shader_nir(void *mem_ctx, nir_shader *nir, - const struct panfrost_compile_inputs *inputs); +void +midgard_compile_shader_nir(nir_shader *nir, + const struct panfrost_compile_inputs *inputs, + struct util_dynarray *binary, + struct pan_shader_info *info); /* NIR options are shared between the standalone compiler and the online * compiler. Defining it here is the simplest, though maybe not the Right diff --git a/src/panfrost/midgard/midgard_ra.c b/src/panfrost/midgard/midgard_ra.c index 37cecb1c339..44b3c7dc1c6 100644 --- a/src/panfrost/midgard/midgard_ra.c +++ b/src/panfrost/midgard/midgard_ra.c @@ -99,7 +99,7 @@ index_to_reg(compiler_context *ctx, struct lcra_state *l, unsigned reg, unsigned /* Report that we actually use this register, and return it */ if (r.reg < 16) - ctx->work_registers = MAX2(ctx->work_registers, r.reg); + ctx->info->work_reg_count = MAX2(ctx->info->work_reg_count, r.reg + 1); return r; } @@ -395,7 +395,7 @@ allocate_registers(compiler_context *ctx, bool *spilled) * uniforms start and the shader stage. By ABI we limit blend shaders * to 8 registers, should be lower XXX */ int work_count = ctx->inputs->is_blend ? 8 : - 16 - MAX2((ctx->uniform_cutoff - 8), 0); + 16 - MAX2((ctx->info->midgard.uniform_cutoff - 8), 0); /* No register allocation to do with no SSA */ @@ -646,7 +646,7 @@ allocate_registers(compiler_context *ctx, bool *spilled) if (ctx->blend_src1 != ~0) { assert(ctx->blend_src1 < ctx->temp_count); l->solutions[ctx->blend_src1] = (16 * 2); - ctx->work_registers = MAX2(ctx->work_registers, 2); + ctx->info->work_reg_count = MAX2(ctx->info->work_reg_count, 3); } mir_compute_interference(ctx, l); @@ -959,13 +959,14 @@ mir_spill_register( static void mir_demote_uniforms(compiler_context *ctx, unsigned new_cutoff) { - unsigned old_work_count = 16 - MAX2((ctx->uniform_cutoff - 8), 0); + unsigned old_work_count = + 16 - MAX2((ctx->info->midgard.uniform_cutoff - 8), 0); unsigned work_count = 16 - MAX2((new_cutoff - 8), 0); unsigned min_demote = SSA_FIXED_REGISTER(old_work_count); unsigned max_demote = SSA_FIXED_REGISTER(work_count); - ctx->uniform_cutoff = new_cutoff; + ctx->info->midgard.uniform_cutoff = new_cutoff; mir_foreach_block(ctx, _block) { midgard_block *block = (midgard_block *) _block; @@ -978,7 +979,7 @@ mir_demote_uniforms(compiler_context *ctx, unsigned new_cutoff) unsigned temp = make_compiler_temp(ctx); unsigned idx = (23 - SSA_REG_FROM_FIXED(ins->src[i])) * 4; - assert(idx < ctx->push->count); + assert(idx < ctx->info->push.count); midgard_instruction ld = { .type = TAG_LOAD_STORE_4, @@ -989,10 +990,10 @@ mir_demote_uniforms(compiler_context *ctx, unsigned new_cutoff) .swizzle = SWIZZLE_IDENTITY_4, .op = midgard_op_ld_ubo_int4, .load_store = { - .arg_1 = ctx->push->words[idx].ubo, + .arg_1 = ctx->info->push.words[idx].ubo, .arg_2 = 0x1E, }, - .constants.u32[0] = ctx->push->words[idx].offset + .constants.u32[0] = ctx->info->push.words[idx].offset }; mir_insert_instruction_before_scheduled(ctx, block, before, ld); @@ -1013,7 +1014,7 @@ mir_ra(compiler_context *ctx) int iter_count = 1000; /* max iterations */ /* Number of 128-bit slots in memory we've spilled into */ - unsigned spill_count = DIV_ROUND_UP(ctx->tls_size, 16); + unsigned spill_count = DIV_ROUND_UP(ctx->info->tls_size, 16); mir_create_pipeline_registers(ctx); @@ -1025,9 +1026,9 @@ mir_ra(compiler_context *ctx) /* It's a lot cheaper to demote uniforms to get more * work registers than to spill to TLS. */ if (l->spill_class == REG_CLASS_WORK && - ctx->uniform_cutoff > 8) { + ctx->info->midgard.uniform_cutoff > 8) { - mir_demote_uniforms(ctx, MAX2(ctx->uniform_cutoff - 4, 8)); + mir_demote_uniforms(ctx, MAX2(ctx->info->midgard.uniform_cutoff - 4, 8)); } else if (spill_node == -1) { fprintf(stderr, "ERROR: Failed to choose spill node\n"); lcra_free(l); @@ -1056,7 +1057,7 @@ mir_ra(compiler_context *ctx) /* Report spilling information. spill_count is in 128-bit slots (vec4 x * fp32), but tls_size is in bytes, so multiply by 16 */ - ctx->tls_size = spill_count * 16; + ctx->info->tls_size = spill_count * 16; install_registers(ctx, l); diff --git a/src/panfrost/midgard/mir_promote_uniforms.c b/src/panfrost/midgard/mir_promote_uniforms.c index b5e063e0600..744d88e540e 100644 --- a/src/panfrost/midgard/mir_promote_uniforms.c +++ b/src/panfrost/midgard/mir_promote_uniforms.c @@ -263,7 +263,7 @@ midgard_promote_uniforms(compiler_context *ctx) unsigned work_count = mir_work_heuristic(ctx, &analysis); unsigned promoted_count = 24 - work_count; - mir_pick_ubo(ctx->push, &analysis, promoted_count); + mir_pick_ubo(&ctx->info->push, &analysis, promoted_count); /* First, figure out special indices a priori so we don't recompute a lot */ BITSET_WORD *special = mir_special_indices(ctx); @@ -279,7 +279,7 @@ midgard_promote_uniforms(compiler_context *ctx) if (!BITSET_TEST(analysis.blocks[ubo].pushed, qword)) continue; /* Find where we pushed to, TODO: unaligned pushes to pack */ - unsigned base = pan_lookup_pushed_ubo(ctx->push, ubo, qword * 16); + unsigned base = pan_lookup_pushed_ubo(&ctx->info->push, ubo, qword * 16); assert((base & 0x3) == 0); unsigned address = base / 4; @@ -288,7 +288,8 @@ midgard_promote_uniforms(compiler_context *ctx) /* Should've taken into account when pushing */ assert(address < promoted_count); - ctx->uniform_cutoff = MAX2(ctx->uniform_cutoff, address + 1); + ctx->info->midgard.uniform_cutoff = + MAX2(ctx->info->midgard.uniform_cutoff, address + 1); unsigned promoted = SSA_FIXED_REGISTER(uniform_reg); /* We do need the move for safety for a non-SSA dest, or if diff --git a/src/panfrost/util/pan_ir.h b/src/panfrost/util/pan_ir.h index a48b3e92c97..1c094793f6d 100644 --- a/src/panfrost/util/pan_ir.h +++ b/src/panfrost/util/pan_ir.h @@ -115,40 +115,6 @@ pan_lookup_sysval(struct hash_table_u64 *sysval_to_id, int panfrost_sysval_for_instr(nir_instr *instr, nir_dest *dest); -typedef struct { - int work_register_count; - int uniform_cutoff; - - /* For Bifrost - output type for each RT */ - nir_alu_type blend_types[8]; - - /* For Bifrost - return address for blend instructions */ - uint32_t blend_ret_offsets[8]; - - /* Prepended before uniforms, mapping to SYSVAL_ names for the - * sysval */ - - unsigned sysval_count; - unsigned sysvals[MAX_SYSVAL_COUNT]; - - /* UBOs to push to Register Mapped Uniforms (Midgard) or Fast Access - * Uniforms (Bifrost) */ - struct panfrost_ubo_push push; - - int first_tag; - - struct util_dynarray compiled; - - /* The number of bytes to allocate per-thread for Thread Local Storage - * (register spilling), or zero if no spilling is used */ - unsigned tls_size; - - /* For Bifrost, should the program wait on dependency slots 6/7 before - * starting? For ATEST/BLEND in the first clause, which can occur with - * extremely simple shaders */ - bool wait_6, wait_7; -} panfrost_program; - struct panfrost_compile_inputs { unsigned gpu_id; bool is_blend, is_blit; @@ -163,6 +129,82 @@ struct panfrost_compile_inputs { enum pipe_format rt_formats[8]; }; +struct pan_shader_varying { + gl_varying_slot location; + enum pipe_format format; +}; + +struct bifrost_shader_blend_info { + nir_alu_type type; + uint32_t return_offset; +}; + +struct bifrost_shader_info { + struct bifrost_shader_blend_info blend[8]; + bool wait_6, wait_7; +}; + +struct midgard_shader_info { + unsigned uniform_cutoff; + unsigned first_tag; +}; + +struct pan_shader_info { + gl_shader_stage stage; + unsigned work_reg_count; + unsigned tls_size; + unsigned wls_size; + + union { + struct { + bool reads_frag_coord; + bool reads_point_coord; + bool reads_face; + bool helper_invocations; + bool can_discard; + bool writes_depth; + bool writes_stencil; + bool sidefx; + bool reads_sample_id; + bool reads_sample_pos; + bool reads_sample_mask_in; + bool reads_helper_invocation; + bool sample_shading; + BITSET_WORD outputs_read; + } fs; + + struct { + bool writes_point_size; + } vs; + }; + + bool contains_barrier; + bool writes_global; + uint64_t outputs_written; + + unsigned texture_count; + unsigned ubo_count; + unsigned attribute_count; + + struct { + unsigned input_count; + struct pan_shader_varying input[MAX_VARYING]; + unsigned output_count; + struct pan_shader_varying output[MAX_VARYING]; + } varyings; + + struct panfrost_sysvals sysvals; + + /* UBOs to push to Register Mapped Uniforms (Midgard) or Fast Access + * Uniforms (Bifrost) */ + struct panfrost_ubo_push push; + + union { + struct bifrost_shader_info bifrost; + struct midgard_shader_info midgard; + }; +}; + typedef struct pan_block { /* Link to next block. Must be first for mir_get_block */ struct list_head link;