/*
 * Copyright (C) 2020-2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *   Alyssa Rosenzweig
 *   Boris Brezillon
 */

#include "pan_blitter.h"
#include <math.h>
#include <stdio.h>
#include "compiler/nir/nir_builder.h"
#include "util/u_math.h"
#include "pan_blend.h"
#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_jc.h"
#include "pan_pool.h"
#include "pan_shader.h"
#include "pan_texture.h"

#if PAN_ARCH >= 6
/* On Midgard, the native blit infrastructure (via MFBD preloads) is broken or
 * missing in many cases. We instead use software paths as fallbacks to
 * implement blits, which are done as TILER jobs. No vertex shader is
 * necessary since we can supply screen-space coordinates directly.
 *
 * This is primarily designed as a fallback for preloads but could be extended
 * for other clears/blits if needed in the future. */

static enum mali_register_file_format
blit_type_to_reg_fmt(nir_alu_type in)
{
   switch (in) {
   case nir_type_float32:
      return MALI_REGISTER_FILE_FORMAT_F32;
   case nir_type_int32:
      return MALI_REGISTER_FILE_FORMAT_I32;
   case nir_type_uint32:
      return MALI_REGISTER_FILE_FORMAT_U32;
   default:
      unreachable("Invalid blit type");
   }
}
#endif

/* On Valhall, the driver gives the hardware a table of resource tables.
 * Resources are addressed as the index of the table together with the index of
 * the resource within the table. For simplicity, we put one type of resource
 * in each table and fix the numbering of the tables.
 *
 * This numbering is arbitrary.
 */
enum pan_blit_resource_table {
   PAN_BLIT_TABLE_ATTRIBUTE = 0,
   PAN_BLIT_TABLE_ATTRIBUTE_BUFFER,
   PAN_BLIT_TABLE_SAMPLER,
   PAN_BLIT_TABLE_TEXTURE,
   PAN_BLIT_NUM_RESOURCE_TABLES
};
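
/* A minimal sketch of how this layout is consumed (for illustration): with
 * one resource type per table, a shader-visible handle is just the
 * (table, index) pair, e.g. pan_res_handle(PAN_BLIT_TABLE_TEXTURE, 2)
 * addresses the third texture. The *_hw_index() helpers below wrap exactly
 * this for PAN_ARCH >= 9 and fall back to flat indices on older GPUs. */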

struct pan_blit_surface {
   gl_frag_result loc              : 4;
   nir_alu_type type               : 8;
   enum mali_texture_dimension dim : 2;
   bool array                      : 1;
   unsigned src_samples            : 5;
   unsigned dst_samples            : 5;
};

struct pan_blit_shader_key {
   struct pan_blit_surface surfaces[8];
};

struct pan_blit_shader_data {
   struct pan_blit_shader_key key;
   struct pan_shader_info info;
   mali_ptr address;
   unsigned blend_ret_offsets[8];
   nir_alu_type blend_types[8];
};

struct pan_blit_blend_shader_key {
   enum pipe_format format;
   nir_alu_type type;
   unsigned rt         : 3;
   unsigned nr_samples : 5;
   unsigned pad        : 24;
};

struct pan_blit_blend_shader_data {
   struct pan_blit_blend_shader_key key;
   mali_ptr address;
};

struct pan_blit_rsd_key {
   struct {
      enum pipe_format format;
      nir_alu_type type               : 8;
      unsigned src_samples            : 5;
      unsigned dst_samples            : 5;
      enum mali_texture_dimension dim : 2;
      bool array                      : 1;
   } rts[8], z, s;
};

struct pan_blit_rsd_data {
   struct pan_blit_rsd_key key;
   mali_ptr address;
};
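
/* Note: blit "blending" never actually blends. The descriptors emitted below
 * either disable the blend unit entirely (missing render target, mode OFF on
 * v6+) or program an opaque source copy: the fixed-function equation selects
 * the source colour unmodified and writes all four channels, with a
 * conversion from the shader register format to the tilebuffer memory format
 * on v6+. On v4/v5 a pre-compiled blend shader stands in for formats the
 * fixed-function path cannot handle. */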
#if PAN_ARCH >= 5
static void
pan_blitter_emit_blend(unsigned rt, const struct pan_image_view *iview,
                       const struct pan_blit_shader_data *blit_shader,
                       mali_ptr blend_shader, void *out)
{
   assert(blend_shader == 0 || PAN_ARCH <= 5);

   pan_pack(out, BLEND, cfg) {
      if (!iview) {
         cfg.enable = false;
#if PAN_ARCH >= 6
         cfg.internal.mode = MALI_BLEND_MODE_OFF;
#endif
         continue;
      }

      cfg.round_to_fb_precision = true;
      cfg.srgb = util_format_is_srgb(iview->format);
#if PAN_ARCH >= 6
      cfg.internal.mode = MALI_BLEND_MODE_OPAQUE;
#endif

      if (!blend_shader) {
         cfg.equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
         cfg.equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
         cfg.equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
         cfg.equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
         cfg.equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
         cfg.equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
         cfg.equation.color_mask = 0xf;

#if PAN_ARCH >= 6
         nir_alu_type type = blit_shader->key.surfaces[rt].type;

         cfg.internal.fixed_function.num_comps = 4;
         cfg.internal.fixed_function.conversion.memory_format =
            GENX(panfrost_dithered_format_from_pipe_format)(iview->format,
                                                            false);
         cfg.internal.fixed_function.conversion.register_format =
            blit_type_to_reg_fmt(type);

         cfg.internal.fixed_function.rt = rt;
#endif
      } else {
#if PAN_ARCH <= 5
         cfg.blend_shader = true;
         cfg.shader_pc = blend_shader;
#endif
      }
   }
}
#endif

struct pan_blitter_views {
   unsigned rt_count;
   const struct pan_image_view *src_rts[8];
   const struct pan_image_view *dst_rts[8];
   const struct pan_image_view *src_z;
   const struct pan_image_view *dst_z;
   const struct pan_image_view *src_s;
   const struct pan_image_view *dst_s;
};

static bool
pan_blitter_is_ms(struct pan_blitter_views *views)
{
   for (unsigned i = 0; i < views->rt_count; i++) {
      if (views->dst_rts[i]) {
         if (pan_image_view_get_nr_samples(views->dst_rts[i]) > 1)
            return true;
      }
   }

   if (views->dst_z && pan_image_view_get_nr_samples(views->dst_z) > 1)
      return true;

   if (views->dst_s && pan_image_view_get_nr_samples(views->dst_s) > 1)
      return true;

   return false;
}

#if PAN_ARCH >= 5
static void
pan_blitter_emit_blends(const struct pan_blit_shader_data *blit_shader,
                        struct pan_blitter_views *views,
                        mali_ptr *blend_shaders, void *out)
{
   for (unsigned i = 0; i < MAX2(views->rt_count, 1); ++i) {
      void *dest = out + pan_size(BLEND) * i;
      const struct pan_image_view *rt_view = views->dst_rts[i];
      mali_ptr blend_shader = blend_shaders ? blend_shaders[i] : 0;

      pan_blitter_emit_blend(i, rt_view, blit_shader, blend_shader, dest);
   }
}
#endif

#if PAN_ARCH <= 7
static void
pan_blitter_emit_rsd(const struct pan_blit_shader_data *blit_shader,
                     struct pan_blitter_views *views, mali_ptr *blend_shaders,
                     void *out)
{
   UNUSED bool zs = (views->dst_z || views->dst_s);
   bool ms = pan_blitter_is_ms(views);

   pan_pack(out, RENDERER_STATE, cfg) {
      assert(blit_shader->address);
      pan_shader_prepare_rsd(&blit_shader->info, blit_shader->address, &cfg);
      cfg.multisample_misc.sample_mask = 0xFFFF;
      cfg.multisample_misc.multisample_enable = ms;
      cfg.multisample_misc.evaluate_per_sample = ms;
      cfg.multisample_misc.depth_write_mask = views->dst_z != NULL;
      cfg.multisample_misc.depth_function = MALI_FUNC_ALWAYS;

      cfg.stencil_mask_misc.stencil_enable = views->dst_s != NULL;
      cfg.stencil_mask_misc.stencil_mask_front = 0xFF;
      cfg.stencil_mask_misc.stencil_mask_back = 0xFF;
      cfg.stencil_front.compare_function = MALI_FUNC_ALWAYS;
      cfg.stencil_front.stencil_fail = MALI_STENCIL_OP_REPLACE;
      cfg.stencil_front.depth_fail = MALI_STENCIL_OP_REPLACE;
      cfg.stencil_front.depth_pass = MALI_STENCIL_OP_REPLACE;
      cfg.stencil_front.mask = 0xFF;
      cfg.stencil_back = cfg.stencil_front;

#if PAN_ARCH >= 6
      if (zs) {
         /* Writing Z/S requires late updates */
         cfg.properties.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE;
         cfg.properties.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_LATE;
      } else {
         /* Skipping ATEST requires forcing Z/S */
         cfg.properties.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
         cfg.properties.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
      }

      /* However, while shaders writing Z/S can normally be killed, on v6
       * for frame shaders it can cause GPU timeouts, so only allow colour
       * blit shaders to be killed. */
      cfg.properties.allow_forward_pixel_to_kill = !zs;

      if (PAN_ARCH == 6)
         cfg.properties.allow_forward_pixel_to_be_killed = !zs;
#else
      mali_ptr blend_shader =
         blend_shaders
            ? panfrost_last_nonnull(blend_shaders, MAX2(views->rt_count, 1))
            : 0;
      cfg.properties.work_register_count = 4;
      cfg.properties.force_early_z = !zs;
      cfg.stencil_mask_misc.alpha_test_compare_function = MALI_FUNC_ALWAYS;

      /* Set even on v5 for erratum workaround */
#if PAN_ARCH == 5
      cfg.legacy_blend_shader = blend_shader;
#else
      cfg.stencil_mask_misc.write_enable = true;
      cfg.stencil_mask_misc.dither_disable = true;
      cfg.multisample_misc.blend_shader = !!blend_shader;
      cfg.blend_shader = blend_shader;
      if (!cfg.multisample_misc.blend_shader) {
         cfg.blend_equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
         cfg.blend_equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
         cfg.blend_equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
         cfg.blend_equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
         cfg.blend_equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
         cfg.blend_equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
         cfg.blend_constant = 0;

         if (views->dst_rts[0] != NULL) {
            cfg.stencil_mask_misc.srgb =
               util_format_is_srgb(views->dst_rts[0]->format);
            cfg.blend_equation.color_mask = 0xf;
         }
      }
#endif
#endif
   }

#if PAN_ARCH >= 5
   pan_blitter_emit_blends(blit_shader, views, blend_shaders,
                           out + pan_size(RENDERER_STATE));
#endif
}
#endif

#if PAN_ARCH <= 5
static void
pan_blitter_get_blend_shaders(struct pan_blitter_cache *cache,
                              unsigned rt_count,
                              const struct pan_image_view **rts,
                              const struct pan_blit_shader_data *blit_shader,
                              mali_ptr *blend_shaders)
{
   if (!rt_count)
      return;

   struct pan_blend_state blend_state = {
      .rt_count = rt_count,
   };

   for (unsigned i = 0; i < rt_count; i++) {
      if (!rts[i] || panfrost_blendable_formats_v7[rts[i]->format].internal)
         continue;

      struct pan_blit_blend_shader_key key = {
         .format = rts[i]->format,
         .rt = i,
         .nr_samples = pan_image_view_get_nr_samples(rts[i]),
         .type = blit_shader->blend_types[i],
      };

      pthread_mutex_lock(&cache->shaders.lock);
      struct hash_entry *he =
         _mesa_hash_table_search(cache->shaders.blend, &key);
      struct pan_blit_blend_shader_data *blend_shader = he ? he->data : NULL;
      if (blend_shader) {
         blend_shaders[i] = blend_shader->address;
         pthread_mutex_unlock(&cache->shaders.lock);
         continue;
      }

      blend_shader =
         rzalloc(cache->shaders.blend, struct pan_blit_blend_shader_data);
      blend_shader->key = key;

      blend_state.rts[i] = (struct pan_blend_rt_state){
         .format = rts[i]->format,
         .nr_samples = pan_image_view_get_nr_samples(rts[i]),
         .equation =
            {
               .blend_enable = false,
               .color_mask = 0xf,
            },
      };

      pthread_mutex_lock(&cache->blend_shader_cache->lock);
      struct pan_blend_shader_variant *b = GENX(pan_blend_get_shader_locked)(
         cache->blend_shader_cache, &blend_state,
         blit_shader->blend_types[i], nir_type_float32, /* unused */
         i);

      assert(b->work_reg_count <= 4);
      struct panfrost_ptr bin =
         pan_pool_alloc_aligned(cache->shaders.pool, b->binary.size, 64);
      memcpy(bin.cpu, b->binary.data, b->binary.size);

      blend_shader->address = bin.gpu | b->first_tag;
      pthread_mutex_unlock(&cache->blend_shader_cache->lock);
      _mesa_hash_table_insert(cache->shaders.blend, &blend_shader->key,
                              blend_shader);
      pthread_mutex_unlock(&cache->shaders.lock);
      blend_shaders[i] = blend_shader->address;
   }
}
#endif

/*
 * Early Mali GPUs did not respect sampler LOD clamps or bias, so the Midgard
 * compiler inserts lowering code with a load_sampler_lod_parameters_pan
 * sysval that we need to lower. Our samplers do not use LOD clamps or bias,
 * so we lower to the identity settings and let constant folding get rid of
 * the unnecessary lowering.
 */
static bool
lower_sampler_parameters(nir_builder *b, nir_intrinsic_instr *intr,
                         UNUSED void *data)
{
   if (intr->intrinsic != nir_intrinsic_load_sampler_lod_parameters_pan)
      return false;

   const nir_const_value constants[4] = {
      nir_const_value_for_float(0.0f, 32),     /* min_lod */
      nir_const_value_for_float(INFINITY, 32), /* max_lod */
      nir_const_value_for_float(0.0f, 32),     /* lod_bias */
   };

   b->cursor = nir_after_instr(&intr->instr);
   nir_def_rewrite_uses(&intr->def, nir_build_imm(b, 3, 32, constants));
   return true;
}

static uint32_t
sampler_hw_index(uint32_t index)
{
   return PAN_ARCH >= 9 ? pan_res_handle(PAN_BLIT_TABLE_SAMPLER, index)
                        : index;
}

static uint32_t
tex_hw_index(uint32_t index)
{
   return PAN_ARCH >= 9 ? pan_res_handle(PAN_BLIT_TABLE_TEXTURE, index)
                        : index;
}

static uint32_t
attr_hw_index(uint32_t index)
{
   return PAN_ARCH >= 9 ? pan_res_handle(PAN_BLIT_TABLE_ATTRIBUTE, index)
                        : index;
}

static const struct pan_blit_shader_data *
pan_blitter_get_blit_shader(struct pan_blitter_cache *cache,
                            const struct pan_blit_shader_key *key)
{
   pthread_mutex_lock(&cache->shaders.lock);
   struct hash_entry *he = _mesa_hash_table_search(cache->shaders.blit, key);
   struct pan_blit_shader_data *shader = he ? he->data : NULL;

   if (shader)
      goto out;

   unsigned coord_comps = 0;
   unsigned sig_offset = 0;
   char sig[256];
   bool first = true;
   for (unsigned i = 0; i < ARRAY_SIZE(key->surfaces); i++) {
      const char *type_str, *dim_str;

      if (key->surfaces[i].type == nir_type_invalid)
         continue;

      switch (key->surfaces[i].type) {
      case nir_type_float32:
         type_str = "float";
         break;
      case nir_type_uint32:
         type_str = "uint";
         break;
      case nir_type_int32:
         type_str = "int";
         break;
      default:
         unreachable("Invalid type\n");
      }

      switch (key->surfaces[i].dim) {
      case MALI_TEXTURE_DIMENSION_CUBE:
         dim_str = "cube";
         break;
      case MALI_TEXTURE_DIMENSION_1D:
         dim_str = "1D";
         break;
      case MALI_TEXTURE_DIMENSION_2D:
         dim_str = "2D";
         break;
      case MALI_TEXTURE_DIMENSION_3D:
         dim_str = "3D";
         break;
      default:
         unreachable("Invalid dim\n");
      }

      coord_comps = MAX2(coord_comps, (key->surfaces[i].dim ?: 3) +
                                         (key->surfaces[i].array ? 1 : 0));

      if (sig_offset >= sizeof(sig)) {
         first = false;
         continue;
      }

      sig_offset +=
         snprintf(sig + sig_offset, sizeof(sig) - sig_offset,
                  "%s[%s;%s;%s%s;src_samples=%d,dst_samples=%d]",
                  first ? "" : ",", gl_frag_result_name(key->surfaces[i].loc),
                  type_str, dim_str, key->surfaces[i].array ? "[]" : "",
                  key->surfaces[i].src_samples, key->surfaces[i].dst_samples);

      first = false;
   }
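
   /* For example (illustrative only; the exact text depends on
    * gl_frag_result_name()), a single-sampled 2D depth blit would yield a
    * shader named something like
    * "pan_blit([FRAG_RESULT_DEPTH;float;2D;src_samples=1,dst_samples=1])". */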
   nir_builder b = nir_builder_init_simple_shader(
      MESA_SHADER_FRAGMENT, GENX(pan_shader_get_compiler_options)(),
      "pan_blit(%s)", sig);

   nir_def *barycentric = nir_load_barycentric(
      &b, nir_intrinsic_load_barycentric_pixel, INTERP_MODE_SMOOTH);
   nir_def *coord = nir_load_interpolated_input(
      &b, coord_comps, 32, barycentric, nir_imm_int(&b, 0),
      .base = attr_hw_index(0), .dest_type = nir_type_float32,
      .io_semantics.location = VARYING_SLOT_VAR0, .io_semantics.num_slots = 1);

   unsigned active_count = 0;
   for (unsigned i = 0; i < ARRAY_SIZE(key->surfaces); i++) {
      if (key->surfaces[i].type == nir_type_invalid)
         continue;

      /* Resolve operations only work for N -> 1 samples. */
      assert(key->surfaces[i].dst_samples == 1 ||
             key->surfaces[i].src_samples == key->surfaces[i].dst_samples);

      bool resolve =
         key->surfaces[i].src_samples > key->surfaces[i].dst_samples;
      bool ms = key->surfaces[i].src_samples > 1;

      enum glsl_sampler_dim sampler_dim;

      switch (key->surfaces[i].dim) {
      case MALI_TEXTURE_DIMENSION_1D:
         sampler_dim = GLSL_SAMPLER_DIM_1D;
         break;
      case MALI_TEXTURE_DIMENSION_2D:
         sampler_dim = ms ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D;
         break;
      case MALI_TEXTURE_DIMENSION_3D:
         sampler_dim = GLSL_SAMPLER_DIM_3D;
         break;
      case MALI_TEXTURE_DIMENSION_CUBE:
         sampler_dim = GLSL_SAMPLER_DIM_CUBE;
         break;
      }

      nir_def *res = NULL;

      if (resolve) {
         /* When resolving a float type, we need to calculate
          * the average of all samples. For integer resolve, GL
          * and Vulkan say that one sample should be chosen
          * without telling which. Let's just pick the first one
          * in that case.
          */
         nir_alu_type base_type =
            nir_alu_type_get_base_type(key->surfaces[i].type);
         unsigned nsamples = base_type == nir_type_float
                                ? key->surfaces[i].src_samples
                                : 1;

         for (unsigned s = 0; s < nsamples; s++) {
            nir_tex_instr *tex = nir_tex_instr_create(b.shader, 3);

            tex->op = nir_texop_txf_ms;
            tex->dest_type = key->surfaces[i].type;
            tex->texture_index = tex_hw_index(active_count);
            tex->sampler_index = sampler_hw_index(0);
            tex->is_array = key->surfaces[i].array;
            tex->sampler_dim = sampler_dim;

            tex->src[0] =
               nir_tex_src_for_ssa(nir_tex_src_coord, nir_f2i32(&b, coord));
            tex->coord_components = coord_comps;

            tex->src[1] =
               nir_tex_src_for_ssa(nir_tex_src_ms_index, nir_imm_int(&b, s));
            tex->src[2] =
               nir_tex_src_for_ssa(nir_tex_src_lod, nir_imm_int(&b, 0));
            nir_def_init(&tex->instr, &tex->def, 4, 32);
            nir_builder_instr_insert(&b, &tex->instr);

            res = res ? nir_fadd(&b, res, &tex->def) : &tex->def;
         }

         if (base_type == nir_type_float)
            res = nir_fmul_imm(&b, res, 1.0f / nsamples);
      } else {
         nir_tex_instr *tex = nir_tex_instr_create(b.shader, ms ? 3 : 1);

         tex->dest_type = key->surfaces[i].type;
         tex->texture_index = tex_hw_index(active_count);
         tex->sampler_index = sampler_hw_index(0);
         tex->is_array = key->surfaces[i].array;
         tex->sampler_dim = sampler_dim;

         if (ms) {
            tex->op = nir_texop_txf_ms;

            tex->src[0] =
               nir_tex_src_for_ssa(nir_tex_src_coord, nir_f2i32(&b, coord));
            tex->coord_components = coord_comps;

            tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_ms_index,
                                              nir_load_sample_id(&b));
            tex->src[2] =
               nir_tex_src_for_ssa(nir_tex_src_lod, nir_imm_int(&b, 0));
         } else {
            tex->op = nir_texop_txl;

            tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, coord);
            tex->coord_components = coord_comps;
         }

         nir_def_init(&tex->instr, &tex->def, 4, 32);
         nir_builder_instr_insert(&b, &tex->instr);
         res = &tex->def;
      }

      assert(res);

      if (key->surfaces[i].loc >= FRAG_RESULT_DATA0) {
         nir_store_output(
            &b, res, nir_imm_int(&b, 0), .base = active_count,
            .src_type = key->surfaces[i].type,
            .io_semantics.location = key->surfaces[i].loc,
            .io_semantics.num_slots = 1,
            .write_mask = nir_component_mask(res->num_components));
      } else {
         unsigned c = key->surfaces[i].loc == FRAG_RESULT_STENCIL ? 1 : 0;
         nir_store_output(
            &b, nir_channel(&b, res, c), nir_imm_int(&b, 0),
            .base = active_count, .src_type = key->surfaces[i].type,
            .io_semantics.location = key->surfaces[i].loc,
            .io_semantics.num_slots = 1,
            .write_mask = nir_component_mask(1));
      }
      active_count++;
   }

   struct panfrost_compile_inputs inputs = {
      .gpu_id = cache->gpu_id,
      .is_blit = true,
      .no_idvs = true,
   };
   struct util_dynarray binary;

   util_dynarray_init(&binary, NULL);

   shader = rzalloc(cache->shaders.blit, struct pan_blit_shader_data);

   nir_shader_gather_info(b.shader, nir_shader_get_entrypoint(b.shader));

   for (unsigned i = 0; i < active_count; ++i)
      BITSET_SET(b.shader->info.textures_used, i);

   pan_shader_preprocess(b.shader, inputs.gpu_id);

   if (PAN_ARCH == 4) {
      NIR_PASS_V(b.shader, nir_shader_intrinsics_pass,
                 lower_sampler_parameters, nir_metadata_control_flow, NULL);
   }

   GENX(pan_shader_compile)(b.shader, &inputs, &binary, &shader->info);

   shader->key = *key;
   shader->address =
      pan_pool_upload_aligned(cache->shaders.pool, binary.data, binary.size,
                              PAN_ARCH >= 6 ? 128 : 64);

   util_dynarray_fini(&binary);
   ralloc_free(b.shader);

#if PAN_ARCH >= 6
   for (unsigned i = 0; i < ARRAY_SIZE(shader->blend_ret_offsets); i++) {
      shader->blend_ret_offsets[i] =
         shader->info.bifrost.blend[i].return_offset;
      shader->blend_types[i] = shader->info.bifrost.blend[i].type;
   }
#endif

   _mesa_hash_table_insert(cache->shaders.blit, &shader->key, shader);

out:
   pthread_mutex_unlock(&cache->shaders.lock);
   return shader;
}
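
/* Slot convention for blit shader keys (implied by the assert in
 * pan_blitter_get_rsd): depth always occupies surfaces[0] and stencil
 * surfaces[1], while colour blits use surfaces[i] for render target i. The
 * two uses never collide because Z/S and colour blits are mutually
 * exclusive. */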
static struct pan_blit_shader_key
pan_blitter_get_key(struct pan_blitter_views *views)
{
   struct pan_blit_shader_key key = {0};

   if (views->src_z) {
      assert(views->dst_z);
      key.surfaces[0].loc = FRAG_RESULT_DEPTH;
      key.surfaces[0].type = nir_type_float32;
      key.surfaces[0].src_samples =
         pan_image_view_get_nr_samples(views->src_z);
      key.surfaces[0].dst_samples =
         pan_image_view_get_nr_samples(views->dst_z);
      key.surfaces[0].dim = views->src_z->dim;
      key.surfaces[0].array =
         views->src_z->first_layer != views->src_z->last_layer;
   }

   if (views->src_s) {
      assert(views->dst_s);
      key.surfaces[1].loc = FRAG_RESULT_STENCIL;
      key.surfaces[1].type = nir_type_uint32;
      key.surfaces[1].src_samples =
         pan_image_view_get_nr_samples(views->src_s);
      key.surfaces[1].dst_samples =
         pan_image_view_get_nr_samples(views->dst_s);
      key.surfaces[1].dim = views->src_s->dim;
      key.surfaces[1].array =
         views->src_s->first_layer != views->src_s->last_layer;
   }

   for (unsigned i = 0; i < views->rt_count; i++) {
      if (!views->src_rts[i])
         continue;

      assert(views->dst_rts[i]);
      key.surfaces[i].loc = FRAG_RESULT_DATA0 + i;
      key.surfaces[i].type =
         util_format_is_pure_uint(views->src_rts[i]->format)   ? nir_type_uint32
         : util_format_is_pure_sint(views->src_rts[i]->format) ? nir_type_int32
                                                               : nir_type_float32;
      key.surfaces[i].src_samples =
         pan_image_view_get_nr_samples(views->src_rts[i]);
      key.surfaces[i].dst_samples =
         pan_image_view_get_nr_samples(views->dst_rts[i]);
      key.surfaces[i].dim = views->src_rts[i]->dim;
      key.surfaces[i].array =
         views->src_rts[i]->first_layer != views->src_rts[i]->last_layer;
   }

   return key;
}
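
/* RSDs are immutable once emitted, so they are cached on everything that can
 * change their contents: per-surface destination format, source/destination
 * sample counts, register type, texture dimension and arrayedness. */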
#if PAN_ARCH <= 7
static mali_ptr
pan_blitter_get_rsd(struct pan_blitter_cache *cache,
                    struct pan_blitter_views *views)
{
   struct pan_blit_rsd_key rsd_key = {0};

   assert(!views->rt_count || (!views->src_z && !views->src_s));

   struct pan_blit_shader_key blit_key = pan_blitter_get_key(views);

   if (views->src_z) {
      assert(views->dst_z);
      rsd_key.z.format = views->dst_z->format;
      rsd_key.z.type = blit_key.surfaces[0].type;
      rsd_key.z.src_samples = blit_key.surfaces[0].src_samples;
      rsd_key.z.dst_samples = blit_key.surfaces[0].dst_samples;
      rsd_key.z.dim = blit_key.surfaces[0].dim;
      rsd_key.z.array = blit_key.surfaces[0].array;
   }

   if (views->src_s) {
      assert(views->dst_s);
      rsd_key.s.format = views->dst_s->format;
      rsd_key.s.type = blit_key.surfaces[1].type;
      rsd_key.s.src_samples = blit_key.surfaces[1].src_samples;
      rsd_key.s.dst_samples = blit_key.surfaces[1].dst_samples;
      rsd_key.s.dim = blit_key.surfaces[1].dim;
      rsd_key.s.array = blit_key.surfaces[1].array;
   }

   for (unsigned i = 0; i < views->rt_count; i++) {
      if (!views->src_rts[i])
         continue;

      assert(views->dst_rts[i]);
      rsd_key.rts[i].format = views->dst_rts[i]->format;
      rsd_key.rts[i].type = blit_key.surfaces[i].type;
      rsd_key.rts[i].src_samples = blit_key.surfaces[i].src_samples;
      rsd_key.rts[i].dst_samples = blit_key.surfaces[i].dst_samples;
      rsd_key.rts[i].dim = blit_key.surfaces[i].dim;
      rsd_key.rts[i].array = blit_key.surfaces[i].array;
   }

   pthread_mutex_lock(&cache->rsds.lock);
   struct hash_entry *he = _mesa_hash_table_search(cache->rsds.rsds, &rsd_key);
   struct pan_blit_rsd_data *rsd = he ? he->data : NULL;
   if (rsd)
      goto out;

   rsd = rzalloc(cache->rsds.rsds, struct pan_blit_rsd_data);
   rsd->key = rsd_key;

#if PAN_ARCH == 4
   struct panfrost_ptr rsd_ptr =
      pan_pool_alloc_desc(cache->rsds.pool, RENDERER_STATE);
#else
   unsigned bd_count = PAN_ARCH >= 5 ? MAX2(views->rt_count, 1) : 0;
   struct panfrost_ptr rsd_ptr = pan_pool_alloc_desc_aggregate(
      cache->rsds.pool, PAN_DESC(RENDERER_STATE),
      PAN_DESC_ARRAY(bd_count, BLEND));
#endif

   mali_ptr blend_shaders[8] = {0};

   const struct pan_blit_shader_data *blit_shader =
      pan_blitter_get_blit_shader(cache, &blit_key);

#if PAN_ARCH <= 5
   pan_blitter_get_blend_shaders(cache, views->rt_count, views->dst_rts,
                                 blit_shader, blend_shaders);
#endif

   pan_blitter_emit_rsd(blit_shader, views, blend_shaders, rsd_ptr.cpu);
   rsd->address = rsd_ptr.gpu;
   _mesa_hash_table_insert(cache->rsds.rsds, &rsd->key, rsd);

out:
   pthread_mutex_unlock(&cache->rsds.lock);
   return rsd->address;
}

static mali_ptr
pan_blit_get_rsd(struct pan_blitter_cache *cache,
                 const struct pan_image_view *src_views,
                 const struct pan_image_view *dst_view)
{
   const struct util_format_description *desc =
      util_format_description(src_views[0].format);
   struct pan_blitter_views views = {};

   if (util_format_has_depth(desc)) {
      views.src_z = &src_views[0];
      views.dst_z = dst_view;
   }

   if (src_views[1].format) {
      views.src_s = &src_views[1];
      views.dst_s = dst_view;
   } else if (util_format_has_stencil(desc)) {
      views.src_s = &src_views[0];
      views.dst_s = dst_view;
   }

   if (!views.src_z && !views.src_s) {
      views.rt_count = 1;
      views.src_rts[0] = src_views;
      views.dst_rts[0] = dst_view;
   }

   return pan_blitter_get_rsd(cache, &views);
}
#endif
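
/* Gather the views to preload. For combined depth/stencil formats, the
 * stencil component has to be sampled through a stencil-only "X" variant of
 * the format (e.g. Z24_UNORM_S8_UINT is read back as X24S8_UINT), so a
 * patched copy of the Z/S view is used for the stencil texture. */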
static struct pan_blitter_views
pan_preload_get_views(const struct pan_fb_info *fb, bool zs,
                      struct pan_image_view *patched_s)
{
   struct pan_blitter_views views = {0};

   if (zs) {
      if (fb->zs.preload.z)
         views.src_z = views.dst_z = fb->zs.view.zs;

      if (fb->zs.preload.s) {
         const struct pan_image_view *view = fb->zs.view.s ?: fb->zs.view.zs;
         enum pipe_format fmt = util_format_get_depth_only(view->format);

         switch (view->format) {
         case PIPE_FORMAT_Z24_UNORM_S8_UINT:
            fmt = PIPE_FORMAT_X24S8_UINT;
            break;
         case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
            fmt = PIPE_FORMAT_X32_S8X24_UINT;
            break;
         default:
            fmt = view->format;
            break;
         }

         if (fmt != view->format) {
            *patched_s = *view;
            patched_s->format = fmt;
            views.src_s = views.dst_s = patched_s;
         } else {
            views.src_s = views.dst_s = view;
         }
      }
   } else {
      for (unsigned i = 0; i < fb->rt_count; i++) {
         if (fb->rts[i].preload) {
            views.src_rts[i] = fb->rts[i].view;
            views.dst_rts[i] = fb->rts[i].view;
         }
      }

      views.rt_count = fb->rt_count;
   }

   return views;
}

static bool
pan_preload_needed(const struct pan_fb_info *fb, bool zs)
{
   if (zs) {
      if (fb->zs.preload.z || fb->zs.preload.s)
         return true;
   } else {
      for (unsigned i = 0; i < fb->rt_count; i++) {
         if (fb->rts[i].preload)
            return true;
      }
   }

   return false;
}

static mali_ptr
pan_blitter_emit_varying(struct pan_pool *pool)
{
   struct panfrost_ptr varying = pan_pool_alloc_desc(pool, ATTRIBUTE);

   pan_pack(varying.cpu, ATTRIBUTE, cfg) {
      cfg.buffer_index = 0;
      cfg.offset_enable = PAN_ARCH <= 5;
      cfg.format =
         GENX(panfrost_format_from_pipe_format)(PIPE_FORMAT_R32G32B32_FLOAT)
            ->hw;

#if PAN_ARCH >= 9
      cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
      cfg.table = PAN_BLIT_TABLE_ATTRIBUTE_BUFFER;
      cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
      cfg.stride = 4 * sizeof(float);
#endif
   }

   return varying.gpu;
}

static mali_ptr
pan_blitter_emit_varying_buffer(struct pan_pool *pool, mali_ptr coordinates)
{
#if PAN_ARCH >= 9
   struct panfrost_ptr varying_buffer = pan_pool_alloc_desc(pool, BUFFER);

   pan_pack(varying_buffer.cpu, BUFFER, cfg) {
      cfg.address = coordinates;
      cfg.size = 4 * sizeof(float) * 4;
   }
#else
   /* Bifrost needs an empty desc to mark end of prefetching */
   bool padding_buffer = PAN_ARCH >= 6;

   struct panfrost_ptr varying_buffer = pan_pool_alloc_desc_array(
      pool, (padding_buffer ? 2 : 1), ATTRIBUTE_BUFFER);

   pan_pack(varying_buffer.cpu, ATTRIBUTE_BUFFER, cfg) {
      cfg.pointer = coordinates;
      cfg.stride = 4 * sizeof(float);
      cfg.size = cfg.stride * 4;
   }

   if (padding_buffer) {
      pan_pack(varying_buffer.cpu + pan_size(ATTRIBUTE_BUFFER),
               ATTRIBUTE_BUFFER, cfg)
         ;
   }
#endif

   return varying_buffer.gpu;
}

static mali_ptr
pan_blitter_emit_sampler(struct pan_pool *pool, bool nearest_filter)
{
   struct panfrost_ptr sampler = pan_pool_alloc_desc(pool, SAMPLER);

   pan_pack(sampler.cpu, SAMPLER, cfg) {
      cfg.seamless_cube_map = false;
      cfg.normalized_coordinates = false;
      cfg.minify_nearest = nearest_filter;
      cfg.magnify_nearest = nearest_filter;
   }

   return sampler.gpu;
}
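
/* Texture descriptor layout differs across generations: on v6+ the
 * descriptors are packed contiguously and referenced directly, while Midgard
 * consumes an array of pointers to individually allocated descriptors, hence
 * the extra indirection/upload in the PAN_ARCH < 6 path below. */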
static mali_ptr
pan_blitter_emit_textures(struct pan_pool *pool, unsigned tex_count,
                          const struct pan_image_view **views)
{
#if PAN_ARCH >= 6
   struct panfrost_ptr textures =
      pan_pool_alloc_desc_array(pool, tex_count, TEXTURE);

   for (unsigned i = 0; i < tex_count; i++) {
      void *texture = textures.cpu + (pan_size(TEXTURE) * i);
      size_t payload_size =
         GENX(panfrost_estimate_texture_payload_size)(views[i]);
      struct panfrost_ptr surfaces =
         pan_pool_alloc_aligned(pool, payload_size, 64);

      GENX(panfrost_new_texture)(views[i], texture, &surfaces);
   }

   return textures.gpu;
#else
   mali_ptr textures[8] = {0};

   for (unsigned i = 0; i < tex_count; i++) {
      size_t sz = pan_size(TEXTURE) +
                  GENX(panfrost_estimate_texture_payload_size)(views[i]);
      struct panfrost_ptr texture =
         pan_pool_alloc_aligned(pool, sz, pan_alignment(TEXTURE));
      struct panfrost_ptr surfaces = {
         .cpu = texture.cpu + pan_size(TEXTURE),
         .gpu = texture.gpu + pan_size(TEXTURE),
      };

      GENX(panfrost_new_texture)(views[i], texture.cpu, &surfaces);
      textures[i] = texture.gpu;
   }

   return pan_pool_upload_aligned(pool, textures,
                                  tex_count * sizeof(mali_ptr),
                                  sizeof(mali_ptr));
#endif
}

static mali_ptr
pan_preload_emit_textures(struct pan_pool *pool, const struct pan_fb_info *fb,
                          bool zs, unsigned *tex_count_out)
{
   const struct pan_image_view *views[8];
   struct pan_image_view patched_s_view;
   unsigned tex_count = 0;

   if (zs) {
      if (fb->zs.preload.z)
         views[tex_count++] = fb->zs.view.zs;

      if (fb->zs.preload.s) {
         const struct pan_image_view *view = fb->zs.view.s ?: fb->zs.view.zs;
         enum pipe_format fmt = util_format_get_depth_only(view->format);

         switch (view->format) {
         case PIPE_FORMAT_Z24_UNORM_S8_UINT:
            fmt = PIPE_FORMAT_X24S8_UINT;
            break;
         case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
            fmt = PIPE_FORMAT_X32_S8X24_UINT;
            break;
         default:
            fmt = view->format;
            break;
         }

         if (fmt != view->format) {
            patched_s_view = *view;
            patched_s_view.format = fmt;
            view = &patched_s_view;
         }
         views[tex_count++] = view;
      }
   } else {
      for (unsigned i = 0; i < fb->rt_count; i++) {
         if (fb->rts[i].preload)
            views[tex_count++] = fb->rts[i].view;
      }
   }

   *tex_count_out = tex_count;

   return pan_blitter_emit_textures(pool, tex_count, views);
}

#if PAN_ARCH >= 8
/* TODO: cache */
static mali_ptr
pan_blitter_emit_zs(struct pan_pool *pool, bool z, bool s)
{
   struct panfrost_ptr zsd = pan_pool_alloc_desc(pool, DEPTH_STENCIL);

   pan_pack(zsd.cpu, DEPTH_STENCIL, cfg) {
      cfg.depth_function = MALI_FUNC_ALWAYS;
      cfg.depth_write_enable = z;

      if (z)
         cfg.depth_source = MALI_DEPTH_SOURCE_SHADER;

      cfg.stencil_test_enable = s;
      cfg.stencil_from_shader = s;

      cfg.front_compare_function = MALI_FUNC_ALWAYS;
      cfg.front_stencil_fail = MALI_STENCIL_OP_REPLACE;
      cfg.front_depth_fail = MALI_STENCIL_OP_REPLACE;
      cfg.front_depth_pass = MALI_STENCIL_OP_REPLACE;
      cfg.front_write_mask = 0xFF;
      cfg.front_value_mask = 0xFF;

      cfg.back_compare_function = MALI_FUNC_ALWAYS;
      cfg.back_stencil_fail = MALI_STENCIL_OP_REPLACE;
      cfg.back_depth_fail = MALI_STENCIL_OP_REPLACE;
      cfg.back_depth_pass = MALI_STENCIL_OP_REPLACE;
      cfg.back_write_mask = 0xFF;
      cfg.back_value_mask = 0xFF;

      cfg.depth_cull_enable = false;
   }

   return zsd.gpu;
}
#else
static mali_ptr
pan_blitter_emit_viewport(struct pan_pool *pool, uint16_t minx, uint16_t miny,
                          uint16_t maxx, uint16_t maxy)
{
   struct panfrost_ptr vp = pan_pool_alloc_desc(pool, VIEWPORT);

   pan_pack(vp.cpu, VIEWPORT, cfg) {
      cfg.scissor_minimum_x = minx;
      cfg.scissor_minimum_y = miny;
      cfg.scissor_maximum_x = maxx;
      cfg.scissor_maximum_y = maxy;
   }

   return vp.gpu;
}
#endif

static void
pan_preload_emit_dcd(struct pan_blitter_cache *cache, struct pan_pool *pool,
                     struct pan_fb_info *fb, bool zs, mali_ptr coordinates,
                     mali_ptr tsd, void *out, bool always_write)
{
   unsigned tex_count = 0;
   mali_ptr textures = pan_preload_emit_textures(pool, fb, zs, &tex_count);
   mali_ptr samplers = pan_blitter_emit_sampler(pool, true);
   mali_ptr varyings = pan_blitter_emit_varying(pool);
   mali_ptr varying_buffers =
      pan_blitter_emit_varying_buffer(pool, coordinates);

   /* Tiles updated by blit shaders are still considered clean (separate
    * for colour and Z/S), allowing us to suppress unnecessary writeback.
    */
   UNUSED bool clean_fragment_write = !always_write;

   /* Image view used when patching stencil formats for combined
    * depth/stencil preloads.
    */
   struct pan_image_view patched_s;

   struct pan_blitter_views views = pan_preload_get_views(fb, zs, &patched_s);

#if PAN_ARCH <= 7
   pan_pack(out, DRAW, cfg) {
      uint16_t minx = 0, miny = 0, maxx, maxy;

      if (PAN_ARCH == 4) {
         maxx = fb->width - 1;
         maxy = fb->height - 1;
      } else {
         /* Align on 32x32 tiles */
         minx = fb->extent.minx & ~31;
         miny = fb->extent.miny & ~31;
         maxx = MIN2(ALIGN_POT(fb->extent.maxx + 1, 32), fb->width) - 1;
         maxy = MIN2(ALIGN_POT(fb->extent.maxy + 1, 32), fb->height) - 1;
      }

      cfg.thread_storage = tsd;
      cfg.state = pan_blitter_get_rsd(cache, &views);

      cfg.position = coordinates;
      cfg.viewport = pan_blitter_emit_viewport(pool, minx, miny, maxx, maxy);

      cfg.varyings = varyings;
      cfg.varying_buffers = varying_buffers;
      cfg.textures = textures;
      cfg.samplers = samplers;

#if PAN_ARCH >= 6
      cfg.clean_fragment_write = clean_fragment_write;
#endif
   }
#else
   struct panfrost_ptr T;
   unsigned nr_tables = PAN_BLIT_NUM_RESOURCE_TABLES;

   /* Although individual resources need only 16 byte alignment, the
    * resource table as a whole must be 64-byte aligned.
    */
   T = pan_pool_alloc_aligned(pool, nr_tables * pan_size(RESOURCE), 64);
   memset(T.cpu, 0, nr_tables * pan_size(RESOURCE));

   panfrost_make_resource_table(T, PAN_BLIT_TABLE_TEXTURE, textures,
                                tex_count);
   panfrost_make_resource_table(T, PAN_BLIT_TABLE_SAMPLER, samplers, 1);
   panfrost_make_resource_table(T, PAN_BLIT_TABLE_ATTRIBUTE, varyings, 1);
   panfrost_make_resource_table(T, PAN_BLIT_TABLE_ATTRIBUTE_BUFFER,
                                varying_buffers, 1);

   struct pan_blit_shader_key key = pan_blitter_get_key(&views);
   const struct pan_blit_shader_data *blit_shader =
      pan_blitter_get_blit_shader(cache, &key);

   bool z = fb->zs.preload.z;
   bool s = fb->zs.preload.s;
   bool ms = pan_blitter_is_ms(&views);

   struct panfrost_ptr spd = pan_pool_alloc_desc(pool, SHADER_PROGRAM);
   pan_pack(spd.cpu, SHADER_PROGRAM, cfg) {
      cfg.stage = MALI_SHADER_STAGE_FRAGMENT;
      cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
      cfg.register_allocation = MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD;
      cfg.binary = blit_shader->address;
      cfg.preload.r48_r63 = blit_shader->info.preload >> 48;
   }

   unsigned bd_count = views.rt_count;
   struct panfrost_ptr blend =
      pan_pool_alloc_desc_array(pool, bd_count, BLEND);

   if (!zs) {
      pan_blitter_emit_blends(blit_shader, &views, NULL, blend.cpu);
   }

   pan_pack(out, DRAW, cfg) {
      if (zs) {
         /* ZS_EMIT requires late update/kill */
         cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE;
         cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_LATE;
         cfg.blend_count = 0;
      } else {
         /* Skipping ATEST requires forcing Z/S */
         cfg.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
         cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;

         cfg.blend = blend.gpu;
         cfg.blend_count = bd_count;
         cfg.render_target_mask = 0x1;
      }

      cfg.allow_forward_pixel_to_kill = !zs;
      cfg.allow_forward_pixel_to_be_killed = true;
      cfg.depth_stencil = pan_blitter_emit_zs(pool, z, s);
      cfg.sample_mask = 0xFFFF;
      cfg.multisample_enable = ms;
      cfg.evaluate_per_sample = ms;
      cfg.maximum_z = 1.0;
      cfg.clean_fragment_write = clean_fragment_write;
      cfg.shader.resources = T.gpu | nr_tables;
      cfg.shader.shader = spd.gpu;
      cfg.shader.thread_storage = tsd;
   }
#endif
}

#if PAN_ARCH <= 7
static void *
pan_blit_emit_tiler_job(struct pan_pool *pool, struct pan_jc *jc,
                        mali_ptr tiler, struct panfrost_ptr *job)
{
   *job = pan_pool_alloc_desc(pool, TILER_JOB);

   pan_section_pack(job->cpu, TILER_JOB, PRIMITIVE, cfg) {
      cfg.draw_mode = MALI_DRAW_MODE_TRIANGLE_STRIP;
      cfg.index_count = 4;
      cfg.job_task_split = 6;
   }

   pan_section_pack(job->cpu, TILER_JOB, PRIMITIVE_SIZE, cfg) {
      cfg.constant = 1.0f;
   }

   void *invoc = pan_section_ptr(job->cpu, TILER_JOB, INVOCATION);
   panfrost_pack_work_groups_compute(invoc, 1, 4, 1, 1, 1, 1, true, false);

#if PAN_ARCH >= 6
   pan_section_pack(job->cpu, TILER_JOB, PADDING, cfg)
      ;
   pan_section_pack(job->cpu, TILER_JOB, TILER, cfg) {
      cfg.address = tiler;
   }
#endif

   pan_jc_add_job(jc, MALI_JOB_TYPE_TILER, false, false, 0, 0, job, false);
   return pan_section_ptr(job->cpu, TILER_JOB, DRAW);
}
#endif

#if PAN_ARCH >= 6
static void
pan_preload_fb_alloc_pre_post_dcds(struct pan_pool *desc_pool,
                                   struct pan_fb_info *fb)
{
   if (fb->bifrost.pre_post.dcds.gpu)
      return;

   fb->bifrost.pre_post.dcds = pan_pool_alloc_desc_array(desc_pool, 3, DRAW);
}

static void
pan_preload_emit_pre_frame_dcd(struct pan_blitter_cache *cache,
                               struct pan_pool *desc_pool,
                               struct pan_fb_info *fb, bool zs,
                               mali_ptr coords, mali_ptr tsd)
{
   unsigned dcd_idx = zs ? 1 : 0;
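
   /* Three DRAW descriptors are allocated for the pre/post frame-shader
    * DCDs; the preload path only fills slot 0 (colour) and slot 1 (Z/S). */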
   pan_preload_fb_alloc_pre_post_dcds(desc_pool, fb);
   assert(fb->bifrost.pre_post.dcds.cpu);
   void *dcd = fb->bifrost.pre_post.dcds.cpu + (dcd_idx * pan_size(DRAW));

   /* We only use crc_rt to determine whether to force writes for updating
    * the CRCs, so use a conservative tile size (16x16).
    */
   int crc_rt = GENX(pan_select_crc_rt)(fb, 16 * 16);

   bool always_write = false;

   /* If CRC data is currently invalid and this batch will make it valid,
    * write even clean tiles to make sure CRC data is updated.
    */
   if (crc_rt >= 0) {
      bool *valid = fb->rts[crc_rt].crc_valid;
      bool full = !fb->extent.minx && !fb->extent.miny &&
                  fb->extent.maxx == (fb->width - 1) &&
                  fb->extent.maxy == (fb->height - 1);

      if (full && !(*valid))
         always_write = true;
   }

   pan_preload_emit_dcd(cache, desc_pool, fb, zs, coords, tsd, dcd,
                        always_write);
   if (zs) {
      enum pipe_format fmt = fb->zs.view.zs
                                ? fb->zs.view.zs->planes[0]->layout.format
                                : fb->zs.view.s->planes[0]->layout.format;
      bool always = false;

      /* If we're dealing with a combined ZS resource and only one
       * component is cleared, we need to reload the whole surface
       * because the zs_clean_pixel_write_enable flag is set in that
       * case.
       */
      if (util_format_is_depth_and_stencil(fmt) &&
          fb->zs.clear.z != fb->zs.clear.s)
         always = true;

      /* We could use INTERSECT on Bifrost v7 too, but
       * EARLY_ZS_ALWAYS has the advantage of reloading the ZS tile
       * buffer one or more tiles ahead, making ZS data immediately
       * available for any ZS tests taking place in other shaders.
       * Things haven't been benchmarked to determine what's
       * preferable (saving bandwidth vs having ZS preloaded
       * earlier), so let's leave it like that for now.
       */
      fb->bifrost.pre_post.modes[dcd_idx] =
         PAN_ARCH > 6 ? MALI_PRE_POST_FRAME_SHADER_MODE_EARLY_ZS_ALWAYS
         : always     ? MALI_PRE_POST_FRAME_SHADER_MODE_ALWAYS
                      : MALI_PRE_POST_FRAME_SHADER_MODE_INTERSECT;
   } else {
      fb->bifrost.pre_post.modes[dcd_idx] =
         always_write ? MALI_PRE_POST_FRAME_SHADER_MODE_ALWAYS
                      : MALI_PRE_POST_FRAME_SHADER_MODE_INTERSECT;
   }
}
#else
static struct panfrost_ptr
pan_preload_emit_tiler_job(struct pan_blitter_cache *cache,
                           struct pan_pool *desc_pool, struct pan_jc *jc,
                           struct pan_fb_info *fb, bool zs, mali_ptr coords,
                           mali_ptr tsd)
{
   struct panfrost_ptr job = pan_pool_alloc_desc(desc_pool, TILER_JOB);

   pan_preload_emit_dcd(cache, desc_pool, fb, zs, coords, tsd,
                        pan_section_ptr(job.cpu, TILER_JOB, DRAW), false);

   pan_section_pack(job.cpu, TILER_JOB, PRIMITIVE, cfg) {
      cfg.draw_mode = MALI_DRAW_MODE_TRIANGLE_STRIP;
      cfg.index_count = 4;
      cfg.job_task_split = 6;
   }

   pan_section_pack(job.cpu, TILER_JOB, PRIMITIVE_SIZE, cfg) {
      cfg.constant = 1.0f;
   }

   void *invoc = pan_section_ptr(job.cpu, TILER_JOB, INVOCATION);
   panfrost_pack_work_groups_compute(invoc, 1, 4, 1, 1, 1, 1, true, false);

   pan_jc_add_job(jc, MALI_JOB_TYPE_TILER, false, false, 0, 0, &job, true);
   return job;
}
#endif

static struct panfrost_ptr
pan_preload_fb_part(struct pan_blitter_cache *cache, struct pan_pool *pool,
                    struct pan_jc *jc, struct pan_fb_info *fb, bool zs,
                    mali_ptr coords, mali_ptr tsd, mali_ptr tiler)
{
   struct panfrost_ptr job = {0};

#if PAN_ARCH >= 6
   pan_preload_emit_pre_frame_dcd(cache, pool, fb, zs, coords, tsd);
#else
   job = pan_preload_emit_tiler_job(cache, pool, jc, fb, zs, coords, tsd);
#endif
   return job;
}

unsigned
GENX(pan_preload_fb)(struct pan_blitter_cache *cache, struct pan_pool *pool,
                     struct pan_jc *jc, struct pan_fb_info *fb, mali_ptr tsd,
                     mali_ptr tiler, struct panfrost_ptr *jobs)
{
   bool preload_zs = pan_preload_needed(fb, true);
   bool preload_rts = pan_preload_needed(fb, false);
   mali_ptr coords;

   if (!preload_zs && !preload_rts)
      return 0;

   float rect[] = {
      0.0,       0.0,        0.0, 1.0,
      fb->width, 0.0,        0.0, 1.0,
      0.0,       fb->height, 0.0, 1.0,
      fb->width, fb->height, 0.0, 1.0,
   };

   coords = pan_pool_upload_aligned(pool, rect, sizeof(rect), 64);

   unsigned njobs = 0;
   if (preload_zs) {
      struct panfrost_ptr job =
         pan_preload_fb_part(cache, pool, jc, fb, true, coords, tsd, tiler);
      if (jobs && job.cpu)
         jobs[njobs++] = job;
   }

   if (preload_rts) {
      struct panfrost_ptr job =
         pan_preload_fb_part(cache, pool, jc, fb, false, coords, tsd, tiler);
      if (jobs && job.cpu)
         jobs[njobs++] = job;
   }

   return njobs;
}

#if PAN_ARCH <= 7
void
GENX(pan_blit_ctx_init)(struct pan_blitter_cache *cache,
                        const struct pan_blit_info *info,
                        struct pan_pool *blit_pool,
                        struct pan_blit_context *ctx)
{
   memset(ctx, 0, sizeof(*ctx));

   struct pan_image_view sviews[2] = {
      {
         .format = info->src.planes[0].format,
         .planes =
            {
               info->src.planes[0].image,
               info->src.planes[1].image,
               info->src.planes[2].image,
            },
         .dim = info->src.planes[0].image->layout.dim ==
                      MALI_TEXTURE_DIMENSION_CUBE
                   ? MALI_TEXTURE_DIMENSION_2D
                   : info->src.planes[0].image->layout.dim,
         .first_level = info->src.level,
         .last_level = info->src.level,
         .first_layer = info->src.start.layer,
         .last_layer = info->src.end.layer,
         .swizzle =
            {
               PIPE_SWIZZLE_X,
               PIPE_SWIZZLE_Y,
               PIPE_SWIZZLE_Z,
               PIPE_SWIZZLE_W,
            },
      },
   };

   struct pan_image_view dview = {
      .format = info->dst.planes[0].format,
      .planes =
         {
            info->dst.planes[0].image,
            info->dst.planes[1].image,
            info->dst.planes[2].image,
         },
      .dim = info->dst.planes[0].image->layout.dim ==
                   MALI_TEXTURE_DIMENSION_1D
                ? MALI_TEXTURE_DIMENSION_1D
                : MALI_TEXTURE_DIMENSION_2D,
      .first_level = info->dst.level,
      .last_level = info->dst.level,
      .first_layer = info->dst.start.layer,
      .last_layer = info->dst.start.layer,
      .swizzle =
         {
            PIPE_SWIZZLE_X,
            PIPE_SWIZZLE_Y,
            PIPE_SWIZZLE_Z,
            PIPE_SWIZZLE_W,
         },
   };
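
   /* Note on the dimension overrides above (informal rationale): the blit
    * draws one destination layer at a time, so cube-map sources are sampled
    * as 2D (faces addressed through first_layer/last_layer) and the
    * destination view is always 2D unless the image itself is 1D; 3D
    * destinations are walked slice by slice via cur_layer/last_layer below. */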
   ctx->src.start.x = info->src.start.x;
   ctx->src.start.y = info->src.start.y;
   ctx->src.end.x = info->src.end.x;
   ctx->src.end.y = info->src.end.y;
   ctx->src.dim = sviews[0].dim;

   if (info->dst.planes[0].image->layout.dim == MALI_TEXTURE_DIMENSION_3D) {
      unsigned max_z =
         u_minify(info->dst.planes[0].image->layout.depth, info->dst.level) -
         1;

      ctx->z_scale = (float)(info->src.end.z - info->src.start.z) /
                     (info->dst.end.z - info->dst.start.z);
      assert(info->dst.start.z != info->dst.end.z);

      if (info->dst.start.z > info->dst.end.z) {
         ctx->dst.cur_layer = info->dst.start.z - 1;
         ctx->dst.last_layer = info->dst.end.z;
      } else {
         ctx->dst.cur_layer = info->dst.start.z;
         ctx->dst.last_layer = info->dst.end.z - 1;
      }

      ctx->dst.cur_layer = MIN2(MAX2(ctx->dst.cur_layer, 0), max_z);
      ctx->dst.last_layer = MIN2(MAX2(ctx->dst.last_layer, 0), max_z);
      ctx->dst.layer_offset = ctx->dst.cur_layer;
   } else {
      unsigned max_layer = info->dst.planes[0].image->layout.array_size - 1;
      ctx->dst.layer_offset = info->dst.start.layer;
      ctx->dst.cur_layer = info->dst.start.layer;
      ctx->dst.last_layer = MIN2(info->dst.end.layer, max_layer);
      ctx->z_scale = 1;
   }

   if (sviews[0].dim == MALI_TEXTURE_DIMENSION_3D) {
      if (info->src.start.z < info->src.end.z)
         ctx->src.z_offset = info->src.start.z + fabs(ctx->z_scale * 0.5f);
      else
         ctx->src.z_offset = info->src.start.z - fabs(ctx->z_scale * 0.5f);
   } else {
      ctx->src.layer_offset = info->src.start.layer;
   }

   /* Split depth and stencil */
   if (util_format_is_depth_and_stencil(sviews[0].format)) {
      sviews[1] = sviews[0];
      sviews[0].format = util_format_get_depth_only(sviews[0].format);
      sviews[1].format = util_format_stencil_only(sviews[1].format);
   } else if (info->src.planes[1].format) {
      sviews[1] = sviews[0];
      sviews[1].format = info->src.planes[1].format;
      sviews[1].planes[0] = info->src.planes[1].image;
   }

   ctx->rsd = pan_blit_get_rsd(cache, sviews, &dview);

   ASSERTED unsigned nlayers = info->src.end.layer - info->src.start.layer + 1;

   assert(nlayers == (info->dst.end.layer - info->dst.start.layer + 1));

   unsigned dst_w =
      u_minify(info->dst.planes[0].image->layout.width, info->dst.level);
   unsigned dst_h =
      u_minify(info->dst.planes[0].image->layout.height, info->dst.level);
   unsigned maxx = MIN2(MAX2(info->dst.start.x, info->dst.end.x), dst_w - 1);
   unsigned maxy = MIN2(MAX2(info->dst.start.y, info->dst.end.y), dst_h - 1);
   unsigned minx = MAX2(MIN3(info->dst.start.x, info->dst.end.x, maxx), 0);
   unsigned miny = MAX2(MIN3(info->dst.start.y, info->dst.end.y, maxy), 0);

   if (info->scissor.enable) {
      minx = MAX2(minx, info->scissor.minx);
      miny = MAX2(miny, info->scissor.miny);
      maxx = MIN2(maxx, info->scissor.maxx);
      maxy = MIN2(maxy, info->scissor.maxy);
   }

   const struct pan_image_view *sview_ptrs[] = {&sviews[0], &sviews[1]};
   unsigned nviews = sviews[1].format ? 2 : 1;
   ctx->textures = pan_blitter_emit_textures(blit_pool, nviews, sview_ptrs);
   ctx->samplers = pan_blitter_emit_sampler(blit_pool, info->nearest);
   ctx->vpd = pan_blitter_emit_viewport(blit_pool, minx, miny, maxx, maxy);

   float dst_rect[] = {
      info->dst.start.x, info->dst.start.y, 0.0, 1.0,
      info->dst.end.x,   info->dst.start.y, 0.0, 1.0,
      info->dst.start.x, info->dst.end.y,   0.0, 1.0,
      info->dst.end.x,   info->dst.end.y,   0.0, 1.0,
   };

   ctx->position =
      pan_pool_upload_aligned(blit_pool, dst_rect, sizeof(dst_rect), 64);
}

struct panfrost_ptr
GENX(pan_blit)(struct pan_blit_context *ctx, struct pan_pool *pool,
               struct pan_jc *jc, mali_ptr tsd, mali_ptr tiler)
{
   if (ctx->dst.cur_layer < 0 ||
       (ctx->dst.last_layer >= ctx->dst.layer_offset &&
        ctx->dst.cur_layer > ctx->dst.last_layer) ||
       (ctx->dst.last_layer < ctx->dst.layer_offset &&
        ctx->dst.cur_layer < ctx->dst.last_layer))
      return (struct panfrost_ptr){0};

   int32_t layer = ctx->dst.cur_layer - ctx->dst.layer_offset;
   float src_z;
   if (ctx->src.dim == MALI_TEXTURE_DIMENSION_3D)
      src_z = (ctx->z_scale * layer) + ctx->src.z_offset;
   else
      src_z = ctx->src.layer_offset + layer;

   float src_rect[] = {
      ctx->src.start.x, ctx->src.start.y, src_z, 1.0,
      ctx->src.end.x,   ctx->src.start.y, src_z, 1.0,
      ctx->src.start.x, ctx->src.end.y,   src_z, 1.0,
      ctx->src.end.x,   ctx->src.end.y,   src_z, 1.0,
   };

   mali_ptr src_coords =
      pan_pool_upload_aligned(pool, src_rect, sizeof(src_rect), 64);

   struct panfrost_ptr job = {0};
   void *dcd = pan_blit_emit_tiler_job(pool, jc, tiler, &job);

   pan_pack(dcd, DRAW, cfg) {
      cfg.thread_storage = tsd;
      cfg.state = ctx->rsd;

      cfg.position = ctx->position;
      cfg.varyings = pan_blitter_emit_varying(pool);
      cfg.varying_buffers = pan_blitter_emit_varying_buffer(pool, src_coords);
      cfg.viewport = ctx->vpd;
      cfg.textures = ctx->textures;
      cfg.samplers = ctx->samplers;
   }

   return job;
}
#endif

DERIVE_HASH_TABLE(pan_blit_shader_key);
DERIVE_HASH_TABLE(pan_blit_blend_shader_key);
DERIVE_HASH_TABLE(pan_blit_rsd_key);

static void
pan_blitter_prefill_blit_shader_cache(struct pan_blitter_cache *cache)
{
   static const struct pan_blit_shader_key prefill[] = {
      {
         .surfaces[0] =
            {
               .loc = FRAG_RESULT_DEPTH,
               .type = nir_type_float32,
               .dim = MALI_TEXTURE_DIMENSION_2D,
               .src_samples = 1,
               .dst_samples = 1,
            },
      },
      {
         .surfaces[1] =
            {
               .loc = FRAG_RESULT_STENCIL,
               .type = nir_type_uint32,
               .dim = MALI_TEXTURE_DIMENSION_2D,
               .src_samples = 1,
               .dst_samples = 1,
            },
      },
      {
         .surfaces[0] =
            {
               .loc = FRAG_RESULT_DATA0,
               .type = nir_type_float32,
               .dim = MALI_TEXTURE_DIMENSION_2D,
               .src_samples = 1,
               .dst_samples = 1,
            },
      },
   };

   for (unsigned i = 0; i < ARRAY_SIZE(prefill); i++)
      pan_blitter_get_blit_shader(cache, &prefill[i]);
}

void
GENX(pan_blitter_cache_init)(struct pan_blitter_cache *cache, unsigned gpu_id,
                             struct pan_blend_shader_cache *blend_shader_cache,
                             struct pan_pool *bin_pool,
                             struct pan_pool *desc_pool)
{
   cache->gpu_id = gpu_id;
   cache->shaders.blit = pan_blit_shader_key_table_create(NULL);
   cache->shaders.blend = pan_blit_blend_shader_key_table_create(NULL);
   cache->shaders.pool = bin_pool;
   pthread_mutex_init(&cache->shaders.lock, NULL);
   pan_blitter_prefill_blit_shader_cache(cache);

   cache->rsds.pool = desc_pool;
   cache->rsds.rsds = pan_blit_rsd_key_table_create(NULL);
   pthread_mutex_init(&cache->rsds.lock, NULL);
   cache->blend_shader_cache = blend_shader_cache;
}

void
GENX(pan_blitter_cache_cleanup)(struct pan_blitter_cache *cache)
{
   _mesa_hash_table_destroy(cache->shaders.blit, NULL);
   _mesa_hash_table_destroy(cache->shaders.blend, NULL);
   pthread_mutex_destroy(&cache->shaders.lock);
_mesa_hash_table_destroy(cache->rsds.rsds, NULL); pthread_mutex_destroy(&cache->rsds.lock); }