radv: implement dynamic vertex input state using vertex shader prologs

This doesn't actually use the functionality or implement prolog
compilation yet.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11717>
This commit is contained in:
Rhys Perry 2021-04-16 11:55:59 +01:00 committed by Marge Bot
parent 2b8d88ed91
commit 80841196b2
14 changed files with 646 additions and 38 deletions

View File

@ -670,6 +670,8 @@ RADV driver environment variables
disable VRS for flat shading (only on GFX10.3+)
``preoptir``
dump LLVM IR before any optimizations
``prologs``
dump vertex shader prologs
``shaders``
dump shaders
``shaderstats``

View File

@ -263,3 +263,10 @@ aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders,
*binary = (radv_shader_binary*)legacy_binary;
}
/* Compile a vertex shader prolog for the given key.
 *
 * Not implemented yet: this entry point only exists so the driver side of
 * dynamic vertex input state can be wired up first; the actual ACO prolog
 * compilation lands in a follow-up change.
 */
void
aco_compile_vs_prolog(const struct radv_vs_prolog_key* key, struct radv_prolog_binary** binary,
                      const struct radv_shader_args* args)
{
   unreachable("TODO");
}

View File

@ -44,6 +44,9 @@ extern const struct aco_compiler_statistic_info* aco_statistic_infos;
void aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders,
struct radv_shader_binary** binary, const struct radv_shader_args* args);
void aco_compile_vs_prolog(const struct radv_vs_prolog_key* key, struct radv_prolog_binary** binary,
const struct radv_shader_args* args);
#ifdef __cplusplus
}
#endif

View File

@ -38,6 +38,7 @@
struct radv_shader_args;
struct radv_shader_info;
struct radv_vs_prolog_key;
namespace aco {

View File

@ -37,6 +37,8 @@
#include "ac_debug.h"
#include "util/fast_idiv_by_const.h"
enum {
RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0),
RADV_PREFETCH_VS = (1 << 1),
@ -2647,8 +2649,300 @@ radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
cmd_buffer->state.context_roll_without_scissor_emitted = true;
}
/* First dword of a serialized VS prolog cache key.  Keys are variable-sized:
 * each flag below indicates that one extra key word follows the header (the
 * misaligned_mask flag additionally appends packed per-attribute formats).
 * key_size is the total key size in bytes, header included, and is what the
 * hash/compare callbacks use.
 */
union vs_prolog_key_header {
   struct {
      uint32_t key_size : 8;       /* total serialized key size in bytes */
      uint32_t num_attributes : 6;
      uint32_t as_ls : 1;          /* instance ID VGPR is placed differently for LS */
      uint32_t is_ngg : 1;
      uint32_t wave32 : 1;
      uint32_t next_stage : 3;     /* gl_shader_stage the VS is merged into */
      /* Flags below each mark an optional trailing key word. */
      uint32_t instance_rate_inputs : 1;
      uint32_t alpha_adjust_lo : 1;
      uint32_t alpha_adjust_hi : 1;
      uint32_t misaligned_mask : 1;
      uint32_t post_shuffle : 1;
      uint32_t nontrivial_divisors : 1;
      /* We need this to ensure the padding is zero. It's useful even if it's unused. */
      uint32_t padding0 : 6;
   };
   uint32_t v;
};
uint32_t
radv_hash_vs_prolog(const void *key_)
{
const uint32_t *key = key_;
union vs_prolog_key_header header;
header.v = key[0];
return _mesa_hash_data(key, header.key_size);
}
bool
radv_cmp_vs_prolog(const void *a_, const void *b_)
{
const uint32_t *a = a_;
const uint32_t *b = b_;
if (a[0] != b[0])
return false;
union vs_prolog_key_header header;
header.v = a[0];
return memcmp(a, b, header.key_size) == 0;
}
/* Find (or create and cache) the VS prolog matching the current dynamic
 * vertex input state.
 *
 * Serializes a compact variable-sized key (see union vs_prolog_key_header),
 * checks the last-emitted prolog first, then the per-device cache, and
 * compiles a new prolog on a miss.  Also reports which attributes use
 * non-trivial instance divisors through *nontrivial_divisors.
 *
 * Returns NULL on allocation/compilation failure.
 *
 * Fix vs. original: on the insertion path, if the prolog was created but the
 * key copy failed to allocate, the prolog was leaked.  It is now destroyed.
 */
static struct radv_shader_prolog *
lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader,
                 uint32_t *nontrivial_divisors)
{
   STATIC_ASSERT(sizeof(union vs_prolog_key_header) == 4);
   assert(vs_shader->info.vs.dynamic_inputs);

   struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
   struct radv_device *device = cmd_buffer->device;
   enum chip_class chip = device->physical_device->rad_info.chip_class;
   unsigned num_attributes = util_last_bit(vs_shader->info.vs.vb_desc_usage_mask);
   uint32_t attribute_mask = BITFIELD_MASK(num_attributes);

   uint32_t instance_rate_inputs = state->instance_rate_inputs & attribute_mask;
   *nontrivial_divisors = state->nontrivial_divisors & attribute_mask;

   /* Alignment is only checked on GFX6 and GFX10+; an attribute whose binding
    * offset or stride breaks its format's alignment requirement needs special
    * fetch code in the prolog.  NOTE(review): assumes GFX7-9 tolerate
    * misaligned typed fetches -- confirm against hw docs.
    */
   uint32_t misaligned_mask = 0;
   if (chip == GFX6 || chip >= GFX10) {
      u_foreach_bit(index, state->attribute_mask & attribute_mask)
      {
         uint8_t req = state->format_align_req_minus_1[index];
         struct radv_vertex_binding *vb = &cmd_buffer->vertex_bindings[state->bindings[index]];
         VkDeviceSize offset = vb->offset + state->offsets[index];
         if (vb->buffer && ((offset & req) || (vb->stride & req)))
            misaligned_mask |= 1u << index;
      }
   }

   /* Full key used when compiling a new prolog. */
   struct radv_vs_prolog_key key;
   key.state = state;
   key.num_attributes = num_attributes;
   key.misaligned_mask = misaligned_mask;
   /* The instance ID input VGPR is placed differently when as_ls=true. */
   key.as_ls = vs_shader->info.vs.as_ls && instance_rate_inputs;
   key.is_ngg = vs_shader->info.is_ngg;
   key.wave32 = vs_shader->info.wave_size == 32;

   key.next_stage = MESA_SHADER_VERTEX;
   if (pipeline->shaders[MESA_SHADER_TESS_CTRL] == vs_shader)
      key.next_stage = MESA_SHADER_TESS_CTRL;
   else if (pipeline->shaders[MESA_SHADER_GEOMETRY] == vs_shader)
      key.next_stage = MESA_SHADER_GEOMETRY;

   /* Serialize the compact cache key: the header word, then one word per flag
    * set in the header (plus packed formats for misaligned attributes).  The
    * worst case fits in 16 words.
    */
   uint32_t key_words[16];
   unsigned key_size = 1;

   union vs_prolog_key_header header;
   header.v = 0;
   header.num_attributes = num_attributes;
   header.as_ls = key.as_ls;
   header.is_ngg = key.is_ngg;
   header.wave32 = key.wave32;
   header.next_stage = key.next_stage;

   if (instance_rate_inputs & ~*nontrivial_divisors) {
      header.instance_rate_inputs = true;
      key_words[key_size++] = instance_rate_inputs;
   }
   if (*nontrivial_divisors) {
      header.nontrivial_divisors = true;
      key_words[key_size++] = *nontrivial_divisors;
   }
   if (misaligned_mask) {
      header.misaligned_mask = true;
      key_words[key_size++] = misaligned_mask;

      /* One format byte per misaligned attribute, zero-padded to dwords. */
      uint8_t *formats = (uint8_t *)&key_words[key_size];
      unsigned num_formats = 0;
      u_foreach_bit(index, misaligned_mask) formats[num_formats++] = state->formats[index];
      while (num_formats & 0x3)
         formats[num_formats++] = 0;
      key_size += num_formats / 4u;

      if (state->post_shuffle & attribute_mask) {
         header.post_shuffle = true;
         key_words[key_size++] = state->post_shuffle & attribute_mask;
      }
   }
   if (state->alpha_adjust_lo & attribute_mask) {
      header.alpha_adjust_lo = true;
      key_words[key_size++] = state->alpha_adjust_lo & attribute_mask;
   }
   if (state->alpha_adjust_hi & attribute_mask) {
      header.alpha_adjust_hi = true;
      key_words[key_size++] = state->alpha_adjust_hi & attribute_mask;
   }

   header.key_size = key_size * sizeof(key_words[0]);
   key_words[0] = header.v;

   uint32_t hash = radv_hash_vs_prolog(key_words);

   /* Fast path: same prolog as the one emitted for the previous draw. */
   if (cmd_buffer->state.emitted_vs_prolog &&
       cmd_buffer->state.emitted_vs_prolog_key_hash == hash &&
       radv_cmp_vs_prolog(key_words, cmd_buffer->state.emitted_vs_prolog_key))
      return cmd_buffer->state.emitted_vs_prolog;

   /* Device-wide cache lookup under the read lock. */
   u_rwlock_rdlock(&device->vs_prologs_lock);
   struct hash_entry *prolog_entry =
      _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
   u_rwlock_rdunlock(&device->vs_prologs_lock);

   if (!prolog_entry) {
      /* Miss: retake the lock for writing and re-check, since another thread
       * may have inserted the same prolog meanwhile.
       */
      u_rwlock_wrlock(&device->vs_prologs_lock);
      prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
      if (prolog_entry) {
         u_rwlock_wrunlock(&device->vs_prologs_lock);
         return prolog_entry->data;
      }

      /* Compile the prolog and insert it with a heap copy of the key. */
      struct radv_shader_prolog *prolog = radv_create_vs_prolog(device, &key);
      uint32_t *key2 = malloc(key_size * 4);
      if (!prolog || !key2) {
         /* Don't leak a successfully-created prolog if the key allocation
          * failed (radv_prolog_destroy accepts NULL).
          */
         radv_prolog_destroy(device, prolog);
         free(key2);
         u_rwlock_wrunlock(&device->vs_prologs_lock);
         return NULL;
      }
      memcpy(key2, key_words, key_size * 4);
      _mesa_hash_table_insert_pre_hashed(device->vs_prologs, hash, key2, prolog);

      u_rwlock_wrunlock(&device->vs_prologs_lock);
      return prolog;
   }
   return prolog_entry->data;
}
static void
radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer)
/* Program the hardware shader-program registers so the current hw VS stage
 * starts executing at the prolog instead of the main shader.  Skipped when
 * the same prolog is already bound and the pipeline didn't change.
 */
emit_prolog_regs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader,
                 struct radv_shader_prolog *prolog, bool pipeline_is_dirty)
{
   /* no need to re-emit anything in this case */
   if (cmd_buffer->state.emitted_vs_prolog == prolog && !pipeline_is_dirty)
      return;

   enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
   uint64_t prolog_va = radv_buffer_get_va(prolog->bo) + prolog->alloc->offset;

   assert(cmd_buffer->state.emitted_pipeline == cmd_buffer->state.pipeline);
   assert(vs_shader->info.num_input_sgprs <= prolog->num_preserved_sgprs);

   /* Pre-GFX10: if the prolog needs more SGPRs than the main shader, patch
    * the SGPRS field of RSRC1.  On GFX10+ the field is not patched; the
    * assert at the bottom checks rsrc1 was left unchanged.
    */
   uint32_t rsrc1 = vs_shader->config.rsrc1;
   if (chip < GFX10 && G_00B228_SGPRS(prolog->rsrc1) > G_00B228_SGPRS(vs_shader->config.rsrc1))
      rsrc1 = (rsrc1 & C_00B228_SGPRS) | (prolog->rsrc1 & ~C_00B228_SGPRS);

   /* The main shader must not use less VGPRs than the prolog, otherwise shared vgprs might not
    * work.
    */
   assert(G_00B848_VGPRS(vs_shader->config.rsrc1) >= G_00B848_VGPRS(prolog->rsrc1));

   /* Pick the PGM_LO/RSRC1 register pair for the hw stage the VS runs as:
    * GS/NGG, merged HS, legacy LS, legacy ES, or plain VS.
    */
   unsigned pgm_lo_reg = R_00B120_SPI_SHADER_PGM_LO_VS;
   unsigned rsrc1_reg = R_00B128_SPI_SHADER_PGM_RSRC1_VS;
   if (vs_shader->info.is_ngg || pipeline->shaders[MESA_SHADER_GEOMETRY] == vs_shader) {
      pgm_lo_reg = chip >= GFX10 ? R_00B320_SPI_SHADER_PGM_LO_ES : R_00B210_SPI_SHADER_PGM_LO_ES;
      rsrc1_reg = R_00B228_SPI_SHADER_PGM_RSRC1_GS;
   } else if (pipeline->shaders[MESA_SHADER_TESS_CTRL] == vs_shader) {
      pgm_lo_reg = chip >= GFX10 ? R_00B520_SPI_SHADER_PGM_LO_LS : R_00B410_SPI_SHADER_PGM_LO_LS;
      rsrc1_reg = R_00B428_SPI_SHADER_PGM_RSRC1_HS;
   } else if (vs_shader->info.vs.as_ls) {
      pgm_lo_reg = R_00B520_SPI_SHADER_PGM_LO_LS;
      rsrc1_reg = R_00B528_SPI_SHADER_PGM_RSRC1_LS;
   } else if (vs_shader->info.vs.as_es) {
      pgm_lo_reg = R_00B320_SPI_SHADER_PGM_LO_ES;
      rsrc1_reg = R_00B328_SPI_SHADER_PGM_RSRC1_ES;
   }

   /* Point the program counter registers at the prolog. */
   radeon_set_sh_reg_seq(cmd_buffer->cs, pgm_lo_reg, 2);
   radeon_emit(cmd_buffer->cs, prolog_va >> 8);
   radeon_emit(cmd_buffer->cs, S_00B124_MEM_BASE(prolog_va >> 40));

   if (chip < GFX10)
      radeon_set_sh_reg(cmd_buffer->cs, rsrc1_reg, rsrc1);
   else
      assert(rsrc1 == vs_shader->config.rsrc1);

   /* Keep the prolog's BO resident while this command buffer executes. */
   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, prolog->bo);
}
/* Upload the prolog's input data (instance-divisor constants) and point the
 * AC_UD_VS_PROLOG_INPUTS user SGPR pair at it.  When there are no
 * non-trivial divisors, the SGPRs are pointed directly at the main shader's
 * VA instead.
 */
static void
emit_prolog_inputs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader,
                   uint32_t nontrivial_divisors, bool pipeline_is_dirty)
{
   /* no need to re-emit anything in this case */
   if (!nontrivial_divisors && !pipeline_is_dirty)
      return;

   struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
   uint64_t input_va = radv_shader_variant_get_va(vs_shader);

   if (nontrivial_divisors) {
      unsigned inputs_offset;
      uint32_t *inputs;
      /* 8 bytes for the shader VA plus 8 bytes of division constants per
       * attribute with a non-trivial divisor.
       */
      unsigned size = 8 + util_bitcount(nontrivial_divisors) * 8;
      if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &inputs_offset, (void **)&inputs))
         return;

      /* First two dwords: the main shader's VA. */
      *(inputs++) = input_va;
      *(inputs++) = input_va >> 32;

      /* Two dwords per divisor: dword 0 packs pre_shift | increment << 8 |
       * post_shift << 16, dword 1 is the multiplier (fast-udiv encoding).
       */
      u_foreach_bit(index, nontrivial_divisors)
      {
         uint32_t div = state->divisors[index];
         if (div == 0) {
            /* Divisor 0: no shifts, multiplier 1.  NOTE(review): presumably
             * the prolog special-cases this to freeze the instance index --
             * confirm against the prolog codegen.
             */
            *(inputs++) = 0;
            *(inputs++) = 1;
         } else if (util_is_power_of_two_or_zero(div)) {
            /* Power of two: pre-shift by log2(div), increment=1, all-ones
             * multiplier.
             */
            *(inputs++) = util_logbase2(div) | (1 << 8);
            *(inputs++) = 0xffffffffu;
         } else {
            /* General case: constants from the fast unsigned-division
             * algorithm.
             */
            struct util_fast_udiv_info info = util_compute_fast_udiv_info(div, 32, 32);
            *(inputs++) = info.pre_shift | (info.increment << 8) | (info.post_shift << 16);
            *(inputs++) = info.multiplier;
         }
      }

      input_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + inputs_offset;
   }

   /* Write the 64-bit pointer into the prolog-inputs user SGPRs. */
   struct radv_userdata_info *loc =
      &vs_shader->info.user_sgprs_locs.shader_data[AC_UD_VS_PROLOG_INPUTS];
   uint32_t base_reg = cmd_buffer->state.pipeline->user_data_0[MESA_SHADER_VERTEX];
   assert(loc->sgpr_idx != -1);
   assert(loc->num_sgprs == 2);
   radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
                            input_va, true);
}
static void
radv_emit_vertex_state(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
{
struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
struct radv_shader_variant *vs_shader = radv_get_shader(pipeline, MESA_SHADER_VERTEX);
if (!vs_shader->info.vs.has_prolog)
return;
uint32_t nontrivial_divisors;
struct radv_shader_prolog *prolog =
lookup_vs_prolog(cmd_buffer, vs_shader, &nontrivial_divisors);
if (!prolog) {
cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
return;
}
emit_prolog_regs(cmd_buffer, vs_shader, prolog, pipeline_is_dirty);
emit_prolog_inputs(cmd_buffer, vs_shader, nontrivial_divisors, pipeline_is_dirty);
cmd_buffer->state.emitted_vs_prolog = prolog;
}
static void
radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
{
uint64_t states =
cmd_buffer->state.dirty & cmd_buffer->state.emitted_pipeline->graphics.needed_dynamic_state;
@ -2717,6 +3011,9 @@ radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer)
if (states & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE)
radv_emit_color_write_enable(cmd_buffer);
if (states & RADV_CMD_DIRTY_VERTEX_STATE)
radv_emit_vertex_state(cmd_buffer, pipeline_is_dirty);
cmd_buffer->state.dirty &= ~states;
}
@ -2923,33 +3220,105 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stag
cmd_buffer->push_constant_stages |= dirty_stages;
}
/* Pre-combined DST_SEL_X/Y/Z/W swizzles for buffer descriptors.  Components
 * absent from a format read 0 and the alpha component reads 1; ZYXW swaps the
 * X and Z channels (used for post-shuffle attributes).
 */
enum radv_dst_sel {
   DST_SEL_0001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) |
                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
   DST_SEL_X001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) |
                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
   DST_SEL_XY01 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
   DST_SEL_XYZ1 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
   DST_SEL_XYZW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W),
   DST_SEL_ZYXW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W),
};
/* Destination swizzle to use for each buffer data format, so that components
 * the format doesn't provide read back as (0, 0, 0, 1).
 */
static const uint32_t data_format_dst_sel[] = {
   [V_008F0C_BUF_DATA_FORMAT_INVALID] = DST_SEL_0001,
   [V_008F0C_BUF_DATA_FORMAT_8] = DST_SEL_X001,
   [V_008F0C_BUF_DATA_FORMAT_16] = DST_SEL_X001,
   [V_008F0C_BUF_DATA_FORMAT_8_8] = DST_SEL_XY01,
   [V_008F0C_BUF_DATA_FORMAT_32] = DST_SEL_X001,
   [V_008F0C_BUF_DATA_FORMAT_16_16] = DST_SEL_XY01,
   [V_008F0C_BUF_DATA_FORMAT_10_11_11] = DST_SEL_XYZ1,
   [V_008F0C_BUF_DATA_FORMAT_11_11_10] = DST_SEL_XYZ1,
   [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = DST_SEL_XYZW,
   [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = DST_SEL_XYZW,
   [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = DST_SEL_XYZW,
   [V_008F0C_BUF_DATA_FORMAT_32_32] = DST_SEL_XY01,
   [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = DST_SEL_XYZW,
   [V_008F0C_BUF_DATA_FORMAT_32_32_32] = DST_SEL_XYZ1,
   [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = DST_SEL_XYZW,
};
static void
radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
{
if ((pipeline_is_dirty || (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
cmd_buffer->state.pipeline->vb_desc_usage_mask) {
struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
struct radv_shader_variant *vs_shader = radv_get_shader(pipeline, MESA_SHADER_VERTEX);
enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
unsigned vb_offset;
void *vb_ptr;
unsigned desc_index = 0;
uint32_t mask = pipeline->vb_desc_usage_mask;
uint64_t va;
struct radv_vs_input_state *vs_state =
vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL;
/* allocate some descriptor state for vertex buffers */
if (!radv_cmd_buffer_upload_alloc(cmd_buffer, pipeline->vb_desc_alloc_size, &vb_offset, &vb_ptr))
return;
assert(!vs_state || pipeline->use_per_attribute_vb_descs);
while (mask) {
unsigned i = u_bit_scan(&mask);
uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4];
uint32_t offset;
unsigned binding = pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i;
uint32_t offset, rsrc_word3;
unsigned binding =
vs_state ? cmd_buffer->state.dynamic_vs_input.bindings[i]
: (pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i);
struct radv_buffer *buffer = cmd_buffer->vertex_bindings[binding].buffer;
unsigned num_records;
unsigned stride;
if (vs_state) {
unsigned format = vs_state->formats[i];
unsigned dfmt = format & 0xf;
unsigned nfmt = (format >> 4) & 0x7;
rsrc_word3 =
vs_state->post_shuffle & (1u << i) ? DST_SEL_ZYXW : data_format_dst_sel[dfmt];
if (chip >= GFX10)
rsrc_word3 |= S_008F0C_FORMAT(ac_get_tbuffer_format(chip, dfmt, nfmt));
else
rsrc_word3 |= S_008F0C_NUM_FORMAT(nfmt) | S_008F0C_DATA_FORMAT(dfmt);
} else {
if (chip >= GFX10)
rsrc_word3 = DST_SEL_XYZW | S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT);
else
rsrc_word3 = DST_SEL_XYZW | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
}
if (!buffer) {
memset(desc, 0, 4 * 4);
if (vs_state) {
/* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need
* to include the format/word3 so that the alpha channel is 1 for formats without an
* alpha channel.
*/
desc[0] = 0;
desc[1] = S_008F04_STRIDE(16);
desc[2] = 0;
desc[3] = rsrc_word3;
} else {
memset(desc, 0, 4 * 4);
}
continue;
}
@ -2957,6 +3326,8 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_
offset = cmd_buffer->vertex_bindings[binding].offset;
va += offset + buffer->offset;
if (vs_state)
va += vs_state->offsets[i];
if (cmd_buffer->vertex_bindings[binding].size) {
num_records = cmd_buffer->vertex_bindings[binding].size;
@ -2970,9 +3341,9 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_
stride = pipeline->binding_stride[binding];
}
enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
if (pipeline->use_per_attribute_vb_descs) {
uint32_t attrib_end = pipeline->attrib_ends[i];
uint32_t attrib_end = vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i]
: pipeline->attrib_ends[i];
if (num_records < attrib_end) {
num_records = 0; /* not enough space for one vertex */
@ -2997,7 +3368,14 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_
* num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
* GFX10.3 but it doesn't hurt.
*/
memset(desc, 0, 16);
if (vs_state) {
desc[0] = 0;
desc[1] = S_008F04_STRIDE(16);
desc[2] = 0;
desc[3] = rsrc_word3;
} else {
memset(desc, 0, 16);
}
continue;
}
} else {
@ -3005,22 +3383,13 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_
num_records = DIV_ROUND_UP(num_records, stride);
}
uint32_t rsrc_word3 =
S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
if (chip >= GFX10) {
/* OOB_SELECT chooses the out-of-bounds check:
* - 1: index >= NUM_RECORDS (Structured)
* - 3: offset >= NUM_RECORDS (Raw)
*/
int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT) |
S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(1);
} else {
rsrc_word3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(1);
}
desc[0] = va;
@ -4009,7 +4378,7 @@ radv_CmdBindVertexBuffers2EXT(VkCommandBuffer commandBuffer, uint32_t firstBindi
return;
}
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_STATE;
}
static uint32_t
@ -4397,7 +4766,7 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline
if (!pipeline)
break;
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE;
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
cmd_buffer->push_constant_stages |= pipeline->active_stages;
/* the new vertex shader might not have the same user regs */
@ -5712,7 +6081,7 @@ radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
/* Index, vertex and streamout buffers don't change context regs, and
* pipeline is already handled.
*/
used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER |
used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_STATE |
RADV_CMD_DIRTY_STREAMOUT_BUFFER | RADV_CMD_DIRTY_PIPELINE);
if (cmd_buffer->state.dirty & used_states)
@ -5918,7 +6287,8 @@ radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer, const struct rad
}
static void
radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
bool pipeline_is_dirty)
{
bool late_scissor_emission;
@ -5955,7 +6325,7 @@ radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct r
}
}
radv_cmd_buffer_flush_dynamic_state(cmd_buffer);
radv_cmd_buffer_flush_dynamic_state(cmd_buffer, pipeline_is_dirty);
radv_emit_draw_registers(cmd_buffer, info);
@ -6004,7 +6374,7 @@ radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info
* the CUs are idle is very short. (there are only SET_SH
* packets between the wait and the draw)
*/
radv_emit_all_graphics_states(cmd_buffer, info);
radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty);
si_emit_cache_flush(cmd_buffer);
/* <-- CUs are idle here --> */
@ -6024,7 +6394,7 @@ radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info
radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
radv_emit_all_graphics_states(cmd_buffer, info);
radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty);
}
radv_describe_draw(cmd_buffer);

View File

@ -63,6 +63,7 @@ enum {
RADV_DEBUG_NO_VRS_FLAT_SHADING = 1ull << 32,
RADV_DEBUG_NO_ATOC_DITHERING = 1ull << 33,
RADV_DEBUG_NO_NGGC = 1ull << 34,
RADV_DEBUG_DUMP_PROLOGS = 1ull << 35,
};
enum {

View File

@ -853,6 +853,7 @@ static const struct debug_control radv_debug_options[] = {
{"novrsflatshading", RADV_DEBUG_NO_VRS_FLAT_SHADING},
{"noatocdithering", RADV_DEBUG_NO_ATOC_DITHERING},
{"nonggc", RADV_DEBUG_NO_NGGC},
{"prologs", RADV_DEBUG_DUMP_PROLOGS},
{NULL, 0}};
const char *
@ -2666,6 +2667,30 @@ radv_device_finish_border_color(struct radv_device *device)
}
}
/* Create the per-device cache of compiled vertex shader prologs together
 * with the rwlock that guards it.
 */
static VkResult
radv_device_init_vs_prologs(struct radv_device *device)
{
   u_rwlock_init(&device->vs_prologs_lock);

   struct hash_table *table =
      _mesa_hash_table_create(NULL, &radv_hash_vs_prolog, &radv_cmp_vs_prolog);
   device->vs_prologs = table;

   return table ? VK_SUCCESS
                : vk_error(device->physical_device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
}
/* Tear down the VS prolog cache, freeing every cached key and prolog.
 * Safe to call when the cache was never created.
 */
static void
radv_device_finish_vs_prologs(struct radv_device *device)
{
   if (!device->vs_prologs)
      return;

   hash_table_foreach(device->vs_prologs, entry)
   {
      /* Keys are heap copies made at insertion time; prologs own GPU memory. */
      free((void *)entry->key);
      radv_prolog_destroy(device, entry->data);
   }
   _mesa_hash_table_destroy(device->vs_prologs, NULL);
}
VkResult
radv_device_init_vrs_state(struct radv_device *device)
{
@ -2799,6 +2824,7 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
bool custom_border_colors = false;
bool attachment_vrs_enabled = false;
bool image_float32_atomics = false;
bool vs_prologs = false;
/* Check enabled features */
if (pCreateInfo->pEnabledFeatures) {
@ -3090,6 +3116,12 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
goto fail;
}
if (vs_prologs) {
result = radv_device_init_vs_prologs(device);
if (result != VK_SUCCESS)
goto fail;
}
for (int family = 0; family < RADV_MAX_QUEUE_FAMILIES; ++family) {
device->empty_cs[family] = device->ws->cs_create(device->ws, family);
if (!device->empty_cs[family])
@ -3156,6 +3188,7 @@ fail:
if (device->gfx_init)
device->ws->buffer_destroy(device->ws, device->gfx_init);
radv_device_finish_vs_prologs(device);
radv_device_finish_border_color(device);
for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) {
@ -3186,6 +3219,7 @@ radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
if (device->gfx_init)
device->ws->buffer_destroy(device->ws, device->gfx_init);
radv_device_finish_vs_prologs(device);
radv_device_finish_border_color(device);
radv_device_finish_vrs_image(device);

View File

@ -2742,8 +2742,8 @@ radv_determine_ngg_settings(struct radv_pipeline *pipeline,
: nir[es_stage]->info.tess.primitive_mode == GL_ISOLINES ? 2
: 3;
infos[es_stage].has_ngg_culling =
radv_consider_culling(device, nir[es_stage], ps_inputs_read, num_vertices_per_prim);
infos[es_stage].has_ngg_culling = radv_consider_culling(
device, nir[es_stage], ps_inputs_read, num_vertices_per_prim, &infos[es_stage]);
nir_function_impl *impl = nir_shader_get_entrypoint(nir[es_stage]);
infos[es_stage].has_ngg_early_prim_export = exec_list_is_singular(&impl->body);
@ -5386,7 +5386,10 @@ radv_pipeline_init_vertex_input_state(struct radv_pipeline *pipeline,
}
pipeline->use_per_attribute_vb_descs = info->vs.use_per_attribute_vb_descs;
pipeline->vb_desc_usage_mask = info->vs.vb_desc_usage_mask;
if (info->vs.dynamic_inputs)
pipeline->vb_desc_usage_mask = BITFIELD_MASK(util_last_bit(info->vs.vb_desc_usage_mask));
else
pipeline->vb_desc_usage_mask = info->vs.vb_desc_usage_mask;
pipeline->vb_desc_alloc_size = util_bitcount(pipeline->vb_desc_usage_mask) * 16;
}

View File

@ -832,6 +832,9 @@ struct radv_device {
struct radv_buffer *buffer; /* HTILE */
struct radv_device_memory *mem;
} vrs;
struct u_rwlock vs_prologs_lock;
struct hash_table *vs_prologs;
};
VkResult _radv_device_set_lost(struct radv_device *device, const char *file, int line,
@ -997,7 +1000,8 @@ enum radv_dynamic_state_bits {
RADV_DYNAMIC_LOGIC_OP = 1ull << 26,
RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE = 1ull << 27,
RADV_DYNAMIC_COLOR_WRITE_ENABLE = 1ull << 28,
RADV_DYNAMIC_ALL = (1ull << 29) - 1,
RADV_DYNAMIC_VERTEX_INPUT = 1ull << 29,
RADV_DYNAMIC_ALL = (1ull << 30) - 1,
};
enum radv_cmd_dirty_bits {
@ -1032,12 +1036,14 @@ enum radv_cmd_dirty_bits {
RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP = 1ull << 26,
RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE = 1ull << 27,
RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE = 1ull << 28,
RADV_CMD_DIRTY_DYNAMIC_ALL = (1ull << 29) - 1,
RADV_CMD_DIRTY_PIPELINE = 1ull << 29,
RADV_CMD_DIRTY_INDEX_BUFFER = 1ull << 30,
RADV_CMD_DIRTY_FRAMEBUFFER = 1ull << 31,
RADV_CMD_DIRTY_VERTEX_BUFFER = 1ull << 32,
RADV_CMD_DIRTY_STREAMOUT_BUFFER = 1ull << 33
RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT = 1ull << 29,
RADV_CMD_DIRTY_DYNAMIC_ALL = (1ull << 30) - 1,
RADV_CMD_DIRTY_PIPELINE = 1ull << 30,
RADV_CMD_DIRTY_INDEX_BUFFER = 1ull << 31,
RADV_CMD_DIRTY_FRAMEBUFFER = 1ull << 32,
RADV_CMD_DIRTY_VERTEX_BUFFER = 1ull << 33,
RADV_CMD_DIRTY_STREAMOUT_BUFFER = 1ull << 34,
RADV_CMD_DIRTY_VERTEX_STATE = RADV_CMD_DIRTY_VERTEX_BUFFER | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT,
};
enum radv_cmd_flush_bits {
@ -1349,6 +1355,7 @@ struct radv_cmd_state {
struct radv_render_pass *pass;
const struct radv_subpass *subpass;
struct radv_dynamic_state dynamic;
struct radv_vs_input_state dynamic_vs_input;
struct radv_attachment_state *attachments;
struct radv_streamout_state streamout;
VkRect2D render_area;
@ -1414,6 +1421,10 @@ struct radv_cmd_state {
bool uses_draw_indirect_multi;
uint32_t rt_stack_size;
struct radv_shader_prolog *emitted_vs_prolog;
uint32_t *emitted_vs_prolog_key;
uint32_t emitted_vs_prolog_key_hash;
};
struct radv_cmd_pool {
@ -1531,6 +1542,10 @@ void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uin
void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer);
void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer);
uint32_t radv_hash_vs_prolog(const void *key_);
bool radv_cmp_vs_prolog(const void *a_, const void *b_);
bool radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, unsigned size,
unsigned *out_offset, void **ptr);
void radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer,

View File

@ -888,8 +888,8 @@ radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
}
bool
radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
uint64_t ps_inputs_read, unsigned num_vertices_per_primitive)
radv_consider_culling(struct radv_device *device, struct nir_shader *nir, uint64_t ps_inputs_read,
unsigned num_vertices_per_primitive, const struct radv_shader_info *info)
{
/* Culling doesn't make sense for meta shaders. */
if (!!nir->info.name)
@ -899,6 +899,10 @@ radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
if (nir->info.outputs_written & (VARYING_BIT_VIEWPORT | VARYING_BIT_VIEWPORT_MASK))
return false;
/* We don't support culling with vertex shader prologs. */
if (info->vs.has_prolog)
return false;
if (!device->physical_device->use_ngg_culling)
return false;
@ -1910,6 +1914,72 @@ radv_create_trap_handler_shader(struct radv_device *device)
return shader;
}
/* Allocate GPU memory for a compiled prolog binary, copy the code in and
 * record the register state needed to bind it.
 *
 * Returns NULL on failure; the caller owns the returned prolog and frees it
 * with radv_prolog_destroy().
 *
 * Fix vs. original: guard against a NULL binary.  radv_create_vs_prolog
 * passes the binary through unchecked, so a failed compilation would
 * otherwise dereference NULL here.
 */
static struct radv_shader_prolog *
upload_vs_prolog(struct radv_device *device, struct radv_prolog_binary *bin, unsigned wave_size)
{
   if (!bin)
      return NULL;

   struct radv_shader_prolog *prolog = malloc(sizeof(struct radv_shader_prolog));
   if (!prolog)
      return NULL;

   prolog->alloc = alloc_shader_memory(device, bin->code_size, NULL);
   if (!prolog->alloc) {
      free(prolog);
      return NULL;
   }

   prolog->bo = prolog->alloc->arena->bo;
   char *dest_ptr = prolog->alloc->arena->ptr + prolog->alloc->offset;

   memcpy(dest_ptr, bin->data, bin->code_size);

   /* RSRC1 register counts: VGPRs are allocated in granules of 8 for wave32
    * and 4 for wave64, SGPRs in granules of 8.  NOTE(review): matches the
    * S_00B848/S_00B228 field encodings -- confirm against the register docs.
    */
   prolog->rsrc1 = S_00B848_VGPRS((bin->num_vgprs - 1) / (wave_size == 32 ? 8 : 4)) |
                   S_00B228_SGPRS((bin->num_sgprs - 1) / 8);
   prolog->num_preserved_sgprs = bin->num_preserved_sgprs;

   return prolog;
}
/* Build and compile a standalone vertex shader prolog for the given key and
 * upload it to GPU memory.  Returns NULL on failure; the caller frees the
 * result with radv_prolog_destroy().
 */
struct radv_shader_prolog *
radv_create_vs_prolog(struct radv_device *device, const struct radv_vs_prolog_key *key)
{
   struct radv_nir_compiler_options options = {0};
   options.explicit_scratch_args = true;
   options.family = device->physical_device->rad_info.family;
   options.chip_class = device->physical_device->rad_info.chip_class;
   options.info = &device->physical_device->rad_info;
   options.address32_hi = device->physical_device->rad_info.address32_hi;
   options.dump_shader = device->instance->debug_flags & RADV_DEBUG_DUMP_PROLOGS;

   /* Describe the prolog as a VS consuming every attribute the key covers,
    * so the declared arguments match what the main shader expects.
    */
   struct radv_shader_info info = {0};
   info.wave_size = key->wave32 ? 32 : 64;
   info.vs.needs_instance_id = true;
   info.vs.needs_base_instance = true;
   info.vs.needs_draw_id = true;
   info.vs.use_per_attribute_vb_descs = true;
   info.vs.vb_desc_usage_mask = BITFIELD_MASK(key->num_attributes);
   info.vs.has_prolog = true;
   info.vs.as_ls = key->as_ls;
   info.is_ngg = key->is_ngg;

   struct radv_shader_args args = {0};
   args.options = &options;
   args.shader_info = &info;
   /* Declare args for the hw stage the VS is merged into (TCS/GS) or plain VS. */
   radv_declare_shader_args(&args, key->next_stage, key->next_stage != MESA_SHADER_VERTEX,
                            MESA_SHADER_VERTEX);

#ifdef LLVM_AVAILABLE
   /* Dumping uses the LLVM disassembler, which needs one-time init. */
   if (options.dump_shader)
      ac_init_llvm_once();
#endif

   struct radv_prolog_binary *binary = NULL;
   aco_compile_vs_prolog(key, &binary, &args);
   /* upload_vs_prolog copies the code into GPU memory, so the CPU-side
    * binary can be freed right away.  NOTE(review): binary stays NULL if
    * compilation fails -- verify the upload path tolerates that.
    */
   struct radv_shader_prolog *prolog = upload_vs_prolog(device, binary, info.wave_size);
   free(binary);

   return prolog;
}
void
radv_shader_variant_destroy(struct radv_device *device, struct radv_shader_variant *variant)
{
@ -1926,6 +1996,16 @@ radv_shader_variant_destroy(struct radv_device *device, struct radv_shader_varia
free(variant);
}
/* Free a VS prolog's GPU memory and CPU struct.  Accepts NULL so callers can
 * destroy unconditionally.
 */
void
radv_prolog_destroy(struct radv_device *device, struct radv_shader_prolog *prolog)
{
   if (prolog) {
      free_shader_memory(device, prolog->alloc);
      free(prolog);
   }
}
uint64_t
radv_shader_variant_get_va(const struct radv_shader_variant *variant)
{

View File

@ -46,6 +46,7 @@ struct radv_device;
struct radv_pipeline;
struct radv_pipeline_cache;
struct radv_pipeline_key;
struct radv_vs_input_state;
enum radv_vs_input_alpha_adjust {
ALPHA_ADJUST_NONE = 0,
@ -71,6 +72,7 @@ struct radv_pipeline_key {
enum radv_vs_input_alpha_adjust vertex_alpha_adjust[MAX_VERTEX_ATTRIBS];
uint32_t vertex_post_shuffle;
uint32_t provoking_vtx_last : 1;
uint32_t dynamic_input_state : 1;
uint8_t topology;
} vs;
@ -145,6 +147,7 @@ enum radv_ud_index {
AC_UD_SHADER_START = 9,
AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
AC_UD_VS_BASE_VERTEX_START_INSTANCE,
AC_UD_VS_PROLOG_INPUTS,
AC_UD_VS_MAX_UD,
AC_UD_PS_MAX_UD,
AC_UD_CS_GRID_SIZE = AC_UD_SHADER_START,
@ -259,6 +262,8 @@ struct radv_shader_info {
bool needs_base_instance;
bool use_per_attribute_vb_descs;
uint32_t vb_desc_usage_mask;
bool has_prolog;
bool dynamic_inputs;
} vs;
struct {
uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1];
@ -353,6 +358,37 @@ struct radv_shader_info {
struct gfx10_ngg_info ngg_info;
};
/* Dynamic vertex input state consumed by vertex shader prologs
 * (VK_EXT_vertex_input_dynamic_state path).  One entry per vertex
 * attribute; all per-attribute arrays are indexed by attribute location.
 * NOTE(review): field semantics below are inferred from names and the
 * surrounding prolog-key usage — confirm against the prolog compiler.
 */
struct radv_vs_input_state {
   /* Bitmask of attribute locations that are present/enabled. */
   uint32_t attribute_mask;
   /* Vertex buffer binding index for each attribute. */
   uint8_t bindings[MAX_VERTEX_ATTRIBS];
   /* Bitmask of attributes fetched per-instance rather than per-vertex. */
   uint32_t instance_rate_inputs;
   /* Bitmask of instance-rate attributes whose divisor is not 0 or 1 —
    * presumably these need an actual division in the prolog.
    */
   uint32_t nontrivial_divisors;
   /* Instance rate divisor per attribute. */
   uint32_t divisors[MAX_VERTEX_ATTRIBS];
   /* Byte offset of each attribute within its binding. */
   uint32_t offsets[MAX_VERTEX_ATTRIBS];
   /* Bitmask of attributes needing a BGRA->RGBA component swizzle —
    * TODO confirm; matches the vertex_post_shuffle pipeline key.
    */
   uint32_t post_shuffle;
   /* Having two separate fields instead of a single uint64_t makes it easier to remove attributes
    * using bitwise arithmetic.
    * Together they hold 2 bits of radv_vs_input_alpha_adjust per attribute.
    */
   uint32_t alpha_adjust_lo;
   uint32_t alpha_adjust_hi;
   /* Hardware data format per attribute — assumed to be a buffer data
    * format enum; verify against the prolog codegen.
    */
   uint8_t formats[MAX_VERTEX_ATTRIBS];
   /* Required alignment of each attribute's fetch, minus one (i.e. an
    * alignment mask).
    */
   uint8_t format_align_req_minus_1[MAX_VERTEX_ATTRIBS];
   /* Size in bytes of each attribute's format. */
   uint8_t format_sizes[MAX_VERTEX_ATTRIBS];
};
/* Key describing a vertex shader prolog to compile (see
 * aco_compile_vs_prolog / radv_create_vs_prolog).
 */
struct radv_vs_prolog_key {
   /* Dynamic vertex input state the prolog must implement.
    * NOTE(review): pointer, not a copy — lifetime is owned by the caller.
    */
   struct radv_vs_input_state *state;
   /* Number of vertex attributes the prolog loads. */
   unsigned num_attributes;
   /* Bitmask of attributes whose fetch is misaligned for their format —
    * presumably forcing a slower per-component load path; confirm.
    */
   uint32_t misaligned_mask;
   /* Main shader runs as a merged LS stage (VS+TCS, GFX9+). */
   bool as_ls;
   /* Main shader runs as an NGG shader (GFX10+). */
   bool is_ngg;
   /* Compile for wave32 rather than wave64. */
   bool wave32;
   /* Stage the vertex shader is merged into (MESA_SHADER_VERTEX when
    * not merged) — used when declaring the shader arguments.
    */
   gl_shader_stage next_stage;
};
enum radv_shader_binary_type { RADV_BINARY_TYPE_LEGACY, RADV_BINARY_TYPE_RTLD };
struct radv_shader_binary {
@ -387,6 +423,14 @@ struct radv_shader_binary_rtld {
uint8_t data[0];
};
/* Compiled vertex shader prolog code plus the register-usage metadata
 * needed to upload it (see upload_vs_prolog / radv_shader_prolog.rsrc1).
 */
struct radv_prolog_binary {
   /* SGPRs used by the prolog. */
   uint8_t num_sgprs;
   /* VGPRs used by the prolog. */
   uint8_t num_vgprs;
   /* Number of input SGPRs the prolog leaves untouched for the main
    * shader — TODO confirm exact semantics against the prolog compiler.
    */
   uint8_t num_preserved_sgprs;
   /* Size of the machine code in bytes. */
   unsigned code_size;
   /* Machine code, code_size bytes (flexible array member). */
   uint8_t data[0];
};
struct radv_shader_arena {
struct list_head list;
struct list_head entries;
@ -429,6 +473,13 @@ struct radv_shader_variant {
uint32_t *statistics;
};
/* An uploaded, GPU-resident vertex shader prolog, created by
 * radv_create_vs_prolog and freed with radv_prolog_destroy.
 */
struct radv_shader_prolog {
   /* Buffer object holding the prolog code. */
   struct radeon_winsys_bo *bo;
   /* Sub-allocation within the shader arena; returned to
    * free_shader_memory() on destruction.
    */
   union radv_shader_arena_block *alloc;
   /* Precomputed SPI_SHADER_PGM_RSRC1 value (register counts etc.) —
    * presumably merged with the main shader's at bind time; confirm.
    */
   uint32_t rsrc1;
   /* Copied from radv_prolog_binary::num_preserved_sgprs. */
   uint8_t num_preserved_sgprs;
};
void radv_optimize_nir(const struct radv_device *device, struct nir_shader *shader,
bool optimize_conservatively, bool allow_copies);
void radv_optimize_nir_algebraic(nir_shader *shader, bool opt_offsets);
@ -469,8 +520,13 @@ radv_create_gs_copy_shader(struct radv_device *device, struct nir_shader *nir,
struct radv_shader_variant *radv_create_trap_handler_shader(struct radv_device *device);
struct radv_shader_prolog *radv_create_vs_prolog(struct radv_device *device,
const struct radv_vs_prolog_key *key);
void radv_shader_variant_destroy(struct radv_device *device, struct radv_shader_variant *variant);
void radv_prolog_destroy(struct radv_device *device, struct radv_shader_prolog *prolog);
uint64_t radv_shader_variant_get_va(const struct radv_shader_variant *variant);
struct radv_shader_variant *radv_find_shader_variant(struct radv_device *device, uint64_t pc);
@ -577,7 +633,8 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
const struct radv_pipeline_key *pl_key);
bool radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
uint64_t ps_inputs_read, unsigned num_vertices_per_primitive);
uint64_t ps_inputs_read, unsigned num_vertices_per_primitive,
const struct radv_shader_info *info);
void radv_get_nir_options(struct radv_physical_device *device);

View File

@ -184,6 +184,10 @@ allocate_user_sgprs(struct radv_shader_args *args, gl_shader_stage stage, bool h
/* 2 user sgprs will always be allocated for scratch/rings */
user_sgpr_count += 2;
/* prolog inputs */
if (args->shader_info->vs.has_prolog)
user_sgpr_count += 2;
switch (stage) {
case MESA_SHADER_COMPUTE:
if (args->shader_info->cs.uses_sbt)
@ -281,6 +285,9 @@ static void
declare_vs_specific_input_sgprs(struct radv_shader_args *args, gl_shader_stage stage,
bool has_previous_stage, gl_shader_stage previous_stage)
{
if (args->shader_info->vs.has_prolog)
ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_INT, &args->prolog_inputs);
if (!args->is_gs_copy_shader && (stage == MESA_SHADER_VERTEX ||
(has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
if (args->shader_info->vs.vb_desc_usage_mask) {
@ -328,6 +335,17 @@ declare_vs_input_vgprs(struct radv_shader_args *args)
}
}
}
if (args->shader_info->vs.dynamic_inputs) {
assert(args->shader_info->vs.use_per_attribute_vb_descs);
unsigned num_attributes = util_last_bit(args->shader_info->vs.vb_desc_usage_mask);
for (unsigned i = 0; i < num_attributes; i++)
ac_add_arg(&args->ac, AC_ARG_VGPR, 4, AC_ARG_INT, &args->vs_inputs[i]);
/* Ensure the main shader doesn't use less vgprs than the prolog. The prolog requires one
* VGPR more than the number of shader arguments in the case of non-trivial divisors on GFX8.
*/
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
}
}
static void
@ -463,6 +481,9 @@ set_vs_specific_input_locs(struct radv_shader_args *args, gl_shader_stage stage,
bool has_previous_stage, gl_shader_stage previous_stage,
uint8_t *user_sgpr_idx)
{
if (args->prolog_inputs.used)
set_loc_shader(args, AC_UD_VS_PROLOG_INPUTS, user_sgpr_idx, 2);
if (!args->is_gs_copy_shader && (stage == MESA_SHADER_VERTEX ||
(has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
if (args->ac.vertex_buffers.used) {

View File

@ -45,6 +45,9 @@ struct radv_shader_args {
struct ac_arg ngg_viewport_scale[2];
struct ac_arg ngg_viewport_translate[2];
struct ac_arg prolog_inputs;
struct ac_arg vs_inputs[MAX_VERTEX_ATTRIBS];
bool is_gs_copy_shader;
bool is_trap_handler_shader;
};

View File

@ -608,12 +608,23 @@ radv_nir_shader_info_pass(struct radv_device *device, const struct nir_shader *n
}
if (nir->info.stage == MESA_SHADER_VERTEX) {
if (pipeline_key->vs.dynamic_input_state && nir->info.inputs_read) {
info->vs.has_prolog = true;
info->vs.dynamic_inputs = true;
}
/* Use per-attribute vertex descriptors to prevent faults and
* for correct bounds checking.
*/
info->vs.use_per_attribute_vb_descs = device->robust_buffer_access;
info->vs.use_per_attribute_vb_descs = device->robust_buffer_access || info->vs.dynamic_inputs;
}
/* We have to ensure consistent input register assignments between the main shader and the
* prolog. */
info->vs.needs_instance_id |= info->vs.has_prolog;
info->vs.needs_base_instance |= info->vs.has_prolog;
info->vs.needs_draw_id |= info->vs.has_prolog;
nir_foreach_shader_in_variable (variable, nir)
gather_info_input_decl(nir, variable, pipeline_key, info);