radv: use typed buffer loads for vertex input fetches

This drastically reduces the number of SGPRs because the driver
now uses descriptors per vertex binding, instead of per vertex
attribute format.

29077 shaders in 15096 tests
Totals:
SGPRS: 1354285 -> 1282109 (-5.33 %)
VGPRS: 909896 -> 908800 (-0.12 %)
Spilled SGPRs: 24840 -> 24811 (-0.12 %)
Code Size: 49221144 -> 48986628 (-0.48 %) bytes
Max Waves: 243930 -> 244229 (0.12 %)

Totals from affected shaders:
SGPRS: 390648 -> 318472 (-18.48 %)
VGPRS: 288432 -> 287336 (-0.38 %)
Spilled SGPRs: 94 -> 65 (-30.85 %)
Code Size: 11548412 -> 11313896 (-2.03 %) bytes
Max Waves: 86460 -> 86759 (0.35 %)

This gives a really tiny boost.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
This commit is contained in:
Samuel Pitoiset 2019-02-26 13:42:28 +01:00
parent 0b9a06a1a0
commit a66b186beb
4 changed files with 57 additions and 53 deletions

View File

@ -1988,13 +1988,13 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
{
if ((pipeline_is_dirty ||
(cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
cmd_buffer->state.pipeline->vertex_elements.count &&
cmd_buffer->state.pipeline->num_vertex_bindings &&
radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.info.vs.has_vertex_buffers) {
struct radv_vertex_elements_info *velems = &cmd_buffer->state.pipeline->vertex_elements;
unsigned vb_offset;
void *vb_ptr;
uint32_t i = 0;
uint32_t count = velems->count;
uint32_t count = cmd_buffer->state.pipeline->num_vertex_bindings;
uint64_t va;
/* allocate some descriptor state for vertex buffers */
@ -2005,13 +2005,15 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
for (i = 0; i < count; i++) {
uint32_t *desc = &((uint32_t *)vb_ptr)[i * 4];
uint32_t offset;
int vb = velems->binding[i];
struct radv_buffer *buffer = cmd_buffer->vertex_bindings[vb].buffer;
uint32_t stride = cmd_buffer->state.pipeline->binding_stride[vb];
struct radv_buffer *buffer = cmd_buffer->vertex_bindings[i].buffer;
uint32_t stride = cmd_buffer->state.pipeline->binding_stride[i];
if (!buffer)
continue;
va = radv_buffer_get_va(buffer->bo);
offset = cmd_buffer->vertex_bindings[vb].offset + velems->offset[i];
offset = cmd_buffer->vertex_bindings[i].offset;
va += offset + buffer->offset;
desc[0] = va;
desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
@ -2019,7 +2021,12 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
desc[2] = (buffer->size - offset - velems->format_size[i]) / stride + 1;
else
desc[2] = buffer->size - offset;
desc[3] = velems->rsrc_word3[i];
desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
}
va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);

View File

@ -2008,6 +2008,8 @@ adjust_vertex_fetch_alpha(struct radv_shader_context *ctx,
LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);
alpha = LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.f32, "");
if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
alpha = LLVMBuildFPToUI(ctx->ac.builder, alpha, ctx->ac.i32, "");
else
@ -2035,7 +2037,7 @@ adjust_vertex_fetch_alpha(struct radv_shader_context *ctx,
alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
}
return alpha;
return LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.i32, "");
}
static unsigned
@ -2096,7 +2098,7 @@ radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx,
for (unsigned i = num_channels; i < 4; i++) {
chan[i] = i == 3 ? one : zero;
chan[i] = ac_to_float(&ctx->ac, chan[i]);
chan[i] = ac_to_integer(&ctx->ac, chan[i]);
}
return ac_build_gather_values(&ctx->ac, chan, 4);
@ -2154,20 +2156,49 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
} else
buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
ctx->abi.base_vertex, "");
t_offset = LLVMConstInt(ctx->ac.i32, attrib_index, false);
t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
/* Adjust the number of channels to load based on the vertex
* attribute format.
*/
unsigned num_format_channels = get_num_channels_from_data_format(data_format);
unsigned num_channels = MIN2(num_input_channels, num_format_channels);
unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[attrib_index];
unsigned attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[attrib_index];
unsigned attrib_stride = ctx->options->key.vs.vertex_attribute_strides[attrib_index];
input = ac_build_buffer_load_format(&ctx->ac, t_list,
if (attrib_stride != 0 && attrib_offset > attrib_stride) {
LLVMValueRef buffer_offset =
LLVMConstInt(ctx->ac.i32,
attrib_offset / attrib_stride, false);
buffer_index = LLVMBuildAdd(ctx->ac.builder,
buffer_index,
ctx->ac.i32_0,
num_channels, false, true);
buffer_offset, "");
attrib_offset = attrib_offset % attrib_stride;
}
t_offset = LLVMConstInt(ctx->ac.i32, attrib_binding, false);
t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
input = ac_build_tbuffer_load(&ctx->ac, t_list, buffer_index,
LLVMConstInt(ctx->ac.i32, attrib_offset, false),
ctx->ac.i32_0, ctx->ac.i32_0,
num_channels,
data_format, num_format,
false, false, true);
if (ctx->options->key.vs.post_shuffle & (1 << attrib_index)) {
if (num_channels > 1) {
LLVMValueRef c[4];
c[0] = ac_llvm_extract_elem(&ctx->ac, input, 2);
c[1] = ac_llvm_extract_elem(&ctx->ac, input, 1);
c[2] = ac_llvm_extract_elem(&ctx->ac, input, 0);
c[3] = ac_llvm_extract_elem(&ctx->ac, input, 3);
input = ac_build_gather_values(&ctx->ac, c, 4);
}
}
input = radv_fixup_vertex_input_fetches(ctx, input, num_channels,
is_float);

View File

@ -1244,25 +1244,6 @@ si_conv_prim_to_gs_out(enum VkPrimitiveTopology topology)
}
}
static unsigned si_map_swizzle(unsigned swizzle)
{
switch (swizzle) {
case VK_SWIZZLE_Y:
return V_008F0C_SQ_SEL_Y;
case VK_SWIZZLE_Z:
return V_008F0C_SQ_SEL_Z;
case VK_SWIZZLE_W:
return V_008F0C_SQ_SEL_W;
case VK_SWIZZLE_0:
return V_008F0C_SQ_SEL_0;
case VK_SWIZZLE_1:
return V_008F0C_SQ_SEL_1;
default: /* VK_SWIZZLE_X */
return V_008F0C_SQ_SEL_X;
}
}
static unsigned radv_dynamic_state_mask(VkDynamicState state)
{
switch(state) {
@ -3557,24 +3538,10 @@ radv_compute_vertex_input_state(struct radv_pipeline *pipeline,
&vi_info->pVertexAttributeDescriptions[i];
unsigned loc = desc->location;
const struct vk_format_description *format_desc;
int first_non_void;
uint32_t num_format, data_format;
format_desc = vk_format_description(desc->format);
first_non_void = vk_format_get_first_non_void_channel(desc->format);
num_format = radv_translate_buffer_numformat(format_desc, first_non_void);
data_format = radv_translate_buffer_dataformat(format_desc, first_non_void);
velems->rsrc_word3[loc] = S_008F0C_DST_SEL_X(si_map_swizzle(format_desc->swizzle[0])) |
S_008F0C_DST_SEL_Y(si_map_swizzle(format_desc->swizzle[1])) |
S_008F0C_DST_SEL_Z(si_map_swizzle(format_desc->swizzle[2])) |
S_008F0C_DST_SEL_W(si_map_swizzle(format_desc->swizzle[3])) |
S_008F0C_NUM_FORMAT(num_format) |
S_008F0C_DATA_FORMAT(data_format);
velems->format_size[loc] = format_desc->block.bits / 8;
velems->offset[loc] = desc->offset;
velems->binding[loc] = desc->binding;
velems->count = MAX2(velems->count, loc + 1);
}
for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
@ -3582,6 +3549,8 @@ radv_compute_vertex_input_state(struct radv_pipeline *pipeline,
&vi_info->pVertexBindingDescriptions[i];
pipeline->binding_stride[desc->binding] = desc->stride;
pipeline->num_vertex_bindings =
MAX2(pipeline->num_vertex_bindings, desc->binding + 1);
}
}

View File

@ -1342,11 +1342,7 @@ struct radv_prim_vertex_count {
};
struct radv_vertex_elements_info {
uint32_t rsrc_word3[MAX_VERTEX_ATTRIBS];
uint32_t format_size[MAX_VERTEX_ATTRIBS];
uint32_t binding[MAX_VERTEX_ATTRIBS];
uint32_t offset[MAX_VERTEX_ATTRIBS];
uint32_t count;
};
struct radv_ia_multi_vgt_param_helpers {
@ -1378,6 +1374,7 @@ struct radv_pipeline {
struct radv_vertex_elements_info vertex_elements;
uint32_t binding_stride[MAX_VBS];
uint8_t num_vertex_bindings;
uint32_t user_data_0[MESA_SHADER_STAGES];
union {