diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index 7b3a3a47dd5..53303e0e745 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -653,6 +653,35 @@ void anv_CmdBindVertexBuffers(
    }
 }
 
+void anv_CmdBindTransformFeedbackBuffersEXT(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    firstBinding,
+    uint32_t                                    bindingCount,
+    const VkBuffer*                             pBuffers,
+    const VkDeviceSize*                         pOffsets,
+    const VkDeviceSize*                         pSizes)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_xfb_binding *xfb = cmd_buffer->state.xfb_bindings;
+
+   /* We have to defer setting up the transform feedback buffers until draw
+    * time because we need the buffer strides from the pipeline. */
+
+   assert(firstBinding + bindingCount <= MAX_XFB_BUFFERS);
+   for (uint32_t i = 0; i < bindingCount; i++) {
+      if (pBuffers[i] == VK_NULL_HANDLE) {
+         xfb[firstBinding + i].buffer = NULL;
+      } else {
+         ANV_FROM_HANDLE(anv_buffer, buffer, pBuffers[i]);
+         xfb[firstBinding + i].buffer = buffer;
+         xfb[firstBinding + i].offset = pOffsets[i];
+         xfb[firstBinding + i].size =
+            anv_buffer_get_range(buffer, pOffsets[i],
+                                 pSizes ? pSizes[i] : VK_WHOLE_SIZE);
+      }
+   }
+}
+
 enum isl_format
 anv_isl_format_for_descriptor_type(VkDescriptorType type)
 {
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 9d8a982110b..6a5e51d0e91 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -970,6 +970,14 @@ void anv_GetPhysicalDeviceFeatures2(
          break;
       }
 
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT: {
+         VkPhysicalDeviceTransformFeedbackFeaturesEXT *features =
+            (VkPhysicalDeviceTransformFeedbackFeaturesEXT *)ext;
+         features->transformFeedback = VK_TRUE;
+         features->geometryStreams = VK_TRUE;
+         break;
+      }
+
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_FEATURES_EXT: {
          VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *features =
            (VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *)ext;
@@ -1287,6 +1295,23 @@ void anv_GetPhysicalDeviceProperties2(
          break;
       }
 
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_PROPERTIES_EXT: {
+         VkPhysicalDeviceTransformFeedbackPropertiesEXT *props =
+            (VkPhysicalDeviceTransformFeedbackPropertiesEXT *)ext;
+
+         props->maxTransformFeedbackStreams = MAX_XFB_STREAMS;
+         props->maxTransformFeedbackBuffers = MAX_XFB_BUFFERS;
+         props->maxTransformFeedbackBufferSize = (1ull << 32);
+         props->maxTransformFeedbackStreamDataSize = 128 * 4;
+         props->maxTransformFeedbackBufferDataSize = 128 * 4;
+         props->maxTransformFeedbackBufferDataStride = 2048;
+         props->transformFeedbackQueries = VK_FALSE;
+         props->transformFeedbackStreamsLinesTriangles = VK_FALSE;
+         props->transformFeedbackRasterizationStreamSelect = VK_FALSE;
+         props->transformFeedbackDraw = VK_FALSE;
+         break;
+      }
+
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_PROPERTIES_EXT: {
          VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *props =
            (VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *)ext;
diff --git a/src/intel/vulkan/anv_extensions.py b/src/intel/vulkan/anv_extensions.py
index 29875f3fbce..22bad94e5b8 100644
--- a/src/intel/vulkan/anv_extensions.py
+++ b/src/intel/vulkan/anv_extensions.py
@@ -132,7 +132,7 @@ EXTENSIONS = [
     Extension('VK_EXT_scalar_block_layout',               1, True),
     Extension('VK_EXT_shader_viewport_index_layer',       1, True),
     Extension('VK_EXT_shader_stencil_export',             1, 'device->info.gen >= 9'),
-    Extension('VK_EXT_transform_feedback',                1, False),
+    Extension('VK_EXT_transform_feedback',                1, True),
     Extension('VK_EXT_vertex_attribute_divisor',          3, True),
     Extension('VK_GOOGLE_decorate_string',                1, True),
     Extension('VK_GOOGLE_hlsl_functionality1',            1, True),
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 75e3c362a1d..be869cfa061 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -32,6 +32,7 @@
 #include "anv_private.h"
 #include "compiler/brw_nir.h"
 #include "anv_nir.h"
+#include "nir/nir_xfb_info.h"
 #include "spirv/nir_spirv.h"
 #include "vk_util.h"
 
@@ -138,6 +139,7 @@ anv_shader_compile_to_nir(struct anv_device *device,
          .device_group = true,
          .draw_parameters = true,
          .float64 = pdevice->info.gen >= 8,
+         .geometry_streams = true,
          .image_write_without_format = true,
          .int16 = pdevice->info.gen >= 8,
          .int64 = pdevice->info.gen >= 8,
@@ -155,6 +157,7 @@ anv_shader_compile_to_nir(struct anv_device *device,
          .subgroup_shuffle = true,
          .subgroup_vote = true,
         .tessellation = true,
+         .transform_feedback = pdevice->info.gen >= 8,
         .variable_pointers = true,
      },
      .ubo_ptr_type = glsl_vector_type(GLSL_TYPE_UINT, 2),
@@ -1082,6 +1085,12 @@ anv_pipeline_compile_graphics(struct anv_pipeline *pipeline,
 
       void *stage_ctx = ralloc_context(NULL);
 
+      nir_xfb_info *xfb_info = NULL;
+      if (s == MESA_SHADER_VERTEX ||
+          s == MESA_SHADER_TESS_EVAL ||
+          s == MESA_SHADER_GEOMETRY)
+         xfb_info = nir_gather_xfb_info(stages[s].nir, stage_ctx);
+
       anv_pipeline_lower_nir(pipeline, stage_ctx, &stages[s], layout);
 
       const unsigned *code;
@@ -1123,7 +1132,7 @@ anv_pipeline_compile_graphics(struct anv_pipeline *pipeline,
                                        stages[s].nir->constant_data_size,
                                        &stages[s].prog_data.base,
                                        brw_prog_data_size(s),
-                                       NULL, &stages[s].bind_map);
+                                       xfb_info, &stages[s].bind_map);
       if (!bin) {
          ralloc_free(stage_ctx);
         result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 6e89ab7a80b..3889065c93c 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -151,6 +151,8 @@ struct gen_l3_config;
 #define ANV_HZ_FC_VAL 1.0f
 
 #define MAX_VBS         28
+#define MAX_XFB_BUFFERS  4
+#define MAX_XFB_STREAMS  4
 #define MAX_SETS         8
 #define MAX_RTS          8
 #define MAX_VIEWPORTS   16
@@ -1769,6 +1771,7 @@ enum anv_cmd_dirty_bits {
    ANV_CMD_DIRTY_PIPELINE                            = 1 << 9,
    ANV_CMD_DIRTY_INDEX_BUFFER                        = 1 << 10,
    ANV_CMD_DIRTY_RENDER_TARGETS                      = 1 << 11,
+   ANV_CMD_DIRTY_XFB_ENABLE                          = 1 << 12,
 };
 typedef uint32_t anv_cmd_dirty_mask_t;
 
@@ -1972,6 +1975,12 @@ struct anv_vertex_binding {
    VkDeviceSize offset;
 };
 
+struct anv_xfb_binding {
+   struct anv_buffer *                          buffer;
+   VkDeviceSize                                 offset;
+   VkDeviceSize                                 size;
+};
+
 #define ANV_PARAM_PUSH(offset) ((1 << 16) | (uint32_t)(offset))
 #define ANV_PARAM_PUSH_OFFSET(param) ((param) & 0xffff)
 
@@ -2164,6 +2173,8 @@ struct anv_cmd_state {
    VkRect2D render_area;
    uint32_t restart_index;
    struct anv_vertex_binding vertex_bindings[MAX_VBS];
+   bool xfb_enabled;
+   struct anv_xfb_binding xfb_bindings[MAX_XFB_BUFFERS];
    VkShaderStageFlags push_constant_stages;
    struct anv_push_constants * push_constants[MESA_SHADER_STAGES];
    struct anv_state binding_tables[MESA_SHADER_STAGES];
@@ -2582,6 +2593,8 @@ struct anv_pipeline {
       uint32_t instance_divisor;
    } vb[MAX_VBS];
 
+   uint8_t xfb_used;
+
    bool primitive_restart;
    uint32_t topology;
 
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index cec4819ba4a..d58559f8719 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -2642,6 +2642,34 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
    cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;
 
+#if GEN_GEN >= 8
+   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) {
+      /* We don't need any per-buffer dirty tracking because you're not
+       * allowed to bind different XFB buffers while XFB is enabled.
+       */
+      for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
+         struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];
+         anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
+            sob.SOBufferIndex = idx;
+
+            if (cmd_buffer->state.xfb_enabled && xfb->buffer) {
+               sob.SOBufferEnable = true;
+               sob.MOCS = cmd_buffer->device->default_mocs;
+               sob.StreamOffsetWriteEnable = false;
+               sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,
+                                                        xfb->offset);
+               /* Size is in DWords - 1 */
+               sob.SurfaceSize = xfb->size / 4 - 1;
+            }
+         }
+      }
+
+      /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */
+      if (GEN_GEN >= 10)
+         cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
+   }
+#endif
+
    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
 
@@ -3272,6 +3300,107 @@ void genX(CmdDrawIndexedIndirectCountKHR)(
    }
 }
 
+void genX(CmdBeginTransformFeedbackEXT)(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    firstCounterBuffer,
+    uint32_t                                    counterBufferCount,
+    const VkBuffer*                             pCounterBuffers,
+    const VkDeviceSize*                         pCounterBufferOffsets)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   assert(firstCounterBuffer < MAX_XFB_BUFFERS);
+   assert(counterBufferCount <= MAX_XFB_BUFFERS);
+   assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
+
+   /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
+    *
+    *    "Software must ensure that no HW stream output operations can be in
+    *    process or otherwise pending at the point that the MI_LOAD/STORE
+    *    commands are processed. This will likely require a pipeline flush."
+    */
+   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
+   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+   for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
+      /* If we have a counter buffer, this is a resume so we need to load
+       * the value into the streamout offset register.  Otherwise, this is
+       * a begin and we need to reset it to zero.
+       */
+      if (pCounterBuffers &&
+          idx >= firstCounterBuffer &&
+          idx - firstCounterBuffer < counterBufferCount &&
+          pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
+         uint32_t cb_idx = idx - firstCounterBuffer;
+         ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
+         uint64_t offset = pCounterBufferOffsets ?
+                           pCounterBufferOffsets[cb_idx] : 0;
+
+         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+            lrm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
+            lrm.MemoryAddress = anv_address_add(counter_buffer->address,
+                                                offset);
+         }
+      } else {
+         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+            lri.RegisterOffset = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
+            lri.DataDWord = 0;
+         }
+      }
+   }
+
+   cmd_buffer->state.xfb_enabled = true;
+   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
+}
+
+void genX(CmdEndTransformFeedbackEXT)(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    firstCounterBuffer,
+    uint32_t                                    counterBufferCount,
+    const VkBuffer*                             pCounterBuffers,
+    const VkDeviceSize*                         pCounterBufferOffsets)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   assert(firstCounterBuffer < MAX_XFB_BUFFERS);
+   assert(counterBufferCount <= MAX_XFB_BUFFERS);
+   assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
+
+   /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
+    *
+    *    "Software must ensure that no HW stream output operations can be in
+    *    process or otherwise pending at the point that the MI_LOAD/STORE
+    *    commands are processed. This will likely require a pipeline flush."
+    */
+   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
+   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+   for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
+      unsigned idx = firstCounterBuffer + cb_idx;
+
+      /* If we have a counter buffer, this is a pause so we need to store
+       * the current streamout write offset into it.  Otherwise, this is an
+       * end and the value can simply be discarded.
+       */
+      if (pCounterBuffers &&
+          cb_idx < counterBufferCount &&
+          pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
+         ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
+         uint64_t offset = pCounterBufferOffsets ?
+                           pCounterBufferOffsets[cb_idx] : 0;
+
+         anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
+            srm.MemoryAddress = anv_address_add(counter_buffer->address,
+                                                offset);
+            srm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
+         }
+      }
+   }
+
+   cmd_buffer->state.xfb_enabled = false;
+   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
+}
+
 static VkResult
 flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
 {
diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
index 1bc445eae82..899a96fd84f 100644
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -28,6 +28,7 @@
 
 #include "common/gen_l3_config.h"
 #include "common/gen_sample_positions.h"
+#include "nir/nir_xfb_info.h"
 #include "vk_util.h"
 #include "vk_format_info.h"
 
@@ -1127,9 +1128,130 @@ static void
 emit_3dstate_streamout(struct anv_pipeline *pipeline,
                        const VkPipelineRasterizationStateCreateInfo *rs_info)
 {
+#if GEN_GEN >= 8
+   const struct brw_vue_prog_data *prog_data =
+      anv_pipeline_get_last_vue_prog_data(pipeline);
+   const struct brw_vue_map *vue_map = &prog_data->vue_map;
+#endif
+
+   nir_xfb_info *xfb_info;
+   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))
+      xfb_info = pipeline->shaders[MESA_SHADER_GEOMETRY]->xfb_info;
+   else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
+      xfb_info = pipeline->shaders[MESA_SHADER_TESS_EVAL]->xfb_info;
+   else
+      xfb_info = pipeline->shaders[MESA_SHADER_VERTEX]->xfb_info;
+
+   pipeline->xfb_used = xfb_info ?
+                        xfb_info->buffers_written : 0;
+
    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_STREAMOUT), so) {
       so.RenderingDisable = rs_info->rasterizerDiscardEnable;
+
+#if GEN_GEN >= 8
+      if (xfb_info) {
+         so.SOFunctionEnable = true;
+
+         const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info =
+            vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT);
+         so.RenderStreamSelect = stream_info ?
+                                 stream_info->rasterizationStream : 0;
+
+         so.Buffer0SurfacePitch = xfb_info->strides[0];
+         so.Buffer1SurfacePitch = xfb_info->strides[1];
+         so.Buffer2SurfacePitch = xfb_info->strides[2];
+         so.Buffer3SurfacePitch = xfb_info->strides[3];
+
+         int urb_entry_read_offset = 0;
+         int urb_entry_read_length =
+            (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;
+
+         /* We always read the whole vertex.  This could be reduced at some
+          * point by reading less and offsetting the register index in the
+          * SO_DECLs.
+          */
+         so.Stream0VertexReadOffset = urb_entry_read_offset;
+         so.Stream0VertexReadLength = urb_entry_read_length - 1;
+         so.Stream1VertexReadOffset = urb_entry_read_offset;
+         so.Stream1VertexReadLength = urb_entry_read_length - 1;
+         so.Stream2VertexReadOffset = urb_entry_read_offset;
+         so.Stream2VertexReadLength = urb_entry_read_length - 1;
+         so.Stream3VertexReadOffset = urb_entry_read_offset;
+         so.Stream3VertexReadLength = urb_entry_read_length - 1;
+      }
+#endif /* GEN_GEN >= 8 */
    }
+
+#if GEN_GEN >= 8
+   if (xfb_info) {
+      struct GENX(SO_DECL) so_decl[MAX_XFB_STREAMS][128];
+      int next_offset[MAX_XFB_BUFFERS] = {0, 0, 0, 0};
+      int decls[MAX_XFB_STREAMS] = {0, 0, 0, 0};
+
+      memset(so_decl, 0, sizeof(so_decl));
+
+      for (unsigned i = 0; i < xfb_info->output_count; i++) {
+         const nir_xfb_output_info *output = &xfb_info->outputs[i];
+         unsigned buffer = output->buffer;
+         unsigned stream = xfb_info->buffer_to_stream[buffer];
+
+         /* Our hardware is unusual in that it requires us to program
+          * SO_DECLs for fake "hole" components, rather than simply taking
+          * the offset for each real varying.  Each hole can have size 1,
+          * 2, 3, or 4; we program as many size = 4 holes as we can, then
+          * a final hole to accommodate the final 1, 2, or 3 remaining.
+          */
+         int hole_dwords = (output->offset - next_offset[buffer]) / 4;
+         while (hole_dwords > 0) {
+            so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
+               .HoleFlag = 1,
+               .OutputBufferSlot = buffer,
+               .ComponentMask = (1 << MIN2(hole_dwords, 4)) - 1,
+            };
+            hole_dwords -= 4;
+         }
+
+         next_offset[buffer] = output->offset +
+                               __builtin_popcount(output->component_mask) * 4;
+
+         so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
+            .OutputBufferSlot = buffer,
+            .RegisterIndex = vue_map->varying_to_slot[output->location],
+            .ComponentMask = output->component_mask,
+         };
+      }
+
+      int max_decls = 0;
+      for (unsigned s = 0; s < MAX_XFB_STREAMS; s++)
+         max_decls = MAX2(max_decls, decls[s]);
+
+      uint8_t sbs[MAX_XFB_STREAMS] = { };
+      for (unsigned b = 0; b < MAX_XFB_BUFFERS; b++) {
+         if (xfb_info->buffers_written & (1 << b))
+            sbs[xfb_info->buffer_to_stream[b]] |= 1 << b;
+      }
+
+      uint32_t *dw = anv_batch_emitn(&pipeline->batch, 3 + 2 * max_decls,
+                                     GENX(3DSTATE_SO_DECL_LIST),
+                                     .StreamtoBufferSelects0 = sbs[0],
+                                     .StreamtoBufferSelects1 = sbs[1],
+                                     .StreamtoBufferSelects2 = sbs[2],
+                                     .StreamtoBufferSelects3 = sbs[3],
+                                     .NumEntries0 = decls[0],
+                                     .NumEntries1 = decls[1],
+                                     .NumEntries2 = decls[2],
+                                     .NumEntries3 = decls[3]);
+
+      for (int i = 0; i < max_decls; i++) {
+         GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2,
+            &(struct GENX(SO_DECL_ENTRY)) {
+               .Stream0Decl = so_decl[0][i],
+               .Stream1Decl = so_decl[1][i],
+               .Stream2Decl = so_decl[2][i],
+               .Stream3Decl = so_decl[3][i],
+            });
+      }
+   }
+#endif /* GEN_GEN >= 8 */
 }
 
 static uint32_t
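
For reference, a minimal sketch (not part of the patch) of how an application drives the new entry points. It assumes VK_EXT_transform_feedback was enabled at device creation and that the vkCmd*EXT entry points were resolved with vkGetDeviceProcAddr; the function and parameter names here are illustrative.

#include <vulkan/vulkan.h>

static void
record_xfb_draw(VkCommandBuffer cmd,
                VkBuffer xfb_buffer, VkDeviceSize xfb_size,
                PFN_vkCmdBindTransformFeedbackBuffersEXT bind_xfb,
                PFN_vkCmdBeginTransformFeedbackEXT begin_xfb,
                PFN_vkCmdEndTransformFeedbackEXT end_xfb)
{
   const VkDeviceSize offset = 0;

   /* Bind the SO target.  As noted in anv_CmdBindTransformFeedbackBuffersEXT,
    * the driver only records the binding here; 3DSTATE_SO_BUFFER is emitted
    * at draw time once the pipeline (and thus the stride) is known.
    */
   bind_xfb(cmd, 0, 1, &xfb_buffer, &offset, &xfb_size);

   /* NULL counter buffers mean a fresh begin, so the driver zeroes
    * SO_WRITE_OFFSET0 with MI_LOAD_REGISTER_IMM instead of loading it
    * from memory.
    */
   begin_xfb(cmd, 0, 0, NULL, NULL);
   vkCmdDraw(cmd, 3, 1, 0, 0);
   end_xfb(cmd, 0, 0, NULL, NULL);
}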
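
The counter-buffer arms of genX(Cmd{Begin,End}TransformFeedbackEXT) implement pause/resume: End stores SO_WRITE_OFFSETn to the counter buffer with MI_STORE_REGISTER_MEM, and a later Begin with the same buffer loads it back with MI_LOAD_REGISTER_MEM. A hypothetical application-side sequence (again a sketch, with illustrative names):

/* counter_buf must have been created with
 * VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_COUNTER_BUFFER_BIT_EXT.
 */
static void
pause_then_resume(VkCommandBuffer cmd, VkBuffer counter_buf,
                  PFN_vkCmdBeginTransformFeedbackEXT begin_xfb,
                  PFN_vkCmdEndTransformFeedbackEXT end_xfb)
{
   const VkDeviceSize counter_offset = 0;

   /* Pause: the driver saves the current streamout write offset into
    * counter_buf at counter_offset.
    */
   end_xfb(cmd, 0, 1, &counter_buf, &counter_offset);

   /* ... unrelated rendering ... */

   /* Resume: the driver restores the saved offset, so newly captured
    * vertices append after the previously written ones.
    */
   begin_xfb(cmd, 0, 1, &counter_buf, &counter_offset);
}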
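
The SO_DECL "hole" loop in emit_3dstate_streamout splits any gap between consecutive captured varyings into size-4 chunks plus one 1-3 dword remainder. A standalone illustration of the same arithmetic (hypothetical, not driver code): a 7-dword gap yields component masks 0xf and then 0x7.

#include <stdio.h>

int main(void)
{
   /* Mirrors the while (hole_dwords > 0) loop above: each SO_DECL hole
    * covers MIN2(hole_dwords, 4) dwords.
    */
   for (int hole_dwords = 7; hole_dwords > 0; hole_dwords -= 4) {
      unsigned mask = (1u << (hole_dwords < 4 ? hole_dwords : 4)) - 1;
      printf("SO_DECL hole, ComponentMask = 0x%x\n", mask);
   }
   return 0; /* prints 0xf, then 0x7 */
}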