mesa/src/intel/compiler/brw_fs.h


/*
* Copyright © 2010 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Authors:
* Eric Anholt <eric@anholt.net>
*
*/
#ifndef BRW_FS_H
#define BRW_FS_H
#include "brw_shader.h"
#include "brw_ir_fs.h"
#include "brw_fs_builder.h"
#include "brw_fs_live_variables.h"
#include "brw_ir_performance.h"
#include "compiler/nir/nir.h"
struct bblock_t;
namespace {
   struct acp_entry;
}
class fs_visitor;
namespace brw {
   /**
    * Register pressure analysis of a shader.  Estimates how many registers
    * are live at any point of the program in GRF units.
    */
   struct register_pressure {
      register_pressure(const fs_visitor *v);
      ~register_pressure();

      analysis_dependency_class
      dependency_class() const
      {
         return (DEPENDENCY_INSTRUCTION_IDENTITY |
                 DEPENDENCY_INSTRUCTION_DATA_FLOW |
                 DEPENDENCY_VARIABLES);
      }

      bool
      validate(const fs_visitor *) const
      {
         /* FINISHME */
         return true;
      }

      unsigned *regs_live_at_ip;
   };
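
   /*
    * Illustrative sketch (assumed usage, not part of the interface): a
    * back-end pass holding an fs_visitor "v" could scan the per-IP pressure
    * values through the cached analysis object, e.g. to find the peak
    * register pressure of the program:
    *
    *    const brw::register_pressure &rp = v->regpressure_analysis.require();
    *    unsigned peak = 0, ip = 0;
    *    foreach_block_and_inst(block, fs_inst, inst, v->cfg)
    *       peak = MAX2(peak, rp.regs_live_at_ip[ip++]);
    */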
}
struct brw_gs_compile;
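
/**
 * Convenience overload: return a region offset by the given number of
 * logical components of the builder's dispatch width.  For example
 * (illustrative), component i of a multi-component vgrf "vec" built at
 * dispatch width could be addressed as:
 *
 *    fs_reg comp = offset(vec, bld, i);
 */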
static inline fs_reg
offset(const fs_reg &reg, const brw::fs_builder &bld, unsigned delta)
{
   return offset(reg, bld.dispatch_width(), delta);
}
struct shader_stats {
   const char *scheduler_mode;
   unsigned promoted_constants;
   unsigned spill_count;
   unsigned fill_count;
};
/**
 * The fragment shader front-end.
 *
 * Translates NIR into FS IR.  Despite the name, this visitor is also used
 * for the other shader stages handled by the scalar back-end (see the
 * run_*() methods below).
 */
class fs_visitor : public backend_shader
{
public:
fs_visitor(const struct brw_compiler *compiler, void *log_data,
void *mem_ctx,
const brw_base_prog_key *key,
struct brw_stage_prog_data *prog_data,
const nir_shader *shader,
unsigned dispatch_width,
bool debug_enabled);
fs_visitor(const struct brw_compiler *compiler, void *log_data,
void *mem_ctx,
struct brw_gs_compile *gs_compile,
struct brw_gs_prog_data *prog_data,
const nir_shader *shader,
bool debug_enabled);
void init();
~fs_visitor();
fs_reg vgrf(const glsl_type *const type);
void import_uniforms(fs_visitor *v);
void VARYING_PULL_CONSTANT_LOAD(const brw::fs_builder &bld,
const fs_reg &dst,
const fs_reg &surf_index,
const fs_reg &varying_offset,
uint32_t const_offset,
uint8_t alignment);
void DEP_RESOLVE_MOV(const brw::fs_builder &bld, int grf);
bool run_fs(bool allow_spilling, bool do_rep_send);
bool run_vs();
bool run_tcs();
bool run_tes();
bool run_gs();
bool run_cs(bool allow_spilling);
bool run_bs(bool allow_spilling);
bool run_task(bool allow_spilling);
bool run_mesh(bool allow_spilling);
void optimize();
void allocate_registers(bool allow_spilling);
void setup_fs_payload_gfx4();
void setup_fs_payload_gfx6();
void setup_vs_payload();
void setup_gs_payload();
void setup_cs_payload();
bool fixup_sends_duplicate_payload();
void fixup_3src_null_dest();
void emit_dummy_memory_fence_before_eot();
bool fixup_nomask_control_flow();
void assign_curb_setup();
void assign_urb_setup();
void convert_attr_sources_to_hw_regs(fs_inst *inst);
void assign_vs_urb_setup();
void assign_tcs_urb_setup();
void assign_tes_urb_setup();
void assign_gs_urb_setup();
bool assign_regs(bool allow_spilling, bool spill_all);
void assign_regs_trivial();
void calculate_payload_ranges(int payload_node_count,
int *payload_last_use_ip) const;
bool split_virtual_grfs();
bool compact_virtual_grfs();
void assign_constant_locations();
bool get_pull_locs(const fs_reg &src, unsigned *out_surf_index,
unsigned *out_pull_index);
void lower_constant_loads();
virtual void invalidate_analysis(brw::analysis_dependency_class c);
void validate();
bool opt_algebraic();
bool opt_redundant_halt();
bool opt_cse();
bool opt_cse_local(const brw::fs_live_variables &live, bblock_t *block, int &ip);
bool opt_copy_propagation();
bool try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry);
bool try_constant_propagate(fs_inst *inst, acp_entry *entry);
bool opt_copy_propagation_local(void *mem_ctx, bblock_t *block,
exec_list *acp);
bool opt_drop_redundant_mov_to_flags();
bool opt_register_renaming();
bool opt_bank_conflicts();
bool opt_split_sends();
bool register_coalesce();
bool compute_to_mrf();
bool eliminate_find_live_channel();
bool dead_code_eliminate();
bool remove_duplicate_mrf_writes();
bool remove_extra_rounding_modes();
void schedule_instructions(instruction_scheduler_mode mode);
void insert_gfx4_send_dependency_workarounds();
void insert_gfx4_pre_send_dependency_workarounds(bblock_t *block,
fs_inst *inst);
void insert_gfx4_post_send_dependency_workarounds(bblock_t *block,
fs_inst *inst);
void vfail(const char *msg, va_list args);
void fail(const char *msg, ...);
void limit_dispatch_width(unsigned n, const char *msg);
void lower_uniform_pull_constant_loads();
bool lower_load_payload();
bool lower_pack();
bool lower_regioning();
bool lower_logical_sends();
bool lower_integer_multiplication();
bool lower_minmax();
bool lower_simd_width();
bool lower_barycentrics();
bool lower_derivatives();
bool lower_scoreboard();
bool lower_sub_sat();
bool opt_combine_constants();
void emit_dummy_fs();
void emit_repclear_shader();
void emit_fragcoord_interpolation(fs_reg wpos);
void emit_is_helper_invocation(fs_reg result);
fs_reg emit_frontfacing_interpolation();
fs_reg emit_samplepos_setup();
fs_reg emit_sampleid_setup();
fs_reg emit_samplemaskin_setup();
fs_reg emit_shading_rate_setup();
void emit_interpolation_setup_gfx4();
void emit_interpolation_setup_gfx6();
fs_reg emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
const fs_reg &texture,
const fs_reg &texture_handle);
fs_reg resolve_source_modifiers(const fs_reg &src);
void emit_fsign(const class brw::fs_builder &, const nir_alu_instr *instr,
fs_reg result, fs_reg *op, unsigned fsign_src);
void emit_shader_float_controls_execution_mode();
bool opt_peephole_sel();
bool opt_peephole_predicated_break();
bool opt_saturate_propagation();
bool opt_cmod_propagation();
bool opt_zero_samples();
void set_tcs_invocation_id();
void emit_nir_code();
void nir_setup_outputs();
void nir_setup_uniforms();
void nir_emit_system_values();
void nir_emit_impl(nir_function_impl *impl);
void nir_emit_cf_list(exec_list *list);
void nir_emit_if(nir_if *if_stmt);
void nir_emit_loop(nir_loop *loop);
void nir_emit_block(nir_block *block);
void nir_emit_instr(nir_instr *instr);
void nir_emit_alu(const brw::fs_builder &bld, nir_alu_instr *instr,
bool need_dest);
bool try_emit_b2fi_of_inot(const brw::fs_builder &bld, fs_reg result,
nir_alu_instr *instr);
void nir_emit_load_const(const brw::fs_builder &bld,
nir_load_const_instr *instr);
void nir_emit_vs_intrinsic(const brw::fs_builder &bld,
nir_intrinsic_instr *instr);
void nir_emit_tcs_intrinsic(const brw::fs_builder &bld,
nir_intrinsic_instr *instr);
void nir_emit_gs_intrinsic(const brw::fs_builder &bld,
nir_intrinsic_instr *instr);
void nir_emit_fs_intrinsic(const brw::fs_builder &bld,
nir_intrinsic_instr *instr);
void nir_emit_cs_intrinsic(const brw::fs_builder &bld,
nir_intrinsic_instr *instr);
void nir_emit_bs_intrinsic(const brw::fs_builder &bld,
nir_intrinsic_instr *instr);
void nir_emit_task_intrinsic(const brw::fs_builder &bld,
nir_intrinsic_instr *instr);
void nir_emit_mesh_intrinsic(const brw::fs_builder &bld,
nir_intrinsic_instr *instr);
void nir_emit_task_mesh_intrinsic(const brw::fs_builder &bld,
nir_intrinsic_instr *instr);
fs_reg get_nir_image_intrinsic_image(const brw::fs_builder &bld,
nir_intrinsic_instr *instr);
fs_reg get_nir_ssbo_intrinsic_index(const brw::fs_builder &bld,
nir_intrinsic_instr *instr);
fs_reg swizzle_nir_scratch_addr(const brw::fs_builder &bld,
const fs_reg &addr,
bool in_dwords);
void nir_emit_intrinsic(const brw::fs_builder &bld,
nir_intrinsic_instr *instr);
void nir_emit_tes_intrinsic(const brw::fs_builder &bld,
nir_intrinsic_instr *instr);
void nir_emit_ssbo_atomic(const brw::fs_builder &bld,
int op, nir_intrinsic_instr *instr);
void nir_emit_ssbo_atomic_float(const brw::fs_builder &bld,
int op, nir_intrinsic_instr *instr);
void nir_emit_shared_atomic(const brw::fs_builder &bld,
int op, nir_intrinsic_instr *instr);
void nir_emit_shared_atomic_float(const brw::fs_builder &bld,
int op, nir_intrinsic_instr *instr);
void nir_emit_global_atomic(const brw::fs_builder &bld,
int op, nir_intrinsic_instr *instr);
void nir_emit_global_atomic_float(const brw::fs_builder &bld,
int op, nir_intrinsic_instr *instr);
void nir_emit_texture(const brw::fs_builder &bld,
nir_tex_instr *instr);
void nir_emit_jump(const brw::fs_builder &bld,
nir_jump_instr *instr);
fs_reg get_nir_src(const nir_src &src);
fs_reg get_nir_src_imm(const nir_src &src);
fs_reg get_nir_dest(const nir_dest &dest);
fs_reg get_indirect_offset(nir_intrinsic_instr *instr);
fs_reg get_tcs_single_patch_icp_handle(const brw::fs_builder &bld,
nir_intrinsic_instr *instr);
fs_reg get_tcs_eight_patch_icp_handle(const brw::fs_builder &bld,
nir_intrinsic_instr *instr);
struct brw_reg get_tcs_output_urb_handle();
void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst,
unsigned wr_mask);
bool optimize_extract_to_float(nir_alu_instr *instr,
const fs_reg &result);
bool optimize_frontfacing_ternary(nir_alu_instr *instr,
const fs_reg &result);
void emit_alpha_test();
fs_inst *emit_single_fb_write(const brw::fs_builder &bld,
fs_reg color1, fs_reg color2,
fs_reg src0_alpha, unsigned components);
void emit_fb_writes();
fs_inst *emit_non_coherent_fb_read(const brw::fs_builder &bld,
const fs_reg &dst, unsigned target);
void emit_urb_writes(const fs_reg &gs_vertex_count = fs_reg());
void set_gs_stream_control_data_bits(const fs_reg &vertex_count,
unsigned stream_id);
void emit_gs_control_data_bits(const fs_reg &vertex_count);
void emit_gs_end_primitive(const nir_src &vertex_count_nir_src);
void emit_gs_vertex(const nir_src &vertex_count_nir_src,
unsigned stream_id);
void emit_gs_thread_end();
void emit_gs_input_load(const fs_reg &dst, const nir_src &vertex_src,
unsigned base_offset, const nir_src &offset_src,
unsigned num_components, unsigned first_component);
void emit_cs_terminate();
fs_reg emit_work_group_id_setup();
void emit_task_mesh_store(const brw::fs_builder &bld,
nir_intrinsic_instr *instr);
void emit_task_mesh_load(const brw::fs_builder &bld,
nir_intrinsic_instr *instr);
void emit_barrier();
fs_reg get_timestamp(const brw::fs_builder &bld);
fs_reg interp_reg(int location, int channel);
fs_reg per_primitive_reg(int location);
virtual void dump_instructions() const;
virtual void dump_instructions(const char *name) const;
void dump_instruction(const backend_instruction *inst) const;
void dump_instruction(const backend_instruction *inst, FILE *file) const;
const brw_base_prog_key *const key;
const struct brw_sampler_prog_key_data *key_tex;
struct brw_gs_compile *gs_compile;
struct brw_stage_prog_data *prog_data;
brw_analysis<brw::fs_live_variables, backend_shader> live_analysis;
brw_analysis<brw::register_pressure, fs_visitor> regpressure_analysis;
brw_analysis<brw::performance, fs_visitor> performance_analysis;
/** Number of uniform variable components visited. */
unsigned uniforms;
/** Byte-offset for the next available spot in the scratch space buffer. */
unsigned last_scratch;
/**
 * Array mapping UNIFORM register numbers to the push parameter index,
 * or -1 if this uniform register isn't being uploaded as a push constant.
 */
int *push_constant_loc;
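/*
 * Worked example (illustrative): if only uniforms 0 and 2 of a four-uniform
 * shader end up pushed, push_constant_loc would be { 0, -1, 1, -1 }, and
 * uniforms 1 and 3 would be reached through pull constant loads instead.
 */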
fs_reg subgroup_id;
fs_reg group_size[3];
fs_reg scratch_base;
fs_reg frag_depth;
fs_reg frag_stencil;
fs_reg sample_mask;
fs_reg outputs[VARYING_SLOT_MAX];
fs_reg dual_src_output;
int first_non_payload_grf;
/** Either BRW_MAX_GRF or GFX7_MRF_HACK_START */
unsigned max_grf;
fs_reg *nir_locals;
fs_reg *nir_ssa_values;
fs_reg *nir_system_values;
bool failed;
char *fail_msg;
/**
 * Register numbers for thread payload fields.  The two-element arrays hold
 * one register number for each SIMD16 half of a SIMD32 shader (see
 * fetch_payload_reg() below).
 */
struct thread_payload {
uint8_t subspan_coord_reg[2];
uint8_t source_depth_reg[2];
uint8_t source_w_reg[2];
uint8_t aa_dest_stencil_reg[2];
uint8_t dest_depth_reg[2];
uint8_t sample_pos_reg[2];
uint8_t sample_mask_in_reg[2];
uint8_t depth_w_coef_reg[2];
uint8_t barycentric_coord_reg[BRW_BARYCENTRIC_MODE_COUNT][2];
uint8_t local_invocation_id_reg[2];
/** The number of thread payload registers the hardware will supply. */
uint8_t num_regs;
} payload;
bool source_depth_to_render_target;
bool runtime_check_aads_emit;
fs_reg pixel_x;
fs_reg pixel_y;
fs_reg pixel_z;
fs_reg wpos_w;
fs_reg pixel_w;
fs_reg delta_xy[BRW_BARYCENTRIC_MODE_COUNT];
fs_reg shader_start_time;
fs_reg final_gs_vertex_count;
fs_reg control_data_bits;
fs_reg invocation_id;
unsigned grf_used;
bool spilled_any_registers;
const unsigned dispatch_width; /**< 8, 16 or 32 */
unsigned max_dispatch_width;
struct shader_stats shader_stats;
brw::fs_builder bld;
private:
fs_reg prepare_alu_destination_and_sources(const brw::fs_builder &bld,
nir_alu_instr *instr,
fs_reg *op,
bool need_dest);
void resolve_inot_sources(const brw::fs_builder &bld, nir_alu_instr *instr,
fs_reg *op);
void lower_mul_dword_inst(fs_inst *inst, bblock_t *block);
void lower_mul_qword_inst(fs_inst *inst, bblock_t *block);
void lower_mulh_inst(fs_inst *inst, bblock_t *block);
unsigned workgroup_size() const;
};
/**
 * Return the flag register used in fragment shaders to keep track of live
 * samples.  On Gfx7+ we use f1.0-f1.1 to allow discard jumps in SIMD32
 * dispatch mode, while earlier generations are constrained to f0.1, which
 * limits the dispatch width to SIMD16 for fragment shaders that use discard.
 */
static inline unsigned
sample_mask_flag_subreg(const fs_visitor *shader)
{
   assert(shader->stage == MESA_SHADER_FRAGMENT);
   return shader->devinfo->ver >= 7 ? 2 : 1;
}
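
/*
 * The return value is a linear flag-subregister index, so 1 corresponds to
 * f0.1 and 2 to the f1.0-f1.1 pair described above.  Illustrative sketch
 * (assumed usage): a pass wanting the live sample mask as a register would
 * normally go through the brw_sample_mask_reg() helper declared at the end
 * of this file rather than using this index directly:
 *
 *    const fs_reg mask = brw_sample_mask_reg(bld);
 */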
/**
 * The fragment shader code generator.
 *
 * Translates FS IR to actual i965 assembly code.
 */
class fs_generator
{
public:
fs_generator(const struct brw_compiler *compiler, void *log_data,
void *mem_ctx,
struct brw_stage_prog_data *prog_data,
bool runtime_check_aads_emit,
gl_shader_stage stage);
~fs_generator();
void enable_debug(const char *shader_name);
int generate_code(const cfg_t *cfg, int dispatch_width,
struct shader_stats shader_stats,
const brw::performance &perf,
struct brw_compile_stats *stats);
void add_const_data(void *data, unsigned size);
void add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt);
const unsigned *get_assembly();
private:
void fire_fb_write(fs_inst *inst,
struct brw_reg payload,
struct brw_reg implied_header,
GLuint nr);
void generate_send(fs_inst *inst,
struct brw_reg dst,
struct brw_reg desc,
struct brw_reg ex_desc,
struct brw_reg payload,
struct brw_reg payload2);
void generate_fb_write(fs_inst *inst, struct brw_reg payload);
void generate_fb_read(fs_inst *inst, struct brw_reg dst,
struct brw_reg payload);
void generate_urb_read(fs_inst *inst, struct brw_reg dst, struct brw_reg payload);
void generate_urb_write(fs_inst *inst, struct brw_reg payload);
void generate_cs_terminate(fs_inst *inst, struct brw_reg payload);
void generate_barrier(fs_inst *inst, struct brw_reg src);
bool generate_linterp(fs_inst *inst, struct brw_reg dst,
struct brw_reg *src);
void generate_tex(fs_inst *inst, struct brw_reg dst,
struct brw_reg surface_index,
struct brw_reg sampler_index);
void generate_get_buffer_size(fs_inst *inst, struct brw_reg dst,
struct brw_reg src,
struct brw_reg surf_index);
void generate_ddx(const fs_inst *inst,
struct brw_reg dst, struct brw_reg src);
void generate_ddy(const fs_inst *inst,
struct brw_reg dst, struct brw_reg src);
void generate_scratch_write(fs_inst *inst, struct brw_reg src);
void generate_scratch_read(fs_inst *inst, struct brw_reg dst);
void generate_scratch_read_gfx7(fs_inst *inst, struct brw_reg dst);
void generate_scratch_header(fs_inst *inst, struct brw_reg dst);
void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst,
struct brw_reg index,
struct brw_reg offset);
void generate_uniform_pull_constant_load_gfx7(fs_inst *inst,
struct brw_reg dst,
struct brw_reg surf_index,
struct brw_reg payload);
void generate_varying_pull_constant_load_gfx4(fs_inst *inst,
struct brw_reg dst,
struct brw_reg index);
void generate_mov_dispatch_to_flags(fs_inst *inst);
void generate_pixel_interpolator_query(fs_inst *inst,
struct brw_reg dst,
struct brw_reg src,
struct brw_reg msg_data,
unsigned msg_type);
void generate_set_sample_id(fs_inst *inst,
struct brw_reg dst,
struct brw_reg src0,
struct brw_reg src1);
void generate_halt(fs_inst *inst);
void generate_pack_half_2x16_split(fs_inst *inst,
struct brw_reg dst,
struct brw_reg x,
struct brw_reg y);
void generate_mov_indirect(fs_inst *inst,
struct brw_reg dst,
struct brw_reg reg,
struct brw_reg indirect_byte_offset);
void generate_shuffle(fs_inst *inst,
struct brw_reg dst,
struct brw_reg src,
struct brw_reg idx);
void generate_quad_swizzle(const fs_inst *inst,
struct brw_reg dst, struct brw_reg src,
unsigned swiz);
bool patch_halt_jumps();
const struct brw_compiler *compiler;
void *log_data; /* Passed to compiler->*_log functions */
const struct intel_device_info *devinfo;
struct brw_codegen *p;
struct brw_stage_prog_data * const prog_data;
unsigned dispatch_width; /**< 8, 16 or 32 */
exec_list discard_halt_patches;
bool runtime_check_aads_emit;
bool debug_flag;
const char *shader_name;
gl_shader_stage stage;
void *mem_ctx;
};
namespace brw {
   inline fs_reg
   fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
                     brw_reg_type type = BRW_REGISTER_TYPE_F)
   {
      if (!regs[0])
         return fs_reg();

      if (bld.dispatch_width() > 16) {
         const fs_reg tmp = bld.vgrf(type);
         const brw::fs_builder hbld = bld.exec_all().group(16, 0);
         const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
         fs_reg components[2];
         assert(m <= 2);

         for (unsigned g = 0; g < m; g++)
            components[g] = retype(brw_vec8_grf(regs[g], 0), type);

         hbld.LOAD_PAYLOAD(tmp, components, m, 0);

         return tmp;
      } else {
         return fs_reg(retype(brw_vec8_grf(regs[0], 0), type));
      }
   }
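
   /*
    * Illustrative sketch (assumed usage): loading a two-register payload
    * field such as the sample position, using the register numbers recorded
    * in the thread_payload struct above:
    *
    *    const fs_reg pos = fetch_payload_reg(bld, v->payload.sample_pos_reg,
    *                                         BRW_REGISTER_TYPE_W);
    */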

   inline fs_reg
   fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2])
   {
      if (!regs[0])
         return fs_reg();

      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
      const brw::fs_builder hbld = bld.exec_all().group(8, 0);
      const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
      fs_reg *const components = new fs_reg[2 * m];

      for (unsigned c = 0; c < 2; c++) {
         for (unsigned g = 0; g < m; g++)
            components[c * m + g] = offset(brw_vec8_grf(regs[g / 2], 0),
                                           hbld, c + 2 * (g % 2));
      }

      hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0);
      delete[] components;

      return tmp;
   }

   bool
   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i);
}
void shuffle_from_32bit_read(const brw::fs_builder &bld,
const fs_reg &dst,
const fs_reg &src,
uint32_t first_component,
uint32_t components);
fs_reg setup_imm_df(const brw::fs_builder &bld,
double v);
fs_reg setup_imm_b(const brw::fs_builder &bld,
int8_t v);
fs_reg setup_imm_ub(const brw::fs_builder &bld,
uint8_t v);
enum brw_barycentric_mode brw_barycentric_mode(nir_intrinsic_instr *intr);
uint32_t brw_fb_write_msg_control(const fs_inst *inst,
const struct brw_wm_prog_data *prog_data);
void brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data);
void brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width);
namespace brw {
   class fs_builder;
}
fs_reg brw_sample_mask_reg(const brw::fs_builder &bld);
void brw_emit_predicate_on_sample_mask(const brw::fs_builder &bld, fs_inst *inst);
#endif /* BRW_FS_H */