intel/compiler: Do image load/store lowering to NIR

This commit moves our storage image format conversion codegen into NIR
instead of doing it in the back-end.  This has the advantage of letting
us run it through NIR's optimizer which is pretty effective at shrinking
things down.  In the common case of rgba8, the number of instructions
emitted after NIR is done with it is half of what it was with the
lowering happening in the back-end.  On the downside, the back-end's
lowering is able to use predicates directly, whereas the NIR lowering
has to use IFs.
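
For intuition, the rgba8 (UNORM) case boils down to a saturate, a scale,
and a byte-pack per channel.  A schematic C equivalent (illustrative
only, not the literal output of the NIR lowering):

    #include <math.h>
    #include <stdint.h>

    /* Scalar sketch of an rgba8 UNORM store conversion. */
    static uint32_t
    pack_unorm_rgba8(const float color[4])
    {
       uint32_t packed = 0;
       for (int i = 0; i < 4; i++) {
          const float f = fminf(fmaxf(color[i], 0.0f), 1.0f); /* saturate */
          const uint32_t u = (uint32_t)(f * 255.0f + 0.5f);   /* scale and round */
          packed |= u << (8 * i);                             /* pack the byte */
       }
       return packed;
    }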

Shader-db results on Kaby Lake:

    total instructions in shared programs: 15166910 -> 15166872 (<.01%)
    instructions in affected programs: 5895 -> 5857 (-0.64%)
    helped: 15
    HURT: 0

Clearly, we don't have that much image_load_store happening in the
shaders in shader-db....

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Author: Jason Ekstrand
Date:   2018-01-27 13:19:57 -08:00
Parent: b217705dec
Commit: 37f7983bcc

10 changed files with 896 additions and 1120 deletions


@@ -312,6 +312,15 @@ intrinsic("image_deref_atomic_fadd", src_comp=[1, 4, 1, 1], dest_comp=1)
 intrinsic("image_deref_size", src_comp=[1], dest_comp=0, flags=[CAN_ELIMINATE, CAN_REORDER])
 intrinsic("image_deref_samples", src_comp=[1], dest_comp=1, flags=[CAN_ELIMINATE, CAN_REORDER])
+
+# Intel-specific query for loading from the brw_image_param struct passed
+# into the shader as a uniform.  The variable is a deref to the image
+# variable.  The const index specifies which of the six parameters to load.
+intrinsic("image_deref_load_param_intel", src_comp=[1], dest_comp=0,
+          indices=[BASE], flags=[CAN_ELIMINATE, CAN_REORDER])
+intrinsic("image_deref_load_raw_intel", src_comp=[1, 1], dest_comp=0,
+          flags=[CAN_ELIMINATE])
+intrinsic("image_deref_store_raw_intel", src_comp=[1, 1, 0])

 # Vulkan descriptor set intrinsics
 #
 # The Vulkan API uses a different binding model from GL.  In the Vulkan

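For context, the six parameters that image_deref_load_param_intel can
select correspond to the fields of the back-end's brw_image_param push
data.  A sketch of that layout (reconstructed for illustration from the
era's brw_compiler.h; treat the field names and slot assignments as
assumptions rather than part of this diff):

    /* Each field is uploaded into its own vec4 uniform slot, which is
     * why the lowering divides the dword offset by 4 to form the BASE
     * index it puts on the intrinsic.
     */
    struct brw_image_param {
       uint32_t surface_idx;   /* BASE 0: binding table index           */
       uint32_t offset[2];     /* BASE 1: intratile x/y offset          */
       uint32_t size[3];       /* BASE 2: width, height, depth/layers   */
       uint32_t stride[4];     /* BASE 3: Bpp, row and slice pitches    */
       uint32_t tiling[3];     /* BASE 4: log2 tile dimensions          */
       uint32_t swizzling[2];  /* BASE 5: address swizzle shift amounts */
    };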

@@ -84,6 +84,7 @@ COMPILER_FILES = \
 	compiler/brw_nir_analyze_ubo_ranges.c \
 	compiler/brw_nir_attribute_workarounds.c \
 	compiler/brw_nir_lower_cs_intrinsics.c \
+	compiler/brw_nir_lower_image_load_store.c \
 	compiler/brw_nir_opt_peephole_ffma.c \
 	compiler/brw_nir_tcs_workarounds.c \
 	compiler/brw_packed_float.c \


@@ -3865,38 +3865,33 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
    case nir_intrinsic_image_deref_atomic_xor:
    case nir_intrinsic_image_deref_atomic_exchange:
    case nir_intrinsic_image_deref_atomic_comp_swap: {
-      using namespace image_access;
-
       if (stage == MESA_SHADER_FRAGMENT &&
           instr->intrinsic != nir_intrinsic_image_deref_load)
          brw_wm_prog_data(prog_data)->has_side_effects = true;

       /* Get the referenced image variable and type. */
       nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
-      const nir_variable *var = nir_deref_instr_get_variable(deref);
-      const glsl_type *type = var->type->without_array();
-      const brw_reg_type base_type = get_image_base_type(type);
+      const glsl_type *type = deref->type;

       /* Get some metadata from the image intrinsic. */
       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
-      const unsigned arr_dims = type->sampler_array ? 1 : 0;
-      const unsigned surf_dims = type->coordinate_components() - arr_dims;
-      const unsigned format = var->data.image.format;
+      const unsigned dims = type->coordinate_components();
       const unsigned dest_components = nir_intrinsic_dest_components(instr);

       /* Get the arguments of the image intrinsic. */
       const fs_reg image = get_nir_image_deref(deref);
-      const fs_reg addr = retype(get_nir_src(instr->src[1]),
-                                 BRW_REGISTER_TYPE_UD);
+      const fs_reg coords = retype(get_nir_src(instr->src[1]),
+                                   BRW_REGISTER_TYPE_UD);
       fs_reg tmp;

       /* Emit an image load, store or atomic op. */
-      if (instr->intrinsic == nir_intrinsic_image_deref_load)
-         tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format);
-      else if (instr->intrinsic == nir_intrinsic_image_deref_store) {
-         const fs_reg src0 = retype(get_nir_src(instr->src[3]), base_type);
-         emit_image_store(bld, image, addr, src0, surf_dims, arr_dims,
-                          var->data.image.write_only ? GL_NONE : format);
+      if (instr->intrinsic == nir_intrinsic_image_deref_load) {
+         tmp = emit_typed_read(bld, image, coords, dims,
+                               instr->num_components);
+      } else if (instr->intrinsic == nir_intrinsic_image_deref_store) {
+         const fs_reg src0 = get_nir_src(instr->src[3]);
+         emit_typed_write(bld, image, coords, src0, dims,
+                          instr->num_components);
       } else {
          int op;
          unsigned num_srcs = info->num_srcs;
@@ -3938,25 +3933,61 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
         }

         const fs_reg src0 = (num_srcs >= 4 ?
-                             retype(get_nir_src(instr->src[3]), base_type) :
-                             fs_reg());
+                             get_nir_src(instr->src[3]) : fs_reg());
         const fs_reg src1 = (num_srcs >= 5 ?
-                             retype(get_nir_src(instr->src[4]), base_type) :
-                             fs_reg());
+                             get_nir_src(instr->src[4]) : fs_reg());

-        tmp = emit_image_atomic(bld, image, addr, src0, src1,
-                                surf_dims, arr_dims, dest_components,
-                                op);
+        tmp = emit_typed_atomic(bld, image, coords, src0, src1, dims, 1, op);
      }

      /* Assign the result. */
      for (unsigned c = 0; c < dest_components; ++c) {
-        bld.MOV(offset(retype(dest, base_type), bld, c),
-                offset(tmp, bld, c));
+        bld.MOV(offset(retype(dest, tmp.type), bld, c),
+                offset(tmp, bld, c));
      }
      break;
   }

+  case nir_intrinsic_image_deref_load_param_intel: {
+     nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
+     const fs_reg image = get_nir_image_deref(deref);
+     const fs_reg param = offset(image, bld, nir_intrinsic_base(instr) * 4);
+
+     for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) {
+        bld.MOV(offset(retype(dest, param.type), bld, c),
+                offset(param, bld, c));
+     }
+     break;
+  }
+
+  case nir_intrinsic_image_deref_load_raw_intel: {
+     const fs_reg image = get_nir_image_deref(nir_src_as_deref(instr->src[0]));
+     const fs_reg addr = retype(get_nir_src(instr->src[1]),
+                                BRW_REGISTER_TYPE_UD);
+
+     fs_reg tmp = emit_untyped_read(bld, image, addr, 1,
+                                    instr->num_components);
+
+     for (unsigned c = 0; c < instr->num_components; ++c) {
+        bld.MOV(offset(retype(dest, tmp.type), bld, c),
+                offset(tmp, bld, c));
+     }
+     break;
+  }
+
+  case nir_intrinsic_image_deref_store_raw_intel: {
+     const fs_reg image = get_nir_image_deref(nir_src_as_deref(instr->src[0]));
+     const fs_reg addr = retype(get_nir_src(instr->src[1]),
+                                BRW_REGISTER_TYPE_UD);
+     const fs_reg data = retype(get_nir_src(instr->src[2]),
+                                BRW_REGISTER_TYPE_UD);
+
+     brw_wm_prog_data(prog_data)->has_side_effects = true;
+
+     emit_untyped_write(bld, image, addr, data, 1,
+                        instr->num_components);
+     break;
+  }
+
   case nir_intrinsic_group_memory_barrier:
   case nir_intrinsic_memory_barrier_shared:
   case nir_intrinsic_memory_barrier_atomic_counter:
@@ -3979,51 +4010,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
      break;
   }

-  case nir_intrinsic_image_deref_size: {
-     /* Get the referenced image variable and type. */
-     nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
-     const nir_variable *var = nir_deref_instr_get_variable(deref);
-     const glsl_type *type = var->type->without_array();
-
-     /* Get the size of the image. */
-     const fs_reg image = get_nir_image_deref(deref);
-     const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
-
-     /* For 1DArray image types, the array index is stored in the Z component.
-      * Fix this by swizzling the Z component to the Y component.
-      */
-     const bool is_1d_array_image =
-                type->sampler_dimensionality == GLSL_SAMPLER_DIM_1D &&
-                type->sampler_array;
-
-     /* For CubeArray images, we should count the number of cubes instead
-      * of the number of faces.  Fix it by dividing the (Z component) by 6.
-      */
-     const bool is_cube_array_image =
-                type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
-                type->sampler_array;
-
-     /* Copy all the components. */
-     for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) {
-        if ((int)c >= type->coordinate_components()) {
-           bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
-                   brw_imm_d(1));
-        } else if (c == 1 && is_1d_array_image) {
-           bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
-                   offset(size, bld, 2));
-        } else if (c == 2 && is_cube_array_image) {
-           bld.emit(SHADER_OPCODE_INT_QUOTIENT,
-                    offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
-                    offset(size, bld, c), brw_imm_d(6));
-        } else {
-           bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
-                   offset(size, bld, c));
-        }
-     }
-     break;
-  }
-
   case nir_intrinsic_image_deref_samples:
      /* The driver does not support multi-sampled images. */
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));

File diff suppressed because it is too large.


@@ -85,25 +85,5 @@ namespace brw {
                        unsigned bit_size,
                        brw_predicate pred = BRW_PREDICATE_NONE);
    }
-
-   namespace image_access {
-      fs_reg
-      emit_image_load(const fs_builder &bld,
-                      const fs_reg &image, const fs_reg &addr,
-                      unsigned surf_dims, unsigned arr_dims,
-                      unsigned gl_format);
-
-      void
-      emit_image_store(const fs_builder &bld, const fs_reg &image,
-                       const fs_reg &addr, const fs_reg &src,
-                       unsigned surf_dims, unsigned arr_dims,
-                       unsigned gl_format);
-      fs_reg
-      emit_image_atomic(const fs_builder &bld,
-                        const fs_reg &image, const fs_reg &addr,
-                        const fs_reg &src0, const fs_reg &src1,
-                        unsigned surf_dims, unsigned arr_dims,
-                        unsigned rsize, unsigned op);
-   }
 }

 #endif


@@ -114,6 +114,9 @@ void brw_nir_lower_tcs_outputs(nir_shader *nir, const struct brw_vue_map *vue,
                               GLenum tes_primitive_mode);
 void brw_nir_lower_fs_outputs(nir_shader *nir);

+bool brw_nir_lower_image_load_store(nir_shader *nir,
+                                    const struct gen_device_info *devinfo);
+
 nir_shader *brw_postprocess_nir(nir_shader *nir,
                                 const struct brw_compiler *compiler,
                                 bool is_scalar);


@@ -0,0 +1,822 @@
/*
* Copyright © 2018 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "isl/isl.h"
#include "brw_nir.h"
#include "compiler/nir/nir_builder.h"
#include "compiler/nir/nir_format_convert.h"
/* The higher compiler layers use the GL enums for image formats even if
* they come in from SPIR-V or Vulkan. We need to turn them into an ISL
* enum before we can use them.
*/
static enum isl_format
isl_format_for_gl_format(uint32_t gl_format)
{
switch (gl_format) {
case GL_R8: return ISL_FORMAT_R8_UNORM;
case GL_R8_SNORM: return ISL_FORMAT_R8_SNORM;
case GL_R8UI: return ISL_FORMAT_R8_UINT;
case GL_R8I: return ISL_FORMAT_R8_SINT;
case GL_RG8: return ISL_FORMAT_R8G8_UNORM;
case GL_RG8_SNORM: return ISL_FORMAT_R8G8_SNORM;
case GL_RG8UI: return ISL_FORMAT_R8G8_UINT;
case GL_RG8I: return ISL_FORMAT_R8G8_SINT;
case GL_RGBA8: return ISL_FORMAT_R8G8B8A8_UNORM;
case GL_RGBA8_SNORM: return ISL_FORMAT_R8G8B8A8_SNORM;
case GL_RGBA8UI: return ISL_FORMAT_R8G8B8A8_UINT;
case GL_RGBA8I: return ISL_FORMAT_R8G8B8A8_SINT;
case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT;
case GL_RGB10_A2: return ISL_FORMAT_R10G10B10A2_UNORM;
case GL_RGB10_A2UI: return ISL_FORMAT_R10G10B10A2_UINT;
case GL_R16: return ISL_FORMAT_R16_UNORM;
case GL_R16_SNORM: return ISL_FORMAT_R16_SNORM;
case GL_R16F: return ISL_FORMAT_R16_FLOAT;
case GL_R16UI: return ISL_FORMAT_R16_UINT;
case GL_R16I: return ISL_FORMAT_R16_SINT;
case GL_RG16: return ISL_FORMAT_R16G16_UNORM;
case GL_RG16_SNORM: return ISL_FORMAT_R16G16_SNORM;
case GL_RG16F: return ISL_FORMAT_R16G16_FLOAT;
case GL_RG16UI: return ISL_FORMAT_R16G16_UINT;
case GL_RG16I: return ISL_FORMAT_R16G16_SINT;
case GL_RGBA16: return ISL_FORMAT_R16G16B16A16_UNORM;
case GL_RGBA16_SNORM: return ISL_FORMAT_R16G16B16A16_SNORM;
case GL_RGBA16F: return ISL_FORMAT_R16G16B16A16_FLOAT;
case GL_RGBA16UI: return ISL_FORMAT_R16G16B16A16_UINT;
case GL_RGBA16I: return ISL_FORMAT_R16G16B16A16_SINT;
case GL_R32F: return ISL_FORMAT_R32_FLOAT;
case GL_R32UI: return ISL_FORMAT_R32_UINT;
case GL_R32I: return ISL_FORMAT_R32_SINT;
case GL_RG32F: return ISL_FORMAT_R32G32_FLOAT;
case GL_RG32UI: return ISL_FORMAT_R32G32_UINT;
case GL_RG32I: return ISL_FORMAT_R32G32_SINT;
case GL_RGBA32F: return ISL_FORMAT_R32G32B32A32_FLOAT;
case GL_RGBA32UI: return ISL_FORMAT_R32G32B32A32_UINT;
case GL_RGBA32I: return ISL_FORMAT_R32G32B32A32_SINT;
case GL_NONE: return ISL_FORMAT_UNSUPPORTED;
default:
assert(!"Invalid image format");
return ISL_FORMAT_UNSUPPORTED;
}
}
static nir_ssa_def *
_load_image_param(nir_builder *b, nir_deref_instr *deref, unsigned offset)
{
nir_intrinsic_instr *load =
nir_intrinsic_instr_create(b->shader,
nir_intrinsic_image_deref_load_param_intel);
load->src[0] = nir_src_for_ssa(&deref->dest.ssa);
nir_intrinsic_set_base(load, offset / 4);
switch (offset) {
case BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET:
load->num_components = 1;
break;
case BRW_IMAGE_PARAM_OFFSET_OFFSET:
case BRW_IMAGE_PARAM_SWIZZLING_OFFSET:
load->num_components = 2;
break;
case BRW_IMAGE_PARAM_TILING_OFFSET:
case BRW_IMAGE_PARAM_SIZE_OFFSET:
load->num_components = 3;
break;
case BRW_IMAGE_PARAM_STRIDE_OFFSET:
load->num_components = 4;
break;
default:
unreachable("Invalid param offset");
}
nir_ssa_dest_init(&load->instr, &load->dest,
load->num_components, 32, NULL);
nir_builder_instr_insert(b, &load->instr);
return &load->dest.ssa;
}
#define load_image_param(b, d, o) \
_load_image_param(b, d, BRW_IMAGE_PARAM_##o##_OFFSET)
static nir_ssa_def *
sanitize_image_coord(nir_builder *b, nir_deref_instr *deref, nir_ssa_def *coord)
{
if (glsl_get_sampler_dim(deref->type) == GLSL_SAMPLER_DIM_1D &&
glsl_sampler_type_is_array(deref->type)) {
/* It's easier if 1D arrays are treated like 2D arrays */
return nir_vec3(b, nir_channel(b, coord, 0),
nir_imm_int(b, 0),
nir_channel(b, coord, 1));
} else {
unsigned dims = glsl_get_sampler_coordinate_components(deref->type);
return nir_channels(b, coord, (1 << dims) - 1);
}
}
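/* Unlike the typed surface messages, raw untyped access performs no
* hardware bounds checking, so the lowerings below have to predicate
* the access on the coordinate actually being inside the image.
*/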
static nir_ssa_def *
image_coord_is_in_bounds(nir_builder *b, nir_deref_instr *deref,
nir_ssa_def *coord)
{
coord = sanitize_image_coord(b, deref, coord);
nir_ssa_def *size = load_image_param(b, deref, SIZE);
nir_ssa_def *cmp = nir_ilt(b, coord, size);
nir_ssa_def *in_bounds = nir_imm_int(b, NIR_TRUE);
for (unsigned i = 0; i < coord->num_components; i++)
in_bounds = nir_iand(b, in_bounds, nir_channel(b, cmp, i));
return in_bounds;
}
/** Calculate the offset in memory of the texel given by \p coord.
*
* This is meant to be used with untyped surface messages to access a tiled
* surface, which involves manually taking into account the tiling and
* swizzling modes of the surface; it will hopefully not happen very often.
*
* The tiling algorithm implemented here matches either the X or Y tiling
* layouts supported by the hardware depending on the tiling coefficients
* passed to the program as uniforms. See Volume 1 Part 2 Section 4.5
* "Address Tiling Function" of the IVB PRM for an in-depth explanation of
* the hardware tiling format.
*/
static nir_ssa_def *
image_address(nir_builder *b, const struct gen_device_info *devinfo,
nir_deref_instr *deref, nir_ssa_def *coord)
{
coord = sanitize_image_coord(b, deref, coord);
nir_ssa_def *offset = load_image_param(b, deref, OFFSET);
nir_ssa_def *tiling = load_image_param(b, deref, TILING);
nir_ssa_def *stride = load_image_param(b, deref, STRIDE);
/* Shift the coordinates by the fixed surface offset. It may be non-zero
* if the image is a single slice of a higher-dimensional surface, or if a
* non-zero mipmap level of the surface is bound to the pipeline. The
* offset needs to be applied here rather than at surface state set-up time
* because the desired slice-level may start mid-tile, so simply shifting
* the surface base address wouldn't give a well-formed tiled surface in
* the general case.
*/
nir_ssa_def *xypos = (coord->num_components == 1) ?
nir_vec2(b, coord, nir_imm_int(b, 0)) :
nir_channels(b, coord, 0x3);
xypos = nir_iadd(b, xypos, offset);
/* The layout of 3-D textures in memory is sort-of like a tiling
* format. At each miplevel, the slices are arranged in rows of
* 2^level slices per row. The slice row is stored in tmp.y and
* the slice within the row is stored in tmp.x.
*
* The layout of 2-D array textures and cubemaps is much simpler:
* Depending on whether the ARYSPC_LOD0 layout is in use it will be
* stored in memory as an array of slices, each one being a 2-D
* arrangement of miplevels, or as a 2D arrangement of miplevels,
* each one being an array of slices. In either case the separation
* between slices of the same LOD is equal to the qpitch value
* provided as stride.w.
*
* This code can be made to handle both 2-D array and 3-D textures
* by passing in the miplevel as tile.z for 3-D textures and 0 in
* tile.z for 2-D array textures.
*
* See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
* Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
* of the hardware 3D texture and 2D array layouts.
*/
if (coord->num_components > 2) {
/* Decompose z into a major (tmp.y) and a minor (tmp.x)
* index.
*/
nir_ssa_def *z = nir_channel(b, coord, 2);
nir_ssa_def *z_x = nir_ubfe(b, z, nir_imm_int(b, 0),
nir_channel(b, tiling, 2));
nir_ssa_def *z_y = nir_ushr(b, z, nir_channel(b, tiling, 2));
/* Take into account the horizontal (tmp.x) and vertical (tmp.y)
* slice offset.
*/
xypos = nir_iadd(b, xypos, nir_imul(b, nir_vec2(b, z_x, z_y),
nir_channels(b, stride, 0xc)));
}
nir_ssa_def *addr;
if (coord->num_components > 1) {
/* Calculate the major/minor x and y indices. In order to
* accommodate both X and Y tiling, the Y-major tiling format is
* treated as being a bunch of narrow X-tiles placed next to each
* other. This means that the tile width for Y-tiling is actually
* the width of one sub-column of the Y-major tile where each 4K
* tile has 8 512B sub-columns.
*
* The major Y value is the row of tiles in which the pixel lives.
* The major X value is the tile sub-column in which the pixel
* lives; for X tiling, this is the same as the tile column, for Y
* tiling, each tile has 8 sub-columns. The minor X and Y indices
* are the position within the sub-column.
*/
/* Calculate the minor x and y indices. */
nir_ssa_def *minor = nir_ubfe(b, xypos, nir_imm_int(b, 0),
nir_channels(b, tiling, 0x3));
nir_ssa_def *major = nir_ushr(b, xypos, nir_channels(b, tiling, 0x3));
/* Calculate the texel index from the start of the tile row and the
* vertical coordinate of the row.
* Equivalent to:
* tmp.x = (major.x << tile.y << tile.x) +
* (minor.y << tile.x) + minor.x
* tmp.y = major.y << tile.y
*/
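/* For example (illustrative values, not taken from this pass): classic
* X tiling has 512-byte rows and 8 rows per 4KB tile, i.e. tile.x = 9
* and tile.y = 3, while Y-major tiling is treated as 16-byte-wide,
* 32-row sub-columns, i.e. tile.x = 4 and tile.y = 5.
*/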
nir_ssa_def *idx_x, *idx_y;
idx_x = nir_ishl(b, nir_channel(b, major, 0), nir_channel(b, tiling, 1));
idx_x = nir_iadd(b, idx_x, nir_channel(b, minor, 1));
idx_x = nir_ishl(b, idx_x, nir_channel(b, tiling, 0));
idx_x = nir_iadd(b, idx_x, nir_channel(b, minor, 0));
idx_y = nir_ishl(b, nir_channel(b, major, 1), nir_channel(b, tiling, 1));
/* Add it to the start of the tile row. */
nir_ssa_def *idx;
idx = nir_imul(b, idx_y, nir_channel(b, stride, 1));
idx = nir_iadd(b, idx, idx_x);
/* Multiply by the Bpp value. */
addr = nir_imul(b, idx, nir_channel(b, stride, 0));
if (devinfo->gen < 8 && !devinfo->is_baytrail) {
/* Take into account the two dynamically specified shifts. Both are
* used to implement swizzling of X-tiled surfaces. For Y-tiled
* surfaces only one bit needs to be XOR-ed with bit 6 of the memory
* address, so a swz value of 0xff (actually interpreted as 31 by the
* hardware) will be provided to cause the relevant bit of tmp.y to
* be zero and turn the first XOR into the identity. For linear
* surfaces or platforms lacking address swizzling both shifts will
* be 0xff, causing the relevant bits of both tmp.x and .y to be zero,
* which effectively disables swizzling.
*/
nir_ssa_def *swizzle = load_image_param(b, deref, SWIZZLING);
nir_ssa_def *shift0 = nir_ushr(b, addr, nir_channel(b, swizzle, 0));
nir_ssa_def *shift1 = nir_ushr(b, addr, nir_channel(b, swizzle, 1));
/* XOR tmp.x and tmp.y with bit 6 of the memory address. */
nir_ssa_def *bit = nir_iand(b, nir_ixor(b, shift0, shift1),
nir_imm_int(b, 1 << 6));
addr = nir_ixor(b, addr, bit);
}
} else {
/* Multiply by the Bpp/stride value. Note that the addr.y may be
* non-zero even if the image is one-dimensional because a vertical
* offset may have been applied above to select a non-zero slice or
* level of a higher-dimensional texture.
*/
nir_ssa_def *idx;
idx = nir_imul(b, nir_channel(b, xypos, 1), nir_channel(b, stride, 1));
idx = nir_iadd(b, nir_channel(b, xypos, 0), idx);
addr = nir_imul(b, idx, nir_channel(b, stride, 0));
}
return addr;
}
struct format_info {
const struct isl_format_layout *fmtl;
unsigned chans;
unsigned bits[4];
};
static struct format_info
get_format_info(enum isl_format fmt)
{
const struct isl_format_layout *fmtl = isl_format_get_layout(fmt);
return (struct format_info) {
.fmtl = fmtl,
.chans = isl_format_get_num_channels(fmt),
.bits = {
fmtl->channels.r.bits,
fmtl->channels.g.bits,
fmtl->channels.b.bits,
fmtl->channels.a.bits
},
};
}
static nir_ssa_def *
nir_zero_vec(nir_builder *b, unsigned num_components)
{
nir_const_value v;
memset(&v, 0, sizeof(v));
return nir_build_imm(b, num_components, 32, v);
}
static nir_ssa_def *
convert_color_for_load(nir_builder *b, const struct gen_device_info *devinfo,
nir_ssa_def *color,
enum isl_format image_fmt, enum isl_format lower_fmt,
unsigned dest_components)
{
if (image_fmt == lower_fmt)
goto expand_vec;
if (image_fmt == ISL_FORMAT_R11G11B10_FLOAT) {
assert(lower_fmt == ISL_FORMAT_R32_UINT);
color = nir_format_unpack_11f11f10f(b, color);
goto expand_vec;
}
struct format_info image = get_format_info(image_fmt);
struct format_info lower = get_format_info(lower_fmt);
const bool needs_sign_extension =
isl_format_has_snorm_channel(image_fmt) ||
isl_format_has_sint_channel(image_fmt);
/* We only check the red channel to detect if we need to pack/unpack */
assert(image.bits[0] != lower.bits[0] ||
memcmp(image.bits, lower.bits, sizeof(image.bits)) == 0);
if (image.bits[0] != lower.bits[0] && lower_fmt == ISL_FORMAT_R32_UINT) {
if (needs_sign_extension)
color = nir_format_unpack_sint(b, color, image.bits, image.chans);
else
color = nir_format_unpack_uint(b, color, image.bits, image.chans);
} else {
/* All these formats are homogeneous */
for (unsigned i = 1; i < image.chans; i++)
assert(image.bits[i] == image.bits[0]);
/* On IVB, we rely on the undocumented behavior that typed reads from
* surfaces of the unsupported R8 and R16 formats return useful data in
* their least significant bits. However, the data in the high bits is
* garbage so we have to discard it.
*/
if (devinfo->gen == 7 && !devinfo->is_haswell &&
(lower_fmt == ISL_FORMAT_R16_UINT ||
lower_fmt == ISL_FORMAT_R8_UINT))
color = nir_format_mask_uvec(b, color, lower.bits);
if (image.bits[0] != lower.bits[0]) {
color = nir_format_bitcast_uvec_unmasked(b, color, lower.bits[0],
image.bits[0]);
}
if (needs_sign_extension)
color = nir_format_sign_extend_ivec(b, color, image.bits);
}
switch (image.fmtl->channels.r.type) {
case ISL_UNORM:
assert(isl_format_has_uint_channel(lower_fmt));
color = nir_format_unorm_to_float(b, color, image.bits);
break;
case ISL_SNORM:
assert(isl_format_has_uint_channel(lower_fmt));
color = nir_format_snorm_to_float(b, color, image.bits);
break;
case ISL_SFLOAT:
if (image.bits[0] == 16)
color = nir_unpack_half_2x16_split_x(b, color);
break;
case ISL_UINT:
case ISL_SINT:
break;
default:
unreachable("Invalid image channel type");
}
expand_vec:
assert(dest_components == 1 || dest_components == 4);
assert(color->num_components <= dest_components);
if (color->num_components == dest_components)
return color;
nir_ssa_def *comps[4];
for (unsigned i = 0; i < color->num_components; i++)
comps[i] = nir_channel(b, color, i);
for (unsigned i = color->num_components; i < 3; i++)
comps[i] = nir_imm_int(b, 0);
if (color->num_components < 4) {
if (isl_format_has_int_channel(image_fmt))
comps[3] = nir_imm_int(b, 1);
else
comps[3] = nir_imm_float(b, 1);
}
return nir_vec(b, comps, dest_components);
}
static bool
lower_image_load_instr(nir_builder *b,
const struct gen_device_info *devinfo,
nir_intrinsic_instr *intrin)
{
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
nir_variable *var = nir_deref_instr_get_variable(deref);
const enum isl_format image_fmt =
isl_format_for_gl_format(var->data.image.format);
if (isl_has_matching_typed_storage_image_format(devinfo, image_fmt)) {
const enum isl_format lower_fmt =
isl_lower_storage_image_format(devinfo, image_fmt);
const unsigned dest_components = intrin->num_components;
/* Use an undef to hold the uses of the load while we do the color
* conversion.
*/
nir_ssa_def *placeholder = nir_ssa_undef(b, 4, 32);
nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(placeholder));
intrin->num_components = isl_format_get_num_channels(lower_fmt);
intrin->dest.ssa.num_components = intrin->num_components;
b->cursor = nir_after_instr(&intrin->instr);
nir_ssa_def *color = convert_color_for_load(b, devinfo,
&intrin->dest.ssa,
image_fmt, lower_fmt,
dest_components);
nir_ssa_def_rewrite_uses(placeholder, nir_src_for_ssa(color));
nir_instr_remove(placeholder->parent_instr);
} else {
const struct isl_format_layout *image_fmtl =
isl_format_get_layout(image_fmt);
/* We have a matching typed format for everything 32b and below */
assert(image_fmtl->bpb == 64 || image_fmtl->bpb == 128);
enum isl_format raw_fmt = (image_fmtl->bpb == 64) ?
ISL_FORMAT_R32G32_UINT :
ISL_FORMAT_R32G32B32A32_UINT;
const unsigned dest_components = intrin->num_components;
b->cursor = nir_instr_remove(&intrin->instr);
nir_ssa_def *coord = intrin->src[1].ssa;
nir_ssa_def *do_load = image_coord_is_in_bounds(b, deref, coord);
if (devinfo->gen == 7 && !devinfo->is_haswell) {
/* Check whether the first stride component (i.e. the Bpp value)
* is greater than four, which on Gen7 indicates that a surface of
* type RAW has been bound for untyped access. Reading or writing
* to a surface of type other than RAW using untyped surface
* messages causes a hang on IVB and VLV.
*/
nir_ssa_def *stride = load_image_param(b, deref, STRIDE);
nir_ssa_def *is_raw =
nir_ilt(b, nir_imm_int(b, 4), nir_channel(b, stride, 0));
do_load = nir_iand(b, do_load, is_raw);
}
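/* If the access is safe, compute the address and do the raw load;
* otherwise fall through to the else-branch and produce zero, matching
* what a typed message would return for an out-of-bounds read.
*/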
nir_push_if(b, do_load);
nir_ssa_def *addr = image_address(b, devinfo, deref, coord);
nir_intrinsic_instr *load =
nir_intrinsic_instr_create(b->shader,
nir_intrinsic_image_deref_load_raw_intel);
load->src[0] = nir_src_for_ssa(&deref->dest.ssa);
load->src[1] = nir_src_for_ssa(addr);
load->num_components = image_fmtl->bpb / 32;
nir_ssa_dest_init(&load->instr, &load->dest,
load->num_components, 32, NULL);
nir_builder_instr_insert(b, &load->instr);
nir_push_else(b, NULL);
nir_ssa_def *zero = nir_zero_vec(b, load->num_components);
nir_pop_if(b, NULL);
nir_ssa_def *value = nir_if_phi(b, &load->dest.ssa, zero);
nir_ssa_def *color = convert_color_for_load(b, devinfo, value,
image_fmt, raw_fmt,
dest_components);
nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(color));
}
return true;
}
static nir_ssa_def *
convert_color_for_store(nir_builder *b, const struct gen_device_info *devinfo,
nir_ssa_def *color,
enum isl_format image_fmt, enum isl_format lower_fmt)
{
struct format_info image = get_format_info(image_fmt);
struct format_info lower = get_format_info(lower_fmt);
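/* First, drop any source components the image format does not store. */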
color = nir_channels(b, color, (1 << image.chans) - 1);
if (image_fmt == lower_fmt)
return color;
if (image_fmt == ISL_FORMAT_R11G11B10_FLOAT) {
assert(lower_fmt == ISL_FORMAT_R32_UINT);
return nir_format_pack_11f11f10f(b, color);
}
switch (image.fmtl->channels.r.type) {
case ISL_UNORM:
assert(isl_format_has_uint_channel(lower_fmt));
color = nir_format_float_to_unorm(b, color, image.bits);
break;
case ISL_SNORM:
assert(isl_format_has_uint_channel(lower_fmt));
color = nir_format_float_to_snorm(b, color, image.bits);
break;
case ISL_SFLOAT:
if (image.bits[0] == 16) {
nir_ssa_def *f16comps[4];
for (unsigned i = 0; i < image.chans; i++) {
f16comps[i] = nir_pack_half_2x16_split(b, nir_channel(b, color, i),
nir_imm_float(b, 0));
}
color = nir_vec(b, f16comps, image.chans);
}
break;
case ISL_UINT:
if (image.bits[0] < 32) {
nir_const_value max;
for (unsigned i = 0; i < image.chans; i++) {
assert(image.bits[i] < 32);
max.u32[i] = (1u << image.bits[i]) - 1;
}
color = nir_umin(b, color, nir_build_imm(b, image.chans, 32, max));
}
break;
case ISL_SINT:
if (image.bits[0] < 32) {
nir_const_value min, max;
for (unsigned i = 0; i < image.chans; i++) {
assert(image.bits[i] < 32);
max.i32[i] = (1 << (image.bits[i] - 1)) - 1;
min.i32[i] = -(1 << (image.bits[i] - 1));
}
color = nir_imin(b, color, nir_build_imm(b, image.chans, 32, max));
color = nir_imax(b, color, nir_build_imm(b, image.chans, 32, min));
}
break;
default:
unreachable("Invalid image channel type");
}
if (image.bits[0] < 32 &&
(isl_format_has_snorm_channel(image_fmt) ||
isl_format_has_sint_channel(image_fmt)))
color = nir_format_mask_uvec(b, color, image.bits);
if (image.bits[0] != lower.bits[0] && lower_fmt == ISL_FORMAT_R32_UINT) {
color = nir_format_pack_uint(b, color, image.bits, image.chans);
} else {
/* All these formats are homogeneous */
for (unsigned i = 1; i < image.chans; i++)
assert(image.bits[i] == image.bits[0]);
if (image.bits[0] != lower.bits[0]) {
color = nir_format_bitcast_uvec_unmasked(b, color, image.bits[0],
lower.bits[0]);
}
}
return color;
}
static bool
lower_image_store_instr(nir_builder *b,
const struct gen_device_info *devinfo,
nir_intrinsic_instr *intrin)
{
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
nir_variable *var = nir_deref_instr_get_variable(deref);
/* For write-only surfaces, we trust that the hardware can just do the
* conversion for us.
*/
if (var->data.image.write_only)
return false;
const enum isl_format image_fmt =
isl_format_for_gl_format(var->data.image.format);
if (isl_has_matching_typed_storage_image_format(devinfo, image_fmt)) {
const enum isl_format lower_fmt =
isl_lower_storage_image_format(devinfo, image_fmt);
/* Color conversion goes before the store */
b->cursor = nir_before_instr(&intrin->instr);
nir_ssa_def *color = convert_color_for_store(b, devinfo,
intrin->src[3].ssa,
image_fmt, lower_fmt);
intrin->num_components = isl_format_get_num_channels(lower_fmt);
nir_instr_rewrite_src(&intrin->instr, &intrin->src[3],
nir_src_for_ssa(color));
} else {
const struct isl_format_layout *image_fmtl =
isl_format_get_layout(image_fmt);
/* We have a matching typed format for everything 32b and below */
assert(image_fmtl->bpb == 64 || image_fmtl->bpb == 128);
enum isl_format raw_fmt = (image_fmtl->bpb == 64) ?
ISL_FORMAT_R32G32_UINT :
ISL_FORMAT_R32G32B32A32_UINT;
b->cursor = nir_instr_remove(&intrin->instr);
nir_ssa_def *coord = intrin->src[1].ssa;
nir_ssa_def *do_store = image_coord_is_in_bounds(b, deref, coord);
if (devinfo->gen == 7 && !devinfo->is_haswell) {
/* Check whether the first stride component (i.e. the Bpp value)
* is greater than four, which on Gen7 indicates that a surface of
* type RAW has been bound for untyped access. Reading or writing
* to a surface of type other than RAW using untyped surface
* messages causes a hang on IVB and VLV.
*/
nir_ssa_def *stride = load_image_param(b, deref, STRIDE);
nir_ssa_def *is_raw =
nir_ilt(b, nir_imm_int(b, 4), nir_channel(b, stride, 0));
do_store = nir_iand(b, do_store, is_raw);
}
nir_push_if(b, do_store);
nir_ssa_def *addr = image_address(b, devinfo, deref, coord);
nir_ssa_def *color = convert_color_for_store(b, devinfo,
intrin->src[3].ssa,
image_fmt, raw_fmt);
nir_intrinsic_instr *store =
nir_intrinsic_instr_create(b->shader,
nir_intrinsic_image_deref_store_raw_intel);
store->src[0] = nir_src_for_ssa(&deref->dest.ssa);
store->src[1] = nir_src_for_ssa(addr);
store->src[2] = nir_src_for_ssa(color);
store->num_components = image_fmtl->bpb / 32;
nir_builder_instr_insert(b, &store->instr);
nir_pop_if(b, NULL);
}
return true;
}
static bool
lower_image_atomic_instr(nir_builder *b,
const struct gen_device_info *devinfo,
nir_intrinsic_instr *intrin)
{
if (devinfo->is_haswell || devinfo->gen >= 8)
return false;
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
b->cursor = nir_instr_remove(&intrin->instr);
/* Use an undef to hold the uses of the atomic result while we guard it. */
nir_ssa_def *placeholder = nir_ssa_undef(b, 4, 32);
nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(placeholder));
/* Check the first component of the size field to find out if the
* image is bound. Necessary on IVB for typed atomics because
* they don't seem to respect null surfaces and will happily
* corrupt or read random memory when no image is bound.
*/
nir_ssa_def *size = load_image_param(b, deref, SIZE);
nir_ssa_def *zero = nir_imm_int(b, 0);
nir_push_if(b, nir_ine(b, nir_channel(b, size, 0), zero));
nir_builder_instr_insert(b, &intrin->instr);
nir_pop_if(b, NULL);
nir_ssa_def *result = nir_if_phi(b, &intrin->dest.ssa, zero);
nir_ssa_def_rewrite_uses(placeholder, nir_src_for_ssa(result));
return true;
}
static bool
lower_image_size_instr(nir_builder *b,
const struct gen_device_info *devinfo,
nir_intrinsic_instr *intrin)
{
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
b->cursor = nir_instr_remove(&intrin->instr);
nir_ssa_def *size = load_image_param(b, deref, SIZE);
nir_ssa_def *comps[4] = { NULL, NULL, NULL, NULL };
enum glsl_sampler_dim dim = glsl_get_sampler_dim(deref->type);
unsigned coord_comps = glsl_get_sampler_coordinate_components(deref->type);
for (unsigned c = 0; c < coord_comps; c++) {
if (c == 1 && dim == GLSL_SAMPLER_DIM_1D) {
/* The array length for 1D arrays is in .z */
comps[1] = nir_channel(b, size, 2);
} else if (c == 2 && dim == GLSL_SAMPLER_DIM_CUBE) {
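/* Cube image sizes are stored as a count of faces; divide by 6 to
* return the number of whole cubes instead.
*/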
comps[2] = nir_idiv(b, nir_channel(b, size, 2), nir_imm_int(b, 6));
} else {
comps[c] = nir_channel(b, size, c);
}
}
for (unsigned c = coord_comps; c < intrin->dest.ssa.num_components; ++c)
comps[c] = nir_imm_int(b, 1);
nir_ssa_def *vec = nir_vec(b, comps, intrin->dest.ssa.num_components);
nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(vec));
return true;
}
bool
brw_nir_lower_image_load_store(nir_shader *shader,
const struct gen_device_info *devinfo)
{
bool progress = false;
nir_foreach_function(function, shader) {
if (function->impl == NULL)
continue;
nir_foreach_block_safe(block, function->impl) {
nir_builder b;
nir_builder_init(&b, function->impl);
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_image_deref_load:
if (lower_image_load_instr(&b, devinfo, intrin))
progress = true;
break;
case nir_intrinsic_image_deref_store:
if (lower_image_store_instr(&b, devinfo, intrin))
progress = true;
break;
case nir_intrinsic_image_deref_atomic_add:
case nir_intrinsic_image_deref_atomic_min:
case nir_intrinsic_image_deref_atomic_max:
case nir_intrinsic_image_deref_atomic_and:
case nir_intrinsic_image_deref_atomic_or:
case nir_intrinsic_image_deref_atomic_xor:
case nir_intrinsic_image_deref_atomic_exchange:
case nir_intrinsic_image_deref_atomic_comp_swap:
if (lower_image_atomic_instr(&b, devinfo, intrin))
progress = true;
break;
case nir_intrinsic_image_deref_size:
if (lower_image_size_instr(&b, devinfo, intrin))
progress = true;
break;
default:
/* Nothing to do */
break;
}
}
}
nir_metadata_preserve(function->impl, nir_metadata_block_index |
nir_metadata_dominance);
}
return progress;
}


@@ -77,6 +77,7 @@ libintel_compiler_files = files(
   'brw_nir_analyze_ubo_ranges.c',
   'brw_nir_attribute_workarounds.c',
   'brw_nir_lower_cs_intrinsics.c',
+  'brw_nir_lower_image_load_store.c',
   'brw_nir_opt_peephole_ffma.c',
   'brw_nir_tcs_workarounds.c',
   'brw_packed_float.c',


@@ -532,6 +532,8 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline,
    if (nir->info.stage != MESA_SHADER_COMPUTE)
       brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);

+   NIR_PASS_V(nir, brw_nir_lower_image_load_store, compiler->devinfo);
+
    assert(nir->num_uniforms == prog_data->nr_params * 4);

    stage->nir = nir;


@@ -102,6 +102,8 @@ brw_create_nir(struct brw_context *brw,

    nir = brw_preprocess_nir(brw->screen->compiler, nir);

+   NIR_PASS_V(nir, brw_nir_lower_image_load_store, devinfo);
+
    if (stage == MESA_SHADER_TESS_CTRL) {
       /* Lower gl_PatchVerticesIn from a sys. value to a uniform on Gen8+. */
       static const gl_state_index16 tokens[STATE_LENGTH] =