nv50/ir: add surface op lowering
This handles BUFQ, SUQ, as well as all the various texture types and formats, driven by data supplied by the driver (and shader itself). TODO: - 2d linear surfaces - format via key for writeonly These will be included in a later change. ES3.1 doesn't require writeonly, and it's very hard to generate a 2d linear surface. Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu> Acked-by: Pierre Moreau <dev@pmoreau.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10164>
This commit is contained in:
parent
67f98497af
commit
6b1a526ac5
|
@ -25,6 +25,24 @@
|
|||
|
||||
#include "codegen/nv50_ir_target_nv50.h"
|
||||
|
||||
// Layout of the per-surface info record that the driver uploads into its
// auxiliary constant buffer. Each bound surface slot gets one record of
// NV50_SU_INFO__STRIDE bytes at bufInfoBase + slot * NV50_SU_INFO__STRIDE
// (see loadSuInfo/loadSuInfo16); the offsets below address fields inside
// that record.
#define NV50_SU_INFO_SIZE_X   0x00  // width (or buffer size in bytes)
#define NV50_SU_INFO_SIZE_Y   0x04  // height
#define NV50_SU_INFO_SIZE_Z   0x08  // depth / layer (or face) count
#define NV50_SU_INFO_BSIZE    0x0c  // bytes per texel
#define NV50_SU_INFO_STRIDE_Y 0x10  // aligned y stride (y_tile_size * y_tiles)
#define NV50_SU_INFO_MS_X     0x18  // x sample shift (log2) for MS surfaces
#define NV50_SU_INFO_MS_Y     0x1c  // y sample shift (log2) for MS surfaces
#define NV50_SU_INFO_TILE_SHIFT_X 0x20  // log2 tile dimensions of the
#define NV50_SU_INFO_TILE_SHIFT_Y 0x24  // underlying surface
#define NV50_SU_INFO_TILE_SHIFT_Z 0x28
#define NV50_SU_INFO_OFFSET_Z 0x2c  // z offset when a 3d slice is bound as 2d

#define NV50_SU_INFO__STRIDE  0x30  // total record size per surface slot

// Indexed accessors for the per-dimension fields above.
#define NV50_SU_INFO_SIZE(i) (0x00 + (i) * 4)
#define NV50_SU_INFO_MS(i) (0x18 + (i) * 4)
#define NV50_SU_INFO_TILE_SHIFT(i) (0x20 + (i) * 4)
|
||||
|
||||
namespace nv50_ir {
|
||||
|
||||
// nv50 doesn't support 32 bit integer multiplication
|
||||
|
@ -215,6 +233,8 @@ private:
|
|||
void handlePRERET(FlowInstruction *);
|
||||
void replaceZero(Instruction *);
|
||||
|
||||
BuildUtil bld;
|
||||
|
||||
LValue *r63;
|
||||
};
|
||||
|
||||
|
@ -627,6 +647,10 @@ private:
|
|||
bool handleEXPORT(Instruction *);
|
||||
bool handleLOAD(Instruction *);
|
||||
bool handleLDST(Instruction *);
|
||||
bool handleSULDP(TexInstruction *);
|
||||
bool handleSUREDP(TexInstruction *);
|
||||
bool handleSUSTP(TexInstruction *);
|
||||
Value *processSurfaceCoords(TexInstruction *);
|
||||
|
||||
bool handleDIV(Instruction *);
|
||||
bool handleSQRT(Instruction *);
|
||||
|
@ -642,6 +666,8 @@ private:
|
|||
bool handleTXD(TexInstruction *); // these 3
|
||||
bool handleTXLQ(TexInstruction *);
|
||||
bool handleTXQ(TexInstruction *);
|
||||
bool handleSUQ(TexInstruction *);
|
||||
bool handleBUFQ(Instruction *);
|
||||
|
||||
bool handleCALL(Instruction *);
|
||||
bool handlePRECONT(Instruction *);
|
||||
|
@ -650,6 +676,8 @@ private:
|
|||
void checkPredicate(Instruction *);
|
||||
void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y);
|
||||
void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy);
|
||||
Value *loadSuInfo(int slot, uint32_t off);
|
||||
Value *loadSuInfo16(int slot, uint32_t off);
|
||||
|
||||
private:
|
||||
const Target *const targ;
|
||||
|
@ -724,6 +752,24 @@ void NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy)
|
|||
prog->driver->io.msInfoBase + 4), off);
|
||||
}
|
||||
|
||||
// Load one 32-bit field of a surface's info record from the driver's
// auxiliary constant buffer.
Value *
NV50LoweringPreSSA::loadSuInfo(int slot, uint32_t off)
{
   const uint8_t fileIdx = prog->driver->io.auxCBSlot;
   // One NV50_SU_INFO__STRIDE-sized record per surface slot.
   const uint32_t addr =
      prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE + off;
   return bld.mkLoadv(TYPE_U32,
                      bld.mkSymbol(FILE_MEMORY_CONST, fileIdx, TYPE_U32, addr),
                      NULL);
}
|
||||
|
||||
// Same as loadSuInfo, but loads the field as a 16-bit value, for use with
// the 16-bit address math in processSurfaceCoords.
Value *
NV50LoweringPreSSA::loadSuInfo16(int slot, uint32_t off)
{
   const uint8_t fileIdx = prog->driver->io.auxCBSlot;
   const uint32_t addr =
      prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE + off;
   return bld.mkLoadv(TYPE_U16,
                      bld.mkSymbol(FILE_MEMORY_CONST, fileIdx, TYPE_U16, addr),
                      NULL);
}
|
||||
|
||||
bool
|
||||
NV50LoweringPreSSA::handleTEX(TexInstruction *i)
|
||||
{
|
||||
|
@ -1064,6 +1110,56 @@ NV50LoweringPreSSA::handleTXQ(TexInstruction *i)
|
|||
return true;
|
||||
}
|
||||
|
||||
// Lower OP_SUQ (image size query) into constant-buffer loads of the
// per-surface info the driver uploads. Writes one def per set bit in the
// tex mask: the requested size components, then (optionally) the sample
// count.
bool
NV50LoweringPreSSA::handleSUQ(TexInstruction *suq)
{
   const int dim = suq->tex.target.getDim();
   // Number of "size-like" components: spatial dims plus the array/cube index.
   const int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());
   int mask = suq->tex.mask;
   // Surface info records are indexed at tex.r + 7, consistent with the
   // other surface handlers in this pass.
   int slot = suq->tex.r + 7;
   int c, d;

   // c walks query components, d walks the (densely packed) defs.
   for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {
      if (c >= arg || !(mask & 1))
         continue;

      int offset;

      if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {
         // For a 1d array, the layer count is stored in the "z" size field.
         offset = NV50_SU_INFO_SIZE(2);
      } else {
         offset = NV50_SU_INFO_SIZE(c);
      }
      bld.mkMov(suq->getDef(d++), loadSuInfo(slot, offset));
      if (c == 2 && suq->tex.target.isCube())
         // The stored layer count is in faces; SUQ reports whole cubes.
         bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),
                   bld.loadImm(NULL, 6));
   }

   // After the size components, the remaining mask bit selects the sample
   // count.
   if (mask & 1) {
      if (suq->tex.target.isMS()) {
         // MS_X/MS_Y hold per-dimension log2 sample factors (they are used
         // as coordinate shifts in processSurfaceCoords), so the total
         // sample count is 1 << (ms_x + ms_y).
         Value *ms_x = loadSuInfo(slot, NV50_SU_INFO_MS(0));
         Value *ms_y = loadSuInfo(slot, NV50_SU_INFO_MS(1));
         Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);
         bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);
      } else {
         // Non-MS surfaces always have exactly one sample.
         bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));
      }
   }

   bld.remove(suq);
   return true;
}
|
||||
|
||||
bool
|
||||
NV50LoweringPreSSA::handleBUFQ(Instruction *bufq)
|
||||
{
|
||||
bufq->op = OP_MOV;
|
||||
bufq->setSrc(0, loadSuInfo(bufq->getSrc(0)->reg.fileIndex, NV50_SU_INFO_SIZE_X));
|
||||
bufq->setIndirect(0, 0, NULL);
|
||||
bufq->setIndirect(0, 1, NULL);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
NV50LoweringPreSSA::handleSET(Instruction *i)
|
||||
|
@ -1407,6 +1503,478 @@ NV50LoweringPreSSA::handleLDST(Instruction *i)
|
|||
return true;
|
||||
}
|
||||
|
||||
// The type that bests represents how each component can be stored when packed.
|
||||
// The type that bests represents how each component can be stored when
// packed into memory: the narrowest register type that holds bits[c] of
// the given format type.
static DataType
getPackedType(const TexInstruction::ImgFormatDesc *t, int c)
{
   const unsigned bits = t->bits[c];

   switch (t->type) {
   case FLOAT:
      return bits == 16 ? TYPE_F16 : TYPE_F32;
   case UNORM:
      return bits == 8 ? TYPE_U8 : TYPE_U16;
   case SNORM:
      return bits == 8 ? TYPE_S8 : TYPE_S16;
   case UINT:
      if (bits == 8)
         return TYPE_U8;
      return bits <= 16 ? TYPE_U16 : TYPE_U32;
   case SINT:
      if (bits == 8)
         return TYPE_S8;
      return bits <= 16 ? TYPE_S16 : TYPE_S32;
   }
   return TYPE_NONE;
}
|
||||
|
||||
// The type that the rest of the shader expects to process this image type in.
|
||||
// The type that the rest of the shader expects to process this image type
// in: normalized and float formats are handled as F32, integers keep their
// signedness at 32 bits.
static DataType
getShaderType(const ImgType type) {
   if (type == UINT)
      return TYPE_U32;
   if (type == SINT)
      return TYPE_S32;
   if (type == FLOAT || type == UNORM || type == SNORM)
      return TYPE_F32;
   assert(!"Impossible type");
   return TYPE_NONE;
}
|
||||
|
||||
// Reads the raw coordinates out of the input instruction, and returns a
// single-value coordinate which is what the hardware expects to receive in a
// ld/st op.
//
// The result is a 32-bit merge of a 16-bit x byte offset (low half) and a
// 16-bit y coordinate (high half); all the address math below is therefore
// done in 16-bit ops where possible.
Value *
NV50LoweringPreSSA::processSurfaceCoords(TexInstruction *su)
{
   const int slot = su->tex.r + 7;
   const int dim = su->tex.target.getDim();
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());

   const TexInstruction::ImgFormatDesc *format = su->tex.format;
   // Bytes per texel; shift is its log2 (formats here are assumed
   // power-of-two sized, so ffs of the byte count gives the exponent).
   const uint16_t bytes = (format->bits[0] + format->bits[1] +
                           format->bits[2] + format->bits[3]) / 8;
   uint16_t shift = ffs(bytes) - 1;

   // Buffer sizes don't necessarily fit in 16-bit values
   if (su->tex.target == TEX_TARGET_BUFFER) {
      return bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                        su->getSrc(0), bld.loadImm(NULL, (uint32_t)shift));
   }

   // For buffers, we just need the byte offset. And for 2d buffers we want
   // the x coordinate in bytes as well.
   // Extract the low 16 bits of each coordinate source.
   Value *coords[3] = {};
   for (int i = 0; i < arg; i++) {
      Value *src[2];
      bld.mkSplit(src, 2, su->getSrc(i));
      coords[i] = src[0];
      // For 1d-images, we want the y coord to be 0, which it will be here.
      if (i == 0)
         coords[1] = src[1];
   }

   // Convert x from texels to bytes.
   coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
                          coords[0], bld.loadImm(NULL, shift));

   // Scale x/y up by the per-dimension sample factors for MS surfaces.
   if (su->tex.target.isMS()) {
      Value *ms_x = loadSuInfo16(slot, NV50_SU_INFO_MS(0));
      Value *ms_y = loadSuInfo16(slot, NV50_SU_INFO_MS(1));
      coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[0], ms_x);
      coords[1] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[1], ms_y);
   }

   // If there are more dimensions, we just want the y-offset. But that needs
   // to be adjusted up by the y-stride for array images.
   if (su->tex.target.isArray() || su->tex.target.isCube()) {
      // The array index / cube face is the coordinate after the spatial dims.
      Value *index = coords[dim];
      Value *height = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y);
      // 16x16 -> 32 multiply (sType U16); only the low half is needed.
      Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4), index, height);
      mul->sType = TYPE_U16;
      Value *muls[2];
      bld.mkSplit(muls, 2, mul->getDef(0));
      if (dim > 1)
         coords[1] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), coords[1], muls[0]);
      else
         coords[1] = muls[0];
   }

   // 3d is special-cased. Note that a single "slice" of a 3d image may
   // also be attached as 2d, so we have to do the same 3d processing for
   // 2d as well, just in case. In order to remap a 3d image onto a 2d
   // image, we have to retile it "by hand".
   if (su->tex.target == TEX_TARGET_3D || su->tex.target == TEX_TARGET_2D) {
      Value *z = loadSuInfo16(slot, NV50_SU_INFO_OFFSET_Z);
      Value *y_size_aligned = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y);
      // Add the z coordinate for actual 3d-images
      if (dim > 2)
         coords[2] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), z, coords[2]);
      else
         coords[2] = z;

      // Compute the surface parameters from tile shifts
      Value *tile_shift[3];
      Value *tile_size[3];
      Value *tile_mask[3];
      // We only ever use one kind of X-tiling.
      tile_shift[0] = bld.loadImm(NULL, (uint16_t)6);
      tile_size[0] = bld.loadImm(NULL, (uint16_t)64);
      tile_mask[0] = bld.loadImm(NULL, (uint16_t)63);
      // Fetch the "real" tiling parameters of the underlying surface
      for (int i = 1; i < 3; i++) {
         tile_shift[i] = loadSuInfo16(slot, NV50_SU_INFO_TILE_SHIFT(i));
         tile_size[i] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), bld.loadImm(NULL, (uint16_t)1), tile_shift[i]);
         // mask = size - 1 (sizes are powers of two).
         tile_mask[i] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), tile_size[i], bld.loadImm(NULL, (uint16_t)-1));
      }

      // Compute the location of given coordinate, both inside the tile as
      // well as which (linearly-laid out) tile it's in.
      Value *coord_in_tile[3];
      Value *tile[3];
      for (int i = 0; i < 3; i++) {
         coord_in_tile[i] = bld.mkOp2v(OP_AND, TYPE_U16, bld.getSSA(2), coords[i], tile_mask[i]);
         tile[i] = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), coords[i], tile_shift[i]);
      }

      // Based on the "real" tiling parameters, compute x/y coordinates in the
      // larger surface with 2d tiling that was supplied to the hardware. This
      // was determined and verified with the help of the tiling pseudocode in
      // the envytools docs.
      //
      // adj_x = x_coord_in_tile + x_tile * x_tile_size * z_tile_size +
      //         z_coord_in_tile * x_tile_size
      // adj_y = y_coord_in_tile + y_tile * y_tile_size +
      //         z_tile * y_tile_size * y_tiles
      //
      // Note: STRIDE_Y = y_tile_size * y_tiles

      // The multiplies by power-of-two tile sizes are done as shifts by the
      // (sums of) tile shifts.
      coords[0] = bld.mkOp2v(
         OP_ADD, TYPE_U16, bld.getSSA(2),
         bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
                    coord_in_tile[0],
                    bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
                               tile[0],
                               bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
                                          tile_shift[2], tile_shift[0]))),
         bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
                    coord_in_tile[2], tile_shift[0]));

      // z_tile * STRIDE_Y is not a power of two, so use a real multiply.
      Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4),
                                   tile[2], y_size_aligned);
      mul->sType = TYPE_U16;
      Value *muls[2];
      bld.mkSplit(muls, 2, mul->getDef(0));

      coords[1] = bld.mkOp2v(
         OP_ADD, TYPE_U16, bld.getSSA(2),
         muls[0],
         bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
                    coord_in_tile[1],
                    bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
                               tile[1], tile_shift[1])));
   }

   // Pack x (low) and y (high) into the single 32-bit address the ld/st
   // lowering expects.
   return bld.mkOp2v(OP_MERGE, TYPE_U32, bld.getSSA(), coords[0], coords[1]);
}
|
||||
|
||||
// This is largely a copy of NVC0LoweringPass::convertSurfaceFormat, but
// adjusted to make use of 16-bit math where possible.
//
// Lowers OP_SULDP: emits an untyped global load of the texel, then unpacks
// each format component into the instruction's defs in the type the shader
// expects (see getShaderType).
bool
NV50LoweringPreSSA::handleSULDP(TexInstruction *su)
{
   const int slot = su->tex.r + 7;
   assert(!su->getIndirectR());

   bld.setPosition(su, false);

   const TexInstruction::ImgFormatDesc *format = su->tex.format;
   // Total texel size in bytes decides the width of the raw load.
   const int bytes = (su->tex.format->bits[0] +
                      su->tex.format->bits[1] +
                      su->tex.format->bits[2] +
                      su->tex.format->bits[3]) / 8;
   DataType ty = typeOfSize(bytes);

   Value *coord = processSurfaceCoords(su);

   // untypedDst: raw 32-bit words as loaded; typedDst: the unpacked results.
   Value *untypedDst[4] = {};
   Value *typedDst[4] = {};
   int i;
   for (i = 0; i < bytes / 4; i++)
      untypedDst[i] = bld.getSSA();
   // Sub-dword formats still need one raw destination.
   if (bytes < 4)
      untypedDst[0] = bld.getSSA();

   for (i = 0; i < 4; i++)
      typedDst[i] = su->getDef(i);

   Instruction *load = bld.mkLoad(ty, NULL, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, ty, 0), coord);
   for (i = 0; i < 4 && untypedDst[i]; i++)
      load->setDef(i, untypedDst[i]);

   // Unpack each component into the typed dsts
   // bits tracks the bit offset of component i within the packed texel.
   int bits = 0;
   for (int i = 0; i < 4; bits += format->bits[i], i++) {
      if (!typedDst[i])
         continue;

      // Components beyond what the format stores read as 0 (or 1 for alpha).
      if (i >= format->components) {
         if (format->type == FLOAT ||
             format->type == UNORM ||
             format->type == SNORM)
            bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);
         else
            bld.loadImm(typedDst[i], i == 3 ? 1 : 0);
         continue;
      }

      // Get just that component's data into the relevant place
      if (format->bits[i] == 32)
         bld.mkMov(typedDst[i], untypedDst[i]);
      else if (format->bits[i] == 16) {
         // We can always convert directly from the appropriate half of the
         // loaded value into the typed result.
         Value *src[2];
         bld.mkSplit(src, 2, untypedDst[i / 2]);
         bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i],
                   getPackedType(format, i), src[i & 1]);
      }
      else if (format->bits[i] == 8) {
         // Same approach as for 16 bits, but we have to massage the value a
         // bit more, since we have to get the appropriate 8 bits from the
         // half-register. In all cases, we can CVT from a 8-bit source, so we
         // only have to shift when we want the upper 8 bits.
         Value *src[2], *shifted;
         bld.mkSplit(src, 2, untypedDst[0]);
         DataType packedType = getPackedType(format, i);
         if (i & 1)
            shifted = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), src[!!(i & 2)], bld.loadImm(NULL, (uint16_t)8));
         else
            shifted = src[!!(i & 2)];

         bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i],
                   packedType, shifted);
      }
      else {
         // The options are 10, 11, and 2. Get it into a 32-bit reg, then
         // shift/mask. That's where it'll have to end up anyways. For signed,
         // we have to make sure to get sign-extension, so we actually have to
         // shift *up* first, and then shift down. There's no advantage to
         // AND'ing, so we don't.
         DataType ty = TYPE_U32;
         if (format->type == SNORM || format->type == SINT) {
            ty = TYPE_S32;
         }

         // Poor man's EXTBF
         bld.mkOp2(
            OP_SHR, ty, typedDst[i],
            bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), untypedDst[0], bld.loadImm(NULL, 32 - bits - format->bits[i])),
            bld.loadImm(NULL, 32 - format->bits[i]));

         // If the stored data is already in the appropriate type, we don't
         // have to do anything. Convert to float for the *NORM formats.
         if (format->type == UNORM || format->type == SNORM)
            bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_U32, typedDst[i]);
      }

      // Normalize / convert as necessary
      if (format->type == UNORM)
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));
      else if (format->type == SNORM)
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));
      else if (format->type == FLOAT && format->bits[i] < 16) {
         // We expect the value to be in the low bits of the register, so we
         // have to shift back up.
         // Shifting by (15 - bits) places the small-float payload where an
         // F16 conversion can pick it up.
         bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));
         Value *src[2];
         bld.mkSplit(src, 2, typedDst[i]);
         bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, src[0]);
      }
   }

   // BGRA formats store with red/blue swapped relative to the shader view.
   if (format->bgra) {
      std::swap(typedDst[0], typedDst[2]);
   }

   bld.getBB()->remove(su);
   return true;
}
|
||||
|
||||
bool
|
||||
NV50LoweringPreSSA::handleSUREDP(TexInstruction *su)
|
||||
{
|
||||
const int slot = su->tex.r + 7;
|
||||
const int dim = su->tex.target.getDim();
|
||||
const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
|
||||
assert(!su->getIndirectR());
|
||||
|
||||
bld.setPosition(su, false);
|
||||
|
||||
Value *coord = processSurfaceCoords(su);
|
||||
|
||||
// This is guaranteed to be a 32-bit format. So there's nothing to
|
||||
// pack/unpack.
|
||||
Instruction *atom = bld.mkOp2(
|
||||
OP_ATOM, su->dType, su->getDef(0),
|
||||
bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, TYPE_U32, 0), su->getSrc(arg));
|
||||
if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
|
||||
atom->setSrc(2, su->getSrc(arg + 1));
|
||||
atom->setIndirect(0, 0, coord);
|
||||
atom->subOp = su->subOp;
|
||||
|
||||
bld.getBB()->remove(su);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Lowers OP_SUSTP: packs the shader-typed source components into the
// surface's memory format (the inverse of handleSULDP) and emits an
// untyped global store. Mirrors NVC0LoweringPass::convertSurfaceFormat but
// uses 16-bit math where possible.
bool
NV50LoweringPreSSA::handleSUSTP(TexInstruction *su)
{
   const int slot = su->tex.r + 7;
   const int dim = su->tex.target.getDim();
   // Coordinate sources come first; the data components start at arg.
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
   assert(!su->getIndirectR());

   bld.setPosition(su, false);

   const TexInstruction::ImgFormatDesc *format = su->tex.format;
   // Total texel size in bytes decides the width of the raw store.
   const int bytes = (su->tex.format->bits[0] +
                      su->tex.format->bits[1] +
                      su->tex.format->bits[2] +
                      su->tex.format->bits[3]) / 8;
   DataType ty = typeOfSize(bytes);

   Value *coord = processSurfaceCoords(su);

   // The packed values we will eventually store into memory
   Value *untypedDst[4] = {};
   // Each component's packed representation, in 16-bit registers (only used
   // where appropriate)
   Value *untypedDst16[4] = {};
   // The original values that are being packed
   Value *typedDst[4] = {};
   int i;

   for (i = 0; i < bytes / 4; i++)
      untypedDst[i] = bld.getSSA();
   for (i = 0; i < format->components; i++)
      untypedDst16[i] = bld.getSSA(2);
   // Make sure we get at least one of each value allocated for the
   // super-narrow formats.
   if (bytes < 4)
      untypedDst[0] = bld.getSSA();
   if (bytes < 2)
      untypedDst16[0] = bld.getSSA(2);

   // Copy the sources so the conversions below can clobber them in place.
   for (i = 0; i < 4; i++) {
      typedDst[i] = bld.getSSA();
      bld.mkMov(typedDst[i], su->getSrc(arg + i));
   }

   // BGRA formats store with red/blue swapped relative to the shader view.
   if (format->bgra) {
      std::swap(typedDst[0], typedDst[2]);
   }

   // Pack each component into the untyped dsts.
   // bits tracks the bit offset of component i within the packed texel.
   int bits = 0;
   for (int i = 0; i < format->components; bits += format->bits[i], i++) {
      // Un-normalize / convert as necessary
      if (format->type == UNORM)
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << format->bits[i]) - 1)));
      else if (format->type == SNORM)
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << (format->bits[i] - 1)) - 1)));

      // There is nothing to convert/pack for 32-bit values
      if (format->bits[i] == 32) {
         bld.mkMov(untypedDst[i], typedDst[i]);
         continue;
      }

      // The remainder of the cases will naturally want to deal in 16-bit
      // registers. We will put these into untypedDst16 and then merge them
      // together later.
      if (format->type == FLOAT && format->bits[i] < 16) {
         // Small floats (10/11-bit): convert to F16 and drop the extra
         // low mantissa bits.
         bld.mkCvt(OP_CVT, TYPE_F16, untypedDst16[i], TYPE_F32, typedDst[i]);
         bld.mkOp2(OP_SHR, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(15 - format->bits[i])));

         // For odd bit sizes, it's easier to pack it into the final
         // destination directly.
         Value *tmp = bld.getSSA();
         bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]);
         if (i == 0) {
            untypedDst[0] = tmp;
         } else {
            bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits));
            bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp);
         }
      } else if (format->bits[i] == 16) {
         // We can always convert the shader value into the packed value
         // directly here
         bld.mkCvt(OP_CVT, getPackedType(format, i), untypedDst16[i],
                   getShaderType(format->type), typedDst[i]);
      } else if (format->bits[i] < 16) {
         DataType packedType = getPackedType(format, i);
         DataType shaderType = getShaderType(format->type);
         // We can't convert F32 to U8/S8 directly, so go to U16/S16 first.
         if (shaderType == TYPE_F32 && typeSizeof(packedType) == 1) {
            packedType = format->type == SNORM ? TYPE_S16 : TYPE_U16;
         }
         bld.mkCvt(OP_CVT, packedType, untypedDst16[i], shaderType, typedDst[i]);
         // TODO: clamp for 10- and 2-bit sizes. Also, due to the oddness of
         // the size, it's easier to dump them into a 32-bit value and OR
         // everything later.
         if (format->bits[i] != 8) {
            // Restrict value to the appropriate bits (although maybe supposed
            // to clamp instead?)
            bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)((1 << format->bits[i]) - 1)));
            // And merge into final packed value
            Value *tmp = bld.getSSA();
            bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]);
            if (i == 0) {
               untypedDst[0] = tmp;
            } else {
               bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits));
               bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp);
            }
         } else if (i & 1) {
            // Shift the 8-bit value up (so that it can be OR'd later)
            bld.mkOp2(OP_SHL, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(bits % 16)));
         } else if (packedType != TYPE_U8) {
            // S8 (or the *16 if converted from float) will all have high bits
            // set, so AND them out.
            bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)0xff));
         }
      }
   }

   // OR pairs of 8-bit values together (into the even value)
   if (format->bits[0] == 8) {
      for (i = 0; i < 2 && untypedDst16[2 * i] && untypedDst16[2 * i + 1]; i++)
         bld.mkOp2(OP_OR, TYPE_U16, untypedDst16[2 * i], untypedDst16[2 * i], untypedDst16[2 * i + 1]);
   }

   // We'll always want to have at least a 32-bit source register for the store
   Instruction *merge = bld.mkOp(OP_MERGE, bytes < 4 ? TYPE_U32 : ty, bld.getSSA(bytes < 4 ? 4 : bytes));
   if (format->bits[0] == 32) {
      for (i = 0; i < 4 && untypedDst[i]; i++)
         merge->setSrc(i, untypedDst[i]);
   } else if (format->bits[0] == 16) {
      for (i = 0; i < 4 && untypedDst16[i]; i++)
         merge->setSrc(i, untypedDst16[i]);
      // Pad with an undefined half so the merge always has an even source
      // count.
      if (i == 1)
         merge->setSrc(i, bld.getSSA(2));
   } else if (format->bits[0] == 8) {
      // 8-bit components were already OR'd pairwise into the even slots.
      for (i = 0; i < 2 && untypedDst16[2 * i]; i++)
         merge->setSrc(i, untypedDst16[2 * i]);
      if (i == 1)
         merge->setSrc(i, bld.getSSA(2));
   } else {
      // Odd-sized (10/11/2-bit) components were packed into untypedDst[0].
      merge->setSrc(0, untypedDst[0]);
   }

   bld.mkStore(OP_STORE, ty, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, TYPE_U32, 0), coord, merge->getDef(0));

   bld.getBB()->remove(su);
   return true;
}
|
||||
|
||||
bool
|
||||
NV50LoweringPreSSA::handlePFETCH(Instruction *i)
|
||||
{
|
||||
|
@ -1507,6 +2075,16 @@ NV50LoweringPreSSA::visit(Instruction *i)
|
|||
case OP_ATOM:
|
||||
case OP_STORE:
|
||||
return handleLDST(i);
|
||||
case OP_SULDP:
|
||||
return handleSULDP(i->asTex());
|
||||
case OP_SUSTP:
|
||||
return handleSUSTP(i->asTex());
|
||||
case OP_SUREDP:
|
||||
return handleSUREDP(i->asTex());
|
||||
case OP_SUQ:
|
||||
return handleSUQ(i->asTex());
|
||||
case OP_BUFQ:
|
||||
return handleBUFQ(i);
|
||||
case OP_RDSV:
|
||||
return handleRDSV(i);
|
||||
case OP_WRSV:
|
||||
|
|
Loading…
Reference in New Issue