mirror of https://gitlab.freedesktop.org/mesa/mesa (3347 lines, 130 KiB, Rust)
// Copyright © 2022 Collabora, Ltd.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
#![allow(non_upper_case_globals)]
|
|
|
|
use crate::api::GetDebugFlags;
|
|
use crate::api::DEBUG;
|
|
use crate::cfg::CFGBuilder;
|
|
use crate::ir::*;
|
|
use crate::nir::*;
|
|
use crate::nir_instr_printer::NirInstrPrinter;
|
|
use crate::sph::{OutputTopology, PixelImap};
|
|
|
|
use nak_bindings::*;
|
|
|
|
use std::cmp::max;
|
|
use std::collections::{HashMap, HashSet};
|
|
use std::ops::Index;
|
|
|
|
fn init_info_from_nir(nir: &nir_shader, sm: u8) -> ShaderInfo {
|
|
ShaderInfo {
|
|
sm: sm,
|
|
num_gprs: 0,
|
|
num_barriers: 0,
|
|
slm_size: nir.scratch_size,
|
|
uses_global_mem: false,
|
|
writes_global_mem: false,
|
|
// TODO: handle this.
|
|
uses_fp64: false,
|
|
stage: match nir.info.stage() {
|
|
MESA_SHADER_COMPUTE => {
|
|
ShaderStageInfo::Compute(ComputeShaderInfo {
|
|
local_size: [
|
|
nir.info.workgroup_size[0],
|
|
nir.info.workgroup_size[1],
|
|
nir.info.workgroup_size[2],
|
|
],
|
|
smem_size: nir.info.shared_size.try_into().unwrap(),
|
|
})
|
|
}
|
|
MESA_SHADER_VERTEX => ShaderStageInfo::Vertex,
|
|
MESA_SHADER_FRAGMENT => ShaderStageInfo::Fragment,
|
|
MESA_SHADER_GEOMETRY => {
|
|
let info_gs = unsafe { &nir.info.__bindgen_anon_1.gs };
|
|
let output_topology = match info_gs.output_primitive {
|
|
MESA_PRIM_POINTS => OutputTopology::PointList,
|
|
MESA_PRIM_LINE_STRIP => OutputTopology::LineStrip,
|
|
MESA_PRIM_TRIANGLE_STRIP => OutputTopology::TriangleStrip,
|
|
_ => panic!(
|
|
"Invalid GS input primitive {}",
|
|
info_gs.input_primitive
|
|
),
|
|
};
|
|
|
|
ShaderStageInfo::Geometry(GeometryShaderInfo {
|
|
// TODO: Should be set if VK_NV_geometry_shader_passthrough is in use.
|
|
passthrough_enable: false,
|
|
stream_out_mask: info_gs.active_stream_mask(),
|
|
threads_per_input_primitive: info_gs.invocations,
|
|
output_topology: output_topology,
|
|
max_output_vertex_count: info_gs.vertices_out,
|
|
})
|
|
}
|
|
MESA_SHADER_TESS_CTRL => {
|
|
let info_tess = unsafe { &nir.info.__bindgen_anon_1.tess };
|
|
ShaderStageInfo::TessellationInit(TessellationInitShaderInfo {
|
|
per_patch_attribute_count: 6,
|
|
threads_per_patch: info_tess.tcs_vertices_out,
|
|
})
|
|
}
|
|
MESA_SHADER_TESS_EVAL => ShaderStageInfo::Tessellation,
|
|
_ => panic!("Unknown shader stage"),
|
|
},
|
|
io: match nir.info.stage() {
|
|
MESA_SHADER_COMPUTE => ShaderIoInfo::None,
|
|
MESA_SHADER_FRAGMENT => ShaderIoInfo::Fragment(FragmentIoInfo {
|
|
sysvals_in: SysValInfo {
|
|
// Required on fragment shaders, otherwise it cause a trap.
|
|
ab: 1 << 31,
|
|
c: 0,
|
|
},
|
|
sysvals_in_d: [PixelImap::Unused; 8],
|
|
attr_in: [PixelImap::Unused; 128],
|
|
barycentric_attr_in: [0; 4],
|
|
reads_sample_mask: false,
|
|
uses_kill: false,
|
|
writes_color: 0,
|
|
writes_sample_mask: false,
|
|
writes_depth: false,
|
|
// TODO: Should be set if interlocks are in use. (VK_EXT_fragment_shader_interlock)
|
|
does_interlock: false,
|
|
}),
|
|
MESA_SHADER_VERTEX
|
|
| MESA_SHADER_GEOMETRY
|
|
| MESA_SHADER_TESS_CTRL
|
|
| MESA_SHADER_TESS_EVAL => ShaderIoInfo::Vtg(VtgIoInfo {
|
|
sysvals_in: SysValInfo::default(),
|
|
sysvals_in_d: 0,
|
|
sysvals_out: SysValInfo::default(),
|
|
sysvals_out_d: 0,
|
|
attr_in: [0; 4],
|
|
attr_out: [0; 4],
|
|
|
|
// TODO: figure out how to fill this.
|
|
store_req_start: u8::MAX,
|
|
store_req_end: 0,
|
|
}),
|
|
_ => panic!("Unknown shader stage"),
|
|
},
|
|
}
|
|
}
|
|
|
|
fn alloc_ssa_for_nir(b: &mut impl SSABuilder, ssa: &nir_def) -> Vec<SSAValue> {
|
|
let (file, comps) = if ssa.bit_size == 1 {
|
|
(RegFile::Pred, ssa.num_components)
|
|
} else {
|
|
let bits = ssa.bit_size * ssa.num_components;
|
|
(RegFile::GPR, bits.div_ceil(32))
|
|
};
|
|
|
|
let mut vec = Vec::new();
|
|
for _ in 0..comps {
|
|
vec.push(b.alloc_ssa(file, 1)[0]);
|
|
}
|
|
vec
|
|
}
|
|
|
|
struct PhiAllocMap<'a> {
|
|
alloc: &'a mut PhiAllocator,
|
|
map: HashMap<(u32, u8), u32>,
|
|
}
|
|
|
|
impl<'a> PhiAllocMap<'a> {
|
|
fn new(alloc: &'a mut PhiAllocator) -> PhiAllocMap<'a> {
|
|
PhiAllocMap {
|
|
alloc: alloc,
|
|
map: HashMap::new(),
|
|
}
|
|
}
|
|
|
|
fn get_phi_id(&mut self, phi: &nir_phi_instr, comp: u8) -> u32 {
|
|
*self
|
|
.map
|
|
.entry((phi.def.index, comp))
|
|
.or_insert_with(|| self.alloc.alloc())
|
|
}
|
|
}
|
|
|
|
struct PerSizeFloatControls {
|
|
pub ftz: bool,
|
|
pub rnd_mode: FRndMode,
|
|
}
|
|
|
|
struct ShaderFloatControls {
|
|
pub fp16: PerSizeFloatControls,
|
|
pub fp32: PerSizeFloatControls,
|
|
pub fp64: PerSizeFloatControls,
|
|
}
|
|
|
|
impl Default for ShaderFloatControls {
|
|
fn default() -> Self {
|
|
Self {
|
|
fp16: PerSizeFloatControls {
|
|
ftz: false,
|
|
rnd_mode: FRndMode::NearestEven,
|
|
},
|
|
fp32: PerSizeFloatControls {
|
|
ftz: true, // Default FTZ on fp32
|
|
rnd_mode: FRndMode::NearestEven,
|
|
},
|
|
fp64: PerSizeFloatControls {
|
|
ftz: false,
|
|
rnd_mode: FRndMode::NearestEven,
|
|
},
|
|
}
|
|
}
|
|
}
|
|
|
|
impl ShaderFloatControls {
|
|
fn from_nir(nir: &nir_shader) -> ShaderFloatControls {
|
|
let nir_fc = nir.info.float_controls_execution_mode;
|
|
let mut fc: ShaderFloatControls = Default::default();
|
|
|
|
if (nir_fc & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) != 0 {
|
|
fc.fp16.ftz = false;
|
|
} else if (nir_fc & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16) != 0 {
|
|
fc.fp16.ftz = true;
|
|
}
|
|
if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16) != 0 {
|
|
fc.fp16.rnd_mode = FRndMode::NearestEven;
|
|
} else if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16) != 0 {
|
|
fc.fp16.rnd_mode = FRndMode::Zero;
|
|
}
|
|
|
|
if (nir_fc & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) != 0 {
|
|
fc.fp32.ftz = false;
|
|
} else if (nir_fc & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) != 0 {
|
|
fc.fp32.ftz = true;
|
|
}
|
|
if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32) != 0 {
|
|
fc.fp32.rnd_mode = FRndMode::NearestEven;
|
|
} else if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) != 0 {
|
|
fc.fp32.rnd_mode = FRndMode::Zero;
|
|
}
|
|
|
|
if (nir_fc & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) != 0 {
|
|
fc.fp64.ftz = false;
|
|
} else if (nir_fc & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64) != 0 {
|
|
fc.fp64.ftz = true;
|
|
}
|
|
if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) != 0 {
|
|
fc.fp64.rnd_mode = FRndMode::NearestEven;
|
|
} else if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) != 0 {
|
|
fc.fp64.rnd_mode = FRndMode::Zero;
|
|
}
|
|
|
|
fc
|
|
}
|
|
}
|
|
|
|
impl Index<FloatType> for ShaderFloatControls {
|
|
type Output = PerSizeFloatControls;
|
|
|
|
fn index(&self, idx: FloatType) -> &PerSizeFloatControls {
|
|
match idx {
|
|
FloatType::F16 => &self.fp16,
|
|
FloatType::F32 => &self.fp32,
|
|
FloatType::F64 => &self.fp64,
|
|
}
|
|
}
|
|
}
|
|
|
|
struct ShaderFromNir<'a> {
|
|
nir: &'a nir_shader,
|
|
info: ShaderInfo,
|
|
float_ctl: ShaderFloatControls,
|
|
cfg: CFGBuilder<u32, BasicBlock>,
|
|
label_alloc: LabelAllocator,
|
|
block_label: HashMap<u32, Label>,
|
|
bar_label: HashMap<u32, Label>,
|
|
fs_out_regs: [SSAValue; 34],
|
|
end_block_id: u32,
|
|
ssa_map: HashMap<u32, Vec<SSAValue>>,
|
|
saturated: HashSet<*const nir_def>,
|
|
nir_instr_printer: NirInstrPrinter,
|
|
}
|
|
|
|
impl<'a> ShaderFromNir<'a> {
|
|
fn new(nir: &'a nir_shader, sm: u8) -> Self {
|
|
Self {
|
|
nir: nir,
|
|
info: init_info_from_nir(nir, sm),
|
|
float_ctl: ShaderFloatControls::from_nir(nir),
|
|
cfg: CFGBuilder::new(),
|
|
label_alloc: LabelAllocator::new(),
|
|
block_label: HashMap::new(),
|
|
bar_label: HashMap::new(),
|
|
fs_out_regs: [SSAValue::NONE; 34],
|
|
end_block_id: 0,
|
|
ssa_map: HashMap::new(),
|
|
saturated: HashSet::new(),
|
|
nir_instr_printer: NirInstrPrinter::new(),
|
|
}
|
|
}
|
|
|
|
/// Returns the label for a NIR block, allocating one on first use.
fn get_block_label(&mut self, block: &nir_block) -> Label {
    let label_alloc = &mut self.label_alloc;
    let entry = self.block_label.entry(block.index);
    *entry.or_insert_with(|| label_alloc.alloc())
}
|
|
|
|
/// Looks up the backend SSA values previously registered for `ssa` via
/// `set_ssa`.  Panics if the def has not been lowered yet.
fn get_ssa(&mut self, ssa: &nir_def) -> &[SSAValue] {
    self.ssa_map.get(&ssa.index).unwrap()
}
|
|
|
|
/// Registers `vec` as the backend SSA values for `def`.
///
/// Validates the register-file invariants: boolean defs must be all
/// predicates (one per component), everything else must be GPRs packed
/// 32 bits per value.  Panics if the def was already registered.
fn set_ssa(&mut self, def: &nir_def, vec: Vec<SSAValue>) {
    if def.bit_size == 1 {
        for s in vec.iter() {
            assert!(s.is_predicate());
        }
    } else {
        for s in vec.iter() {
            assert!(!s.is_predicate());
        }
        let bits =
            usize::from(def.bit_size) * usize::from(def.num_components);
        assert!(vec.len() == bits.div_ceil(32));
    }
    self.ssa_map
        .entry(def.index)
        .and_modify(|_| panic!("Cannot set an SSA def twice"))
        .or_insert(vec);
}
|
|
|
|
/// Resolves component `c` of `def` to the SSA ref holding it, plus the
/// byte offset of that component within the 32-bit value.
///
/// 8- and 16-bit components share dwords (4 or 2 per dword); 64-bit
/// components span a pair of dwords and are returned as a 2-wide ref.
fn get_ssa_comp(&mut self, def: &nir_def, c: u8) -> (SSARef, u8) {
    let vec = self.get_ssa(def);
    match def.bit_size {
        // Predicates and 32-bit values: one SSA value per component.
        1 | 32 => (vec[usize::from(c)].into(), 0),
        8 => (vec[usize::from(c / 4)].into(), c % 4),
        16 => (vec[usize::from(c / 2)].into(), (c * 2) % 4),
        64 => {
            let lo = usize::from(c) * 2;
            ([vec[lo], vec[lo + 1]].into(), 0)
        }
        _ => panic!("Unsupported bit size: {}", def.bit_size),
    }
}
|
|
|
|
fn get_src(&mut self, src: &nir_src) -> Src {
|
|
SSARef::try_from(self.get_ssa(src.as_def())).unwrap().into()
|
|
}
|
|
|
|
fn get_io_addr_offset(
|
|
&mut self,
|
|
addr: &nir_src,
|
|
imm_bits: u8,
|
|
) -> (Src, i32) {
|
|
let addr = addr.as_def();
|
|
let addr_offset = unsafe {
|
|
nak_get_io_addr_offset(addr as *const _ as *mut _, imm_bits)
|
|
};
|
|
|
|
if let Some(base_def) = std::ptr::NonNull::new(addr_offset.base.def) {
|
|
let base_def = unsafe { base_def.as_ref() };
|
|
let base_comp = u8::try_from(addr_offset.base.comp).unwrap();
|
|
let (base, _) = self.get_ssa_comp(base_def, base_comp);
|
|
(base.into(), addr_offset.offset)
|
|
} else {
|
|
(SrcRef::Zero.into(), addr_offset.offset)
|
|
}
|
|
}
|
|
|
|
fn set_dst(&mut self, def: &nir_def, ssa: SSARef) {
|
|
self.set_ssa(def, (*ssa).into());
|
|
}
|
|
|
|
/// Attempts to fold a trailing fsat into the op producing `def`.
///
/// Only legal when *every* use of the def is an fsat; on success the def
/// is recorded in `self.saturated` so the fsat itself becomes a copy.
/// Returns whether the fold happened.
fn try_saturate_alu_dst(&mut self, def: &nir_def) -> bool {
    let can_fold = def.all_uses_are_fsat();
    if can_fold {
        self.saturated.insert(def as *const _);
    }
    can_fold
}
|
|
|
|
/// Returns whether `src`'s def already had saturation folded into its
/// producing op by `try_saturate_alu_dst`.
fn alu_src_is_saturated(&self, src: &nir_alu_src) -> bool {
    // `contains` is the idiomatic (and clippy-clean) form of the old
    // `get(..).is_some()`.
    self.saturated.contains(&(src.as_def() as *const _))
}
|
|
|
|
fn parse_alu(&mut self, b: &mut impl SSABuilder, alu: &nir_alu_instr) {
|
|
// Handle vectors and pack ops as a special case since they're the only
|
|
// ALU ops that can produce more than 16B. They are also the only ALU
|
|
// ops which we allow to consume small (8 and 16-bit) vector data
|
|
// scattered across multiple dwords
|
|
match alu.op {
|
|
nir_op_mov
|
|
| nir_op_pack_32_4x8_split
|
|
| nir_op_pack_32_2x16_split
|
|
| nir_op_pack_64_2x32_split
|
|
| nir_op_vec2
|
|
| nir_op_vec3
|
|
| nir_op_vec4
|
|
| nir_op_vec5
|
|
| nir_op_vec8
|
|
| nir_op_vec16 => {
|
|
let src_bit_size = alu.get_src(0).src.bit_size();
|
|
let bits = usize::from(alu.def.num_components)
|
|
* usize::from(alu.def.bit_size);
|
|
|
|
// Collect the sources into a vec with src_bit_size per SSA
|
|
// value in the vec. This implicitly makes 64-bit sources look
|
|
// like two 32-bit values
|
|
let mut srcs = Vec::new();
|
|
if alu.op == nir_op_mov {
|
|
let src = alu.get_src(0);
|
|
for c in 0..alu.def.num_components {
|
|
let s = src.swizzle[usize::from(c)];
|
|
let (src, byte) =
|
|
self.get_ssa_comp(src.src.as_def(), s);
|
|
for ssa in src.iter() {
|
|
srcs.push((*ssa, byte));
|
|
}
|
|
}
|
|
} else {
|
|
for src in alu.srcs_as_slice().iter() {
|
|
let s = src.swizzle[0];
|
|
let (src, byte) =
|
|
self.get_ssa_comp(src.src.as_def(), s);
|
|
for ssa in src.iter() {
|
|
srcs.push((*ssa, byte));
|
|
}
|
|
}
|
|
}
|
|
|
|
let mut comps = Vec::new();
|
|
match src_bit_size {
|
|
1 | 32 | 64 => {
|
|
for (ssa, _) in srcs {
|
|
comps.push(ssa);
|
|
}
|
|
}
|
|
8 => {
|
|
for dc in 0..bits.div_ceil(32) {
|
|
let mut psrc = [Src::new_zero(); 4];
|
|
let mut psel = [0_u8; 4];
|
|
|
|
for b in 0..4 {
|
|
let sc = dc * 4 + b;
|
|
if sc < srcs.len() {
|
|
let (ssa, byte) = srcs[sc];
|
|
for i in 0..4_u8 {
|
|
let psrc_i = &mut psrc[usize::from(i)];
|
|
if *psrc_i == Src::new_zero() {
|
|
*psrc_i = ssa.into();
|
|
} else if *psrc_i != Src::from(ssa) {
|
|
continue;
|
|
}
|
|
psel[b] = i * 4 + byte;
|
|
}
|
|
}
|
|
}
|
|
comps.push(b.prmt4(psrc, psel)[0]);
|
|
}
|
|
}
|
|
16 => {
|
|
for dc in 0..bits.div_ceil(32) {
|
|
let mut psrc = [Src::new_zero(); 2];
|
|
let mut psel = [0_u8; 4];
|
|
|
|
for w in 0..2 {
|
|
let sc = dc * 2 + w;
|
|
if sc < srcs.len() {
|
|
let (ssa, byte) = srcs[sc];
|
|
let w_u8 = u8::try_from(w).unwrap();
|
|
psrc[w] = ssa.into();
|
|
psel[w * 2 + 0] = (w_u8 * 4) + byte;
|
|
psel[w * 2 + 1] = (w_u8 * 4) + byte + 1;
|
|
}
|
|
}
|
|
comps.push(b.prmt(psrc[0], psrc[1], psel)[0]);
|
|
}
|
|
}
|
|
_ => panic!("Unknown bit size: {src_bit_size}"),
|
|
}
|
|
|
|
self.set_ssa(&alu.def, comps);
|
|
return;
|
|
}
|
|
_ => (),
|
|
}
|
|
|
|
let mut srcs: Vec<Src> = Vec::new();
|
|
for (i, alu_src) in alu.srcs_as_slice().iter().enumerate() {
|
|
let bit_size = alu_src.src.bit_size();
|
|
let comps = alu.src_components(i.try_into().unwrap());
|
|
let ssa = self.get_ssa(alu_src.src.as_def());
|
|
|
|
match bit_size {
|
|
1 => {
|
|
assert!(comps == 1);
|
|
let s = usize::from(alu_src.swizzle[0]);
|
|
srcs.push(ssa[s].into());
|
|
}
|
|
8 | 16 => {
|
|
let num_bytes = usize::from(comps * (bit_size / 8));
|
|
assert!(num_bytes <= 4);
|
|
|
|
let mut bytes = [0_u8; 4];
|
|
for c in 0..usize::from(comps) {
|
|
let cs = alu_src.swizzle[c];
|
|
if bit_size == 8 {
|
|
bytes[c] = cs;
|
|
} else {
|
|
bytes[c * 2 + 0] = cs * 2 + 0;
|
|
bytes[c * 2 + 1] = cs * 2 + 1;
|
|
}
|
|
}
|
|
|
|
let mut prmt_srcs = [Src::new_zero(); 4];
|
|
let mut prmt = [0_u8; 4];
|
|
for b in 0..num_bytes {
|
|
for (ds, s) in prmt_srcs.iter_mut().enumerate() {
|
|
let dw = ssa[usize::from(bytes[b] / 4)];
|
|
if s.is_zero() {
|
|
*s = dw.into();
|
|
} else if *s != Src::from(dw) {
|
|
continue;
|
|
}
|
|
prmt[usize::from(b)] =
|
|
(ds as u8) * 4 + (bytes[b] % 4);
|
|
break;
|
|
}
|
|
}
|
|
|
|
srcs.push(b.prmt4(prmt_srcs, prmt).into());
|
|
}
|
|
32 => {
|
|
assert!(comps == 1);
|
|
let s = usize::from(alu_src.swizzle[0]);
|
|
srcs.push(ssa[s].into());
|
|
}
|
|
64 => {
|
|
assert!(comps == 1);
|
|
let s = usize::from(alu_src.swizzle[0]);
|
|
srcs.push([ssa[s * 2], ssa[s * 2 + 1]].into());
|
|
}
|
|
_ => panic!("Invalid bit size: {bit_size}"),
|
|
}
|
|
}
|
|
|
|
// Restricts an F16v2 source to just x if the ALU op is single-component. This
|
|
// must only be called for per-component sources (see nir_op_info::output_sizes
|
|
// for more details).
|
|
let restrict_f16v2_src = |mut src: Src| {
|
|
if alu.def.num_components == 1 {
|
|
src.src_swizzle = SrcSwizzle::Xx;
|
|
}
|
|
src
|
|
};
|
|
|
|
let dst: SSARef = match alu.op {
|
|
nir_op_b2b1 => {
|
|
assert!(alu.get_src(0).bit_size() == 32);
|
|
b.isetp(IntCmpType::I32, IntCmpOp::Ne, srcs[0], 0.into())
|
|
}
|
|
nir_op_b2b32 | nir_op_b2i8 | nir_op_b2i16 | nir_op_b2i32 => {
|
|
b.sel(srcs[0].bnot(), 0.into(), 1.into())
|
|
}
|
|
nir_op_b2i64 => {
|
|
let lo = b.sel(srcs[0].bnot(), 0.into(), 1.into());
|
|
let hi = b.copy(0.into());
|
|
[lo[0], hi[0]].into()
|
|
}
|
|
nir_op_b2f16 => b.sel(srcs[0].bnot(), 0.into(), 0x3c00.into()),
|
|
nir_op_b2f32 => {
|
|
b.sel(srcs[0].bnot(), 0.0_f32.into(), 1.0_f32.into())
|
|
}
|
|
nir_op_b2f64 => {
|
|
let lo = b.copy(0.into());
|
|
let hi = b.sel(srcs[0].bnot(), 0.into(), 0x3ff00000.into());
|
|
[lo[0], hi[0]].into()
|
|
}
|
|
nir_op_bcsel => b.sel(srcs[0], srcs[1], srcs[2]),
|
|
nir_op_bfm => {
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpBMsk {
|
|
dst: dst.into(),
|
|
pos: srcs[1],
|
|
width: srcs[0],
|
|
wrap: true,
|
|
});
|
|
dst
|
|
}
|
|
nir_op_bit_count => {
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpPopC {
|
|
dst: dst.into(),
|
|
src: srcs[0],
|
|
});
|
|
dst
|
|
}
|
|
nir_op_bitfield_reverse => b.brev(srcs[0]),
|
|
nir_op_ibitfield_extract | nir_op_ubitfield_extract => {
|
|
let range = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpPrmt {
|
|
dst: range.into(),
|
|
srcs: [srcs[1], srcs[2]],
|
|
sel: 0x0040.into(),
|
|
mode: PrmtMode::Index,
|
|
});
|
|
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpBfe {
|
|
dst: dst.into(),
|
|
base: srcs[0],
|
|
signed: !matches!(alu.op, nir_op_ubitfield_extract),
|
|
range: range.into(),
|
|
reverse: false,
|
|
});
|
|
dst
|
|
}
|
|
nir_op_extract_u8 | nir_op_extract_i8 | nir_op_extract_u16
|
|
| nir_op_extract_i16 => {
|
|
let src1 = alu.get_src(1);
|
|
let elem = src1.src.comp_as_uint(src1.swizzle[0]).unwrap();
|
|
let elem = u8::try_from(elem).unwrap();
|
|
|
|
match alu.op {
|
|
nir_op_extract_u8 => {
|
|
assert!(elem < 4);
|
|
let byte = elem;
|
|
let zero = 4;
|
|
b.prmt(srcs[0], 0.into(), [byte, zero, zero, zero])
|
|
}
|
|
nir_op_extract_i8 => {
|
|
assert!(elem < 4);
|
|
let byte = elem;
|
|
let sign = byte | 0x8;
|
|
b.prmt(srcs[0], 0.into(), [byte, sign, sign, sign])
|
|
}
|
|
nir_op_extract_u16 => {
|
|
assert!(elem < 2);
|
|
let byte = elem * 2;
|
|
let zero = 4;
|
|
b.prmt(srcs[0], 0.into(), [byte, byte + 1, zero, zero])
|
|
}
|
|
nir_op_extract_i16 => {
|
|
assert!(elem < 2);
|
|
let byte = elem * 2;
|
|
let sign = (byte + 1) | 0x8;
|
|
b.prmt(srcs[0], 0.into(), [byte, byte + 1, sign, sign])
|
|
}
|
|
_ => panic!("Unknown extract op: {}", alu.op),
|
|
}
|
|
}
|
|
nir_op_f2f16 | nir_op_f2f16_rtne | nir_op_f2f16_rtz
|
|
| nir_op_f2f32 | nir_op_f2f64 => {
|
|
let src_bits = alu.get_src(0).src.bit_size();
|
|
let dst_bits = alu.def.bit_size();
|
|
let src_type = FloatType::from_bits(src_bits.into());
|
|
let dst_type = FloatType::from_bits(dst_bits.into());
|
|
|
|
let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
|
|
b.push_op(OpF2F {
|
|
dst: dst.into(),
|
|
src: srcs[0],
|
|
src_type: FloatType::from_bits(src_bits.into()),
|
|
dst_type: dst_type,
|
|
rnd_mode: match alu.op {
|
|
nir_op_f2f16_rtne => FRndMode::NearestEven,
|
|
nir_op_f2f16_rtz => FRndMode::Zero,
|
|
_ => self.float_ctl[dst_type].rnd_mode,
|
|
},
|
|
ftz: if src_bits < dst_bits {
|
|
self.float_ctl[src_type].ftz
|
|
} else {
|
|
self.float_ctl[dst_type].ftz
|
|
},
|
|
high: false,
|
|
integer_rnd: false,
|
|
});
|
|
dst
|
|
}
|
|
nir_op_find_lsb => {
|
|
let rev = b.brev(srcs[0]);
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpFlo {
|
|
dst: dst.into(),
|
|
src: rev.into(),
|
|
signed: false,
|
|
return_shift_amount: true,
|
|
});
|
|
dst
|
|
}
|
|
nir_op_f2i8 | nir_op_f2i16 | nir_op_f2i32 | nir_op_f2i64
|
|
| nir_op_f2u8 | nir_op_f2u16 | nir_op_f2u32 | nir_op_f2u64 => {
|
|
let src_bits = usize::from(alu.get_src(0).bit_size());
|
|
let dst_bits = alu.def.bit_size();
|
|
let src_type = FloatType::from_bits(src_bits);
|
|
let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
|
|
let dst_is_signed = alu.info().output_type & 2 != 0;
|
|
let dst_type =
|
|
IntType::from_bits(dst_bits.into(), dst_is_signed);
|
|
if b.sm() < 70 && dst_bits == 8 {
|
|
// F2I doesn't support 8-bit destinations pre-Volta
|
|
let tmp = b.alloc_ssa(RegFile::GPR, 1);
|
|
let tmp_type = IntType::from_bits(32, dst_is_signed);
|
|
b.push_op(OpF2I {
|
|
dst: tmp.into(),
|
|
src: srcs[0],
|
|
src_type,
|
|
dst_type: tmp_type,
|
|
rnd_mode: FRndMode::Zero,
|
|
ftz: self.float_ctl[src_type].ftz,
|
|
});
|
|
b.push_op(OpI2I {
|
|
dst: dst.into(),
|
|
src: tmp.into(),
|
|
src_type: tmp_type,
|
|
dst_type,
|
|
saturate: true,
|
|
abs: false,
|
|
neg: false,
|
|
});
|
|
} else {
|
|
b.push_op(OpF2I {
|
|
dst: dst.into(),
|
|
src: srcs[0],
|
|
src_type,
|
|
dst_type,
|
|
rnd_mode: FRndMode::Zero,
|
|
ftz: self.float_ctl[src_type].ftz,
|
|
});
|
|
}
|
|
dst
|
|
}
|
|
nir_op_fabs | nir_op_fadd | nir_op_fneg => {
|
|
let (x, y) = match alu.op {
|
|
nir_op_fabs => (srcs[0].fabs(), 0.into()),
|
|
nir_op_fadd => (srcs[0], srcs[1]),
|
|
nir_op_fneg => (Src::new_zero().fneg(), srcs[0].fneg()),
|
|
_ => panic!("Unhandled case"),
|
|
};
|
|
let ftype = FloatType::from_bits(alu.def.bit_size().into());
|
|
let dst;
|
|
if alu.def.bit_size() == 64 {
|
|
dst = b.alloc_ssa(RegFile::GPR, 2);
|
|
b.push_op(OpDAdd {
|
|
dst: dst.into(),
|
|
srcs: [x, y],
|
|
rnd_mode: self.float_ctl[ftype].rnd_mode,
|
|
});
|
|
} else if alu.def.bit_size() == 32 {
|
|
dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpFAdd {
|
|
dst: dst.into(),
|
|
srcs: [x, y],
|
|
saturate: self.try_saturate_alu_dst(&alu.def),
|
|
rnd_mode: self.float_ctl[ftype].rnd_mode,
|
|
ftz: self.float_ctl[ftype].ftz,
|
|
});
|
|
} else if alu.def.bit_size() == 16 {
|
|
assert!(
|
|
self.float_ctl[ftype].rnd_mode == FRndMode::NearestEven
|
|
);
|
|
|
|
dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpHAdd2 {
|
|
dst: dst.into(),
|
|
srcs: [restrict_f16v2_src(x), restrict_f16v2_src(y)],
|
|
saturate: self.try_saturate_alu_dst(&alu.def),
|
|
ftz: self.float_ctl[ftype].ftz,
|
|
f32: false,
|
|
});
|
|
} else {
|
|
panic!("Unsupported float type: f{}", alu.def.bit_size());
|
|
}
|
|
dst
|
|
}
|
|
nir_op_fceil | nir_op_ffloor | nir_op_fround_even
|
|
| nir_op_ftrunc => {
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
let ty = FloatType::from_bits(alu.def.bit_size().into());
|
|
let rnd_mode = match alu.op {
|
|
nir_op_fceil => FRndMode::PosInf,
|
|
nir_op_ffloor => FRndMode::NegInf,
|
|
nir_op_ftrunc => FRndMode::Zero,
|
|
nir_op_fround_even => FRndMode::NearestEven,
|
|
_ => unreachable!(),
|
|
};
|
|
let ftz = self.float_ctl[ty].ftz;
|
|
if b.sm() >= 70 {
|
|
assert!(
|
|
alu.def.bit_size() == 32 || alu.def.bit_size() == 16
|
|
);
|
|
b.push_op(OpFRnd {
|
|
dst: dst.into(),
|
|
src: srcs[0],
|
|
src_type: ty,
|
|
dst_type: ty,
|
|
rnd_mode,
|
|
ftz,
|
|
});
|
|
} else {
|
|
assert!(alu.def.bit_size() == 32);
|
|
b.push_op(OpF2F {
|
|
dst: dst.into(),
|
|
src: srcs[0],
|
|
src_type: ty,
|
|
dst_type: ty,
|
|
rnd_mode,
|
|
ftz,
|
|
integer_rnd: true,
|
|
high: false,
|
|
});
|
|
}
|
|
dst
|
|
}
|
|
nir_op_fcos => b.fcos(srcs[0]),
|
|
nir_op_feq | nir_op_fge | nir_op_flt | nir_op_fneu => {
|
|
let src_type =
|
|
FloatType::from_bits(alu.get_src(0).bit_size().into());
|
|
let cmp_op = match alu.op {
|
|
nir_op_feq => FloatCmpOp::OrdEq,
|
|
nir_op_fge => FloatCmpOp::OrdGe,
|
|
nir_op_flt => FloatCmpOp::OrdLt,
|
|
nir_op_fneu => FloatCmpOp::UnordNe,
|
|
_ => panic!("Usupported float comparison"),
|
|
};
|
|
|
|
let dst = b.alloc_ssa(RegFile::Pred, alu.def.num_components);
|
|
if alu.get_src(0).bit_size() == 64 {
|
|
assert!(alu.def.num_components == 1);
|
|
b.push_op(OpDSetP {
|
|
dst: dst.into(),
|
|
set_op: PredSetOp::And,
|
|
cmp_op: cmp_op,
|
|
srcs: [srcs[0], srcs[1]],
|
|
accum: SrcRef::True.into(),
|
|
});
|
|
} else if alu.get_src(0).bit_size() == 32 {
|
|
assert!(alu.def.num_components == 1);
|
|
b.push_op(OpFSetP {
|
|
dst: dst.into(),
|
|
set_op: PredSetOp::And,
|
|
cmp_op: cmp_op,
|
|
srcs: [srcs[0], srcs[1]],
|
|
accum: SrcRef::True.into(),
|
|
ftz: self.float_ctl[src_type].ftz,
|
|
});
|
|
} else if alu.get_src(0).bit_size() == 16 {
|
|
assert!(
|
|
alu.def.num_components == 1
|
|
|| alu.def.num_components == 2
|
|
);
|
|
|
|
let dsts = if alu.def.num_components == 2 {
|
|
[dst[0].into(), dst[1].into()]
|
|
} else {
|
|
[dst[0].into(), Dst::None]
|
|
};
|
|
|
|
b.push_op(OpHSetP2 {
|
|
dsts,
|
|
set_op: PredSetOp::And,
|
|
cmp_op: cmp_op,
|
|
srcs: [
|
|
restrict_f16v2_src(srcs[0]),
|
|
restrict_f16v2_src(srcs[1]),
|
|
],
|
|
accum: SrcRef::True.into(),
|
|
ftz: self.float_ctl[src_type].ftz,
|
|
horizontal: false,
|
|
});
|
|
} else {
|
|
panic!(
|
|
"Unsupported float type: f{}",
|
|
alu.get_src(0).bit_size()
|
|
);
|
|
}
|
|
dst
|
|
}
|
|
nir_op_fexp2 => b.fexp2(srcs[0]),
|
|
nir_op_ffma => {
|
|
let ftype = FloatType::from_bits(alu.def.bit_size().into());
|
|
let dst;
|
|
if alu.def.bit_size() == 64 {
|
|
debug_assert!(!self.float_ctl[ftype].ftz);
|
|
dst = b.alloc_ssa(RegFile::GPR, 2);
|
|
b.push_op(OpDFma {
|
|
dst: dst.into(),
|
|
srcs: [srcs[0], srcs[1], srcs[2]],
|
|
rnd_mode: self.float_ctl[ftype].rnd_mode,
|
|
});
|
|
} else if alu.def.bit_size() == 32 {
|
|
dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpFFma {
|
|
dst: dst.into(),
|
|
srcs: [srcs[0], srcs[1], srcs[2]],
|
|
saturate: self.try_saturate_alu_dst(&alu.def),
|
|
rnd_mode: self.float_ctl[ftype].rnd_mode,
|
|
// The hardware doesn't like FTZ+DNZ and DNZ implies FTZ
|
|
// anyway so only set one of the two bits.
|
|
ftz: self.float_ctl[ftype].ftz,
|
|
dnz: false,
|
|
});
|
|
} else if alu.def.bit_size() == 16 {
|
|
assert!(
|
|
self.float_ctl[ftype].rnd_mode == FRndMode::NearestEven
|
|
);
|
|
|
|
dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpHFma2 {
|
|
dst: dst.into(),
|
|
srcs: [
|
|
restrict_f16v2_src(srcs[0]),
|
|
restrict_f16v2_src(srcs[1]),
|
|
restrict_f16v2_src(srcs[2]),
|
|
],
|
|
saturate: self.try_saturate_alu_dst(&alu.def),
|
|
ftz: self.float_ctl[ftype].ftz,
|
|
dnz: false,
|
|
f32: false,
|
|
});
|
|
} else {
|
|
panic!("Unsupported float type: f{}", alu.def.bit_size());
|
|
}
|
|
dst
|
|
}
|
|
nir_op_ffmaz => {
|
|
assert!(alu.def.bit_size() == 32);
|
|
// DNZ implies FTZ so we need FTZ set or this is invalid
|
|
assert!(self.float_ctl.fp32.ftz);
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpFFma {
|
|
dst: dst.into(),
|
|
srcs: [srcs[0], srcs[1], srcs[2]],
|
|
saturate: self.try_saturate_alu_dst(&alu.def),
|
|
rnd_mode: self.float_ctl.fp32.rnd_mode,
|
|
// The hardware doesn't like FTZ+DNZ and DNZ implies FTZ
|
|
// anyway so only set one of the two bits.
|
|
ftz: false,
|
|
dnz: true,
|
|
});
|
|
dst
|
|
}
|
|
nir_op_flog2 => {
|
|
assert!(alu.def.bit_size() == 32);
|
|
b.mufu(MuFuOp::Log2, srcs[0])
|
|
}
|
|
nir_op_fmax | nir_op_fmin => {
|
|
let dst;
|
|
if alu.def.bit_size() == 64 {
|
|
dst = b.alloc_ssa(RegFile::GPR, 2);
|
|
b.push_op(OpDMnMx {
|
|
dst: dst.into(),
|
|
srcs: [srcs[0], srcs[1]],
|
|
min: (alu.op == nir_op_fmin).into(),
|
|
});
|
|
} else if alu.def.bit_size() == 32 {
|
|
dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpFMnMx {
|
|
dst: dst.into(),
|
|
srcs: [srcs[0], srcs[1]],
|
|
min: (alu.op == nir_op_fmin).into(),
|
|
ftz: self.float_ctl.fp32.ftz,
|
|
});
|
|
} else if alu.def.bit_size() == 16 {
|
|
dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpHMnMx2 {
|
|
dst: dst.into(),
|
|
srcs: [
|
|
restrict_f16v2_src(srcs[0]),
|
|
restrict_f16v2_src(srcs[1]),
|
|
],
|
|
min: (alu.op == nir_op_fmin).into(),
|
|
ftz: self.float_ctl.fp16.ftz,
|
|
});
|
|
} else {
|
|
panic!("Unsupported float type: f{}", alu.def.bit_size());
|
|
}
|
|
dst
|
|
}
|
|
nir_op_fmul => {
|
|
let ftype = FloatType::from_bits(alu.def.bit_size().into());
|
|
let dst;
|
|
if alu.def.bit_size() == 64 {
|
|
debug_assert!(!self.float_ctl[ftype].ftz);
|
|
dst = b.alloc_ssa(RegFile::GPR, 2);
|
|
b.push_op(OpDMul {
|
|
dst: dst.into(),
|
|
srcs: [srcs[0], srcs[1]],
|
|
rnd_mode: self.float_ctl[ftype].rnd_mode,
|
|
});
|
|
} else if alu.def.bit_size() == 32 {
|
|
dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpFMul {
|
|
dst: dst.into(),
|
|
srcs: [srcs[0], srcs[1]],
|
|
saturate: self.try_saturate_alu_dst(&alu.def),
|
|
rnd_mode: self.float_ctl[ftype].rnd_mode,
|
|
ftz: self.float_ctl[ftype].ftz,
|
|
dnz: false,
|
|
});
|
|
} else if alu.def.bit_size() == 16 {
|
|
assert!(
|
|
self.float_ctl[ftype].rnd_mode == FRndMode::NearestEven
|
|
);
|
|
|
|
dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpHMul2 {
|
|
dst: dst.into(),
|
|
srcs: [
|
|
restrict_f16v2_src(srcs[0]),
|
|
restrict_f16v2_src(srcs[1]),
|
|
],
|
|
saturate: self.try_saturate_alu_dst(&alu.def),
|
|
ftz: self.float_ctl[ftype].ftz,
|
|
dnz: false,
|
|
});
|
|
} else {
|
|
panic!("Unsupported float type: f{}", alu.def.bit_size());
|
|
}
|
|
dst
|
|
}
|
|
nir_op_fmulz => {
|
|
assert!(alu.def.bit_size() == 32);
|
|
// DNZ implies FTZ so we need FTZ set or this is invalid
|
|
assert!(self.float_ctl.fp32.ftz);
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpFMul {
|
|
dst: dst.into(),
|
|
srcs: [srcs[0], srcs[1]],
|
|
saturate: self.try_saturate_alu_dst(&alu.def),
|
|
rnd_mode: self.float_ctl.fp32.rnd_mode,
|
|
// The hardware doesn't like FTZ+DNZ and DNZ implies FTZ
|
|
// anyway so only set one of the two bits.
|
|
ftz: false,
|
|
dnz: true,
|
|
});
|
|
dst
|
|
}
|
|
nir_op_fquantize2f16 => {
|
|
let tmp = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpF2F {
|
|
dst: tmp.into(),
|
|
src: srcs[0],
|
|
src_type: FloatType::F32,
|
|
dst_type: FloatType::F16,
|
|
rnd_mode: FRndMode::NearestEven,
|
|
ftz: true,
|
|
high: false,
|
|
integer_rnd: false,
|
|
});
|
|
assert!(alu.def.bit_size() == 32);
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpF2F {
|
|
dst: dst.into(),
|
|
src: tmp.into(),
|
|
src_type: FloatType::F16,
|
|
dst_type: FloatType::F32,
|
|
rnd_mode: FRndMode::NearestEven,
|
|
ftz: true,
|
|
high: false,
|
|
integer_rnd: false,
|
|
});
|
|
dst
|
|
}
|
|
nir_op_frcp => {
|
|
assert!(alu.def.bit_size() == 32);
|
|
b.mufu(MuFuOp::Rcp, srcs[0])
|
|
}
|
|
nir_op_frsq => {
|
|
assert!(alu.def.bit_size() == 32);
|
|
b.mufu(MuFuOp::Rsq, srcs[0])
|
|
}
|
|
nir_op_fsat => {
|
|
let ftype = FloatType::from_bits(alu.def.bit_size().into());
|
|
|
|
if self.alu_src_is_saturated(&alu.srcs_as_slice()[0]) {
|
|
b.copy(srcs[0])
|
|
} else if alu.def.bit_size() == 32 {
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpFAdd {
|
|
dst: dst.into(),
|
|
srcs: [srcs[0], 0.into()],
|
|
saturate: true,
|
|
rnd_mode: self.float_ctl[ftype].rnd_mode,
|
|
ftz: self.float_ctl[ftype].ftz,
|
|
});
|
|
dst
|
|
} else if alu.def.bit_size() == 16 {
|
|
assert!(
|
|
self.float_ctl[ftype].rnd_mode == FRndMode::NearestEven
|
|
);
|
|
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpHAdd2 {
|
|
dst: dst.into(),
|
|
srcs: [restrict_f16v2_src(srcs[0]), 0.into()],
|
|
saturate: true,
|
|
ftz: self.float_ctl[ftype].ftz,
|
|
f32: false,
|
|
});
|
|
dst
|
|
} else {
|
|
panic!("Unsupported float type: f{}", alu.def.bit_size());
|
|
}
|
|
}
|
|
nir_op_fsign => {
|
|
if alu.def.bit_size() == 64 {
|
|
let lz = b.dsetp(FloatCmpOp::OrdLt, srcs[0], 0.into());
|
|
let gz = b.dsetp(FloatCmpOp::OrdGt, srcs[0], 0.into());
|
|
let hi = b.sel(lz.into(), 0xbff00000.into(), 0.into());
|
|
let hi = b.sel(gz.into(), 0x3ff00000.into(), hi.into());
|
|
let lo = b.copy(0.into());
|
|
[lo[0], hi[0]].into()
|
|
} else if alu.def.bit_size() == 32 {
|
|
let lz = b.fset(FloatCmpOp::OrdLt, srcs[0], 0.into());
|
|
let gz = b.fset(FloatCmpOp::OrdGt, srcs[0], 0.into());
|
|
b.fadd(gz.into(), Src::from(lz).fneg())
|
|
} else if alu.def.bit_size() == 16 {
|
|
let x = restrict_f16v2_src(srcs[0]);
|
|
|
|
let lz = restrict_f16v2_src(
|
|
b.hset2(FloatCmpOp::OrdLt, x, 0.into()).into(),
|
|
);
|
|
let gz = restrict_f16v2_src(
|
|
b.hset2(FloatCmpOp::OrdGt, x, 0.into()).into(),
|
|
);
|
|
|
|
b.hadd2(gz, lz.fneg())
|
|
} else {
|
|
panic!("Unsupported float type: f{}", alu.def.bit_size());
|
|
}
|
|
}
|
|
nir_op_fsin => b.fsin(srcs[0]),
|
|
nir_op_fsqrt => b.mufu(MuFuOp::Sqrt, srcs[0]),
|
|
nir_op_i2f16 | nir_op_i2f32 | nir_op_i2f64 => {
|
|
let src_bits = alu.get_src(0).src.bit_size();
|
|
let dst_bits = alu.def.bit_size();
|
|
let dst_type = FloatType::from_bits(dst_bits.into());
|
|
let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
|
|
b.push_op(OpI2F {
|
|
dst: dst.into(),
|
|
src: srcs[0],
|
|
dst_type: dst_type,
|
|
src_type: IntType::from_bits(src_bits.into(), true),
|
|
rnd_mode: self.float_ctl[dst_type].rnd_mode,
|
|
});
|
|
dst
|
|
}
|
|
nir_op_i2i8 | nir_op_i2i16 | nir_op_i2i32 | nir_op_i2i64
|
|
| nir_op_u2u8 | nir_op_u2u16 | nir_op_u2u32 | nir_op_u2u64 => {
|
|
let src_bits = alu.get_src(0).src.bit_size();
|
|
let dst_bits = alu.def.bit_size();
|
|
|
|
let mut prmt = [0_u8; 8];
|
|
match alu.op {
|
|
nir_op_i2i8 | nir_op_i2i16 | nir_op_i2i32
|
|
| nir_op_i2i64 => {
|
|
let sign = ((src_bits / 8) - 1) | 0x8;
|
|
for i in 0..8 {
|
|
if i < (src_bits / 8) {
|
|
prmt[usize::from(i)] = i;
|
|
} else {
|
|
prmt[usize::from(i)] = sign;
|
|
}
|
|
}
|
|
}
|
|
nir_op_u2u8 | nir_op_u2u16 | nir_op_u2u32
|
|
| nir_op_u2u64 => {
|
|
for i in 0..8 {
|
|
if i < (src_bits / 8) {
|
|
prmt[usize::from(i)] = i;
|
|
} else {
|
|
prmt[usize::from(i)] = 4;
|
|
}
|
|
}
|
|
}
|
|
_ => panic!("Invalid integer conversion: {}", alu.op),
|
|
}
|
|
let prmt_lo: [u8; 4] = prmt[0..4].try_into().unwrap();
|
|
let prmt_hi: [u8; 4] = prmt[4..8].try_into().unwrap();
|
|
|
|
let src = srcs[0].as_ssa().unwrap();
|
|
if src_bits == 64 {
|
|
if dst_bits == 64 {
|
|
*src
|
|
} else {
|
|
b.prmt(src[0].into(), src[1].into(), prmt_lo)
|
|
}
|
|
} else {
|
|
if dst_bits == 64 {
|
|
let lo = b.prmt(src[0].into(), 0.into(), prmt_lo);
|
|
let hi = b.prmt(src[0].into(), 0.into(), prmt_hi);
|
|
[lo[0], hi[0]].into()
|
|
} else {
|
|
b.prmt(src[0].into(), 0.into(), prmt_lo)
|
|
}
|
|
}
|
|
}
|
|
nir_op_iabs => b.iabs(srcs[0]),
|
|
nir_op_iadd => match alu.def.bit_size {
|
|
32 => b.iadd(srcs[0], srcs[1], 0.into()),
|
|
64 => b.iadd64(srcs[0], srcs[1], 0.into()),
|
|
x => panic!("unsupported bit size for nir_op_iadd: {x}"),
|
|
},
|
|
nir_op_iadd3 => match alu.def.bit_size {
|
|
32 => b.iadd(srcs[0], srcs[1], srcs[2]),
|
|
64 => b.iadd64(srcs[0], srcs[1], srcs[2]),
|
|
x => panic!("unsupported bit size for nir_op_iadd3: {x}"),
|
|
},
|
|
nir_op_iand => b.lop2(LogicOp2::And, srcs[0], srcs[1]),
|
|
nir_op_ieq => {
|
|
if alu.get_src(0).bit_size() == 1 {
|
|
b.lop2(LogicOp2::Xor, srcs[0], srcs[1].bnot())
|
|
} else if alu.get_src(0).bit_size() == 64 {
|
|
b.isetp64(IntCmpType::I32, IntCmpOp::Eq, srcs[0], srcs[1])
|
|
} else {
|
|
assert!(alu.get_src(0).bit_size() == 32);
|
|
b.isetp(IntCmpType::I32, IntCmpOp::Eq, srcs[0], srcs[1])
|
|
}
|
|
}
|
|
nir_op_ifind_msb | nir_op_ifind_msb_rev | nir_op_ufind_msb
|
|
| nir_op_ufind_msb_rev => {
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpFlo {
|
|
dst: dst.into(),
|
|
src: srcs[0],
|
|
signed: match alu.op {
|
|
nir_op_ifind_msb | nir_op_ifind_msb_rev => true,
|
|
nir_op_ufind_msb | nir_op_ufind_msb_rev => false,
|
|
_ => panic!("Not a find_msb op"),
|
|
},
|
|
return_shift_amount: match alu.op {
|
|
nir_op_ifind_msb | nir_op_ufind_msb => false,
|
|
nir_op_ifind_msb_rev | nir_op_ufind_msb_rev => true,
|
|
_ => panic!("Not a find_msb op"),
|
|
},
|
|
});
|
|
dst
|
|
}
|
|
nir_op_ige | nir_op_ilt | nir_op_uge | nir_op_ult => {
|
|
let x = *srcs[0].as_ssa().unwrap();
|
|
let y = *srcs[1].as_ssa().unwrap();
|
|
let (cmp_type, cmp_op) = match alu.op {
|
|
nir_op_ige => (IntCmpType::I32, IntCmpOp::Ge),
|
|
nir_op_ilt => (IntCmpType::I32, IntCmpOp::Lt),
|
|
nir_op_uge => (IntCmpType::U32, IntCmpOp::Ge),
|
|
nir_op_ult => (IntCmpType::U32, IntCmpOp::Lt),
|
|
_ => panic!("Not an integer comparison"),
|
|
};
|
|
if alu.get_src(0).bit_size() == 64 {
|
|
b.isetp64(cmp_type, cmp_op, x.into(), y.into())
|
|
} else {
|
|
assert!(alu.get_src(0).bit_size() == 32);
|
|
b.isetp(cmp_type, cmp_op, x.into(), y.into())
|
|
}
|
|
}
|
|
nir_op_imad => {
|
|
assert!(alu.def.bit_size() == 32);
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpIMad {
|
|
dst: dst.into(),
|
|
srcs: [srcs[0], srcs[1], srcs[2]],
|
|
signed: false,
|
|
});
|
|
dst
|
|
}
|
|
nir_op_imax | nir_op_imin | nir_op_umax | nir_op_umin => {
|
|
let (tp, min) = match alu.op {
|
|
nir_op_imax => (IntCmpType::I32, SrcRef::False),
|
|
nir_op_imin => (IntCmpType::I32, SrcRef::True),
|
|
nir_op_umax => (IntCmpType::U32, SrcRef::False),
|
|
nir_op_umin => (IntCmpType::U32, SrcRef::True),
|
|
_ => panic!("Not an integer min/max"),
|
|
};
|
|
assert!(alu.def.bit_size() == 32);
|
|
b.imnmx(tp, srcs[0], srcs[1], min.into())
|
|
}
|
|
nir_op_imul => {
|
|
assert!(alu.def.bit_size() == 32);
|
|
b.imul(srcs[0], srcs[1])
|
|
}
|
|
nir_op_imul_2x32_64 | nir_op_umul_2x32_64 => {
|
|
let signed = alu.op == nir_op_imul_2x32_64;
|
|
b.imul_2x32_64(srcs[0], srcs[1], signed)
|
|
}
|
|
nir_op_imul_high | nir_op_umul_high => {
|
|
let signed = alu.op == nir_op_imul_high;
|
|
let dst64 = b.imul_2x32_64(srcs[0], srcs[1], signed);
|
|
dst64[1].into()
|
|
}
|
|
nir_op_ine => {
|
|
if alu.get_src(0).bit_size() == 1 {
|
|
b.lop2(LogicOp2::Xor, srcs[0], srcs[1])
|
|
} else if alu.get_src(0).bit_size() == 64 {
|
|
b.isetp64(IntCmpType::I32, IntCmpOp::Ne, srcs[0], srcs[1])
|
|
} else {
|
|
assert!(alu.get_src(0).bit_size() == 32);
|
|
b.isetp(IntCmpType::I32, IntCmpOp::Ne, srcs[0], srcs[1])
|
|
}
|
|
}
|
|
nir_op_ineg => {
|
|
if alu.def.bit_size == 64 {
|
|
let x = srcs[0].as_ssa().unwrap();
|
|
let sum = b.alloc_ssa(RegFile::GPR, 2);
|
|
let carry = b.alloc_ssa(RegFile::Pred, 1);
|
|
b.push_op(OpIAdd3 {
|
|
dst: sum[0].into(),
|
|
overflow: [carry.into(), Dst::None],
|
|
srcs: [0.into(), Src::from(x[0]).ineg(), 0.into()],
|
|
});
|
|
b.push_op(OpIAdd3X {
|
|
dst: sum[1].into(),
|
|
overflow: [Dst::None, Dst::None],
|
|
srcs: [0.into(), Src::from(x[1]).bnot(), 0.into()],
|
|
carry: [carry.into(), SrcRef::False.into()],
|
|
});
|
|
sum
|
|
} else {
|
|
assert!(alu.def.bit_size() == 32);
|
|
b.ineg(srcs[0])
|
|
}
|
|
}
|
|
nir_op_inot => {
|
|
if alu.def.bit_size() == 1 {
|
|
b.lop2(LogicOp2::PassB, true.into(), srcs[0].bnot())
|
|
} else {
|
|
assert!(alu.def.bit_size() == 32);
|
|
b.lop2(LogicOp2::PassB, 0.into(), srcs[0].bnot())
|
|
}
|
|
}
|
|
nir_op_ior => b.lop2(LogicOp2::Or, srcs[0], srcs[1]),
|
|
nir_op_ishl => {
|
|
let x = *srcs[0].as_ssa().unwrap();
|
|
let shift = srcs[1];
|
|
if alu.def.bit_size() == 64 {
|
|
// For 64-bit shifts, we have to use clamp mode so we need
|
|
// to mask the shift in order satisfy NIR semantics.
|
|
let shift = b.lop2(LogicOp2::And, shift, 0x3f.into());
|
|
let dst = b.alloc_ssa(RegFile::GPR, 2);
|
|
b.push_op(OpShf {
|
|
dst: dst[0].into(),
|
|
low: 0.into(),
|
|
high: x[0].into(),
|
|
shift: shift.into(),
|
|
right: false,
|
|
wrap: false,
|
|
data_type: IntType::U32,
|
|
dst_high: true,
|
|
});
|
|
b.push_op(OpShf {
|
|
dst: dst[1].into(),
|
|
low: x[0].into(),
|
|
high: x[1].into(),
|
|
shift: shift.into(),
|
|
right: false,
|
|
wrap: false,
|
|
data_type: IntType::U64,
|
|
dst_high: true,
|
|
});
|
|
dst
|
|
} else {
|
|
assert!(alu.def.bit_size() == 32);
|
|
b.shl(srcs[0], srcs[1])
|
|
}
|
|
}
|
|
nir_op_ishr => {
|
|
let x = *srcs[0].as_ssa().unwrap();
|
|
let shift = srcs[1];
|
|
if alu.def.bit_size() == 64 {
|
|
// For 64-bit shifts, we have to use clamp mode so we need
|
|
// to mask the shift in order satisfy NIR semantics.
|
|
let shift = b.lop2(LogicOp2::And, shift, 0x3f.into());
|
|
let dst = b.alloc_ssa(RegFile::GPR, 2);
|
|
b.push_op(OpShf {
|
|
dst: dst[0].into(),
|
|
low: x[0].into(),
|
|
high: x[1].into(),
|
|
shift: shift.into(),
|
|
right: true,
|
|
wrap: false,
|
|
data_type: IntType::I64,
|
|
dst_high: false,
|
|
});
|
|
b.push_op(OpShf {
|
|
dst: dst[1].into(),
|
|
low: x[0].into(),
|
|
high: x[1].into(),
|
|
shift: shift.into(),
|
|
right: true,
|
|
wrap: false,
|
|
data_type: IntType::I32,
|
|
dst_high: true,
|
|
});
|
|
dst
|
|
} else {
|
|
assert!(alu.def.bit_size() == 32);
|
|
b.shr(srcs[0], srcs[1], true)
|
|
}
|
|
}
|
|
nir_op_ixor => b.lop2(LogicOp2::Xor, srcs[0], srcs[1]),
|
|
nir_op_pack_half_2x16_split | nir_op_pack_half_2x16_rtz_split => {
|
|
assert!(alu.get_src(0).bit_size() == 32);
|
|
let low = b.alloc_ssa(RegFile::GPR, 1);
|
|
let high = b.alloc_ssa(RegFile::GPR, 1);
|
|
|
|
let rnd_mode = match alu.op {
|
|
nir_op_pack_half_2x16_split => FRndMode::NearestEven,
|
|
nir_op_pack_half_2x16_rtz_split => FRndMode::Zero,
|
|
_ => panic!("Unhandled fp16 pack op"),
|
|
};
|
|
|
|
b.push_op(OpF2F {
|
|
dst: low.into(),
|
|
src: srcs[0],
|
|
src_type: FloatType::F32,
|
|
dst_type: FloatType::F16,
|
|
rnd_mode: rnd_mode,
|
|
ftz: false,
|
|
high: false,
|
|
integer_rnd: false,
|
|
});
|
|
|
|
let src_bits = usize::from(alu.get_src(1).bit_size());
|
|
let src_type = FloatType::from_bits(src_bits);
|
|
assert!(matches!(src_type, FloatType::F32));
|
|
b.push_op(OpF2F {
|
|
dst: high.into(),
|
|
src: srcs[1],
|
|
src_type: FloatType::F32,
|
|
dst_type: FloatType::F16,
|
|
rnd_mode: rnd_mode,
|
|
ftz: false,
|
|
high: false,
|
|
integer_rnd: false,
|
|
});
|
|
|
|
b.prmt(low.into(), high.into(), [0, 1, 4, 5])
|
|
}
|
|
nir_op_sdot_4x8_iadd => {
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpIDp4 {
|
|
dst: dst.into(),
|
|
src_types: [IntType::I8, IntType::I8],
|
|
srcs: [srcs[0], srcs[1], srcs[2]],
|
|
});
|
|
dst
|
|
}
|
|
nir_op_sudot_4x8_iadd => {
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpIDp4 {
|
|
dst: dst.into(),
|
|
src_types: [IntType::I8, IntType::U8],
|
|
srcs: [srcs[0], srcs[1], srcs[2]],
|
|
});
|
|
dst
|
|
}
|
|
nir_op_udot_4x8_uadd => {
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpIDp4 {
|
|
dst: dst.into(),
|
|
src_types: [IntType::U8, IntType::U8],
|
|
srcs: [srcs[0], srcs[1], srcs[2]],
|
|
});
|
|
dst
|
|
}
|
|
nir_op_u2f16 | nir_op_u2f32 | nir_op_u2f64 => {
|
|
let src_bits = alu.get_src(0).src.bit_size();
|
|
let dst_bits = alu.def.bit_size();
|
|
let dst_type = FloatType::from_bits(dst_bits.into());
|
|
let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
|
|
b.push_op(OpI2F {
|
|
dst: dst.into(),
|
|
src: srcs[0],
|
|
dst_type: dst_type,
|
|
src_type: IntType::from_bits(src_bits.into(), false),
|
|
rnd_mode: self.float_ctl[dst_type].rnd_mode,
|
|
});
|
|
dst
|
|
}
|
|
nir_op_uadd_sat => {
|
|
let x = srcs[0].as_ssa().unwrap();
|
|
let y = srcs[1].as_ssa().unwrap();
|
|
let sum_lo = b.alloc_ssa(RegFile::GPR, 1);
|
|
let ovf_lo = b.alloc_ssa(RegFile::Pred, 1);
|
|
b.push_op(OpIAdd3 {
|
|
dst: sum_lo.into(),
|
|
overflow: [ovf_lo.into(), Dst::None],
|
|
srcs: [0.into(), x[0].into(), y[0].into()],
|
|
});
|
|
if alu.def.bit_size() == 64 {
|
|
let sum_hi = b.alloc_ssa(RegFile::GPR, 1);
|
|
let ovf_hi = b.alloc_ssa(RegFile::Pred, 1);
|
|
b.push_op(OpIAdd3X {
|
|
dst: sum_hi.into(),
|
|
overflow: [ovf_hi.into(), Dst::None],
|
|
srcs: [0.into(), x[1].into(), y[1].into()],
|
|
carry: [ovf_lo.into(), false.into()],
|
|
});
|
|
let lo =
|
|
b.sel(ovf_hi.into(), u32::MAX.into(), sum_lo.into());
|
|
let hi =
|
|
b.sel(ovf_hi.into(), u32::MAX.into(), sum_hi.into());
|
|
[lo[0], hi[0]].into()
|
|
} else {
|
|
assert!(alu.def.bit_size() == 32);
|
|
b.sel(ovf_lo.into(), u32::MAX.into(), sum_lo.into())
|
|
}
|
|
}
|
|
nir_op_usub_sat => {
|
|
let x = srcs[0].as_ssa().unwrap();
|
|
let y = srcs[1].as_ssa().unwrap();
|
|
let sum_lo = b.alloc_ssa(RegFile::GPR, 1);
|
|
let ovf_lo = b.alloc_ssa(RegFile::Pred, 1);
|
|
// The result of OpIAdd3X is the 33-bit value
|
|
//
|
|
// s|o = x + !y + 1
|
|
//
|
|
// The overflow bit of this result is true if and only if the
|
|
// subtract did NOT overflow.
|
|
b.push_op(OpIAdd3 {
|
|
dst: sum_lo.into(),
|
|
overflow: [ovf_lo.into(), Dst::None],
|
|
srcs: [0.into(), x[0].into(), Src::from(y[0]).ineg()],
|
|
});
|
|
if alu.def.bit_size() == 64 {
|
|
let sum_hi = b.alloc_ssa(RegFile::GPR, 1);
|
|
let ovf_hi = b.alloc_ssa(RegFile::Pred, 1);
|
|
b.push_op(OpIAdd3X {
|
|
dst: sum_hi.into(),
|
|
overflow: [ovf_hi.into(), Dst::None],
|
|
srcs: [0.into(), x[1].into(), Src::from(y[1]).bnot()],
|
|
carry: [ovf_lo.into(), false.into()],
|
|
});
|
|
let lo = b.sel(ovf_hi.into(), sum_lo.into(), 0.into());
|
|
let hi = b.sel(ovf_hi.into(), sum_hi.into(), 0.into());
|
|
[lo[0], hi[0]].into()
|
|
} else {
|
|
assert!(alu.def.bit_size() == 32);
|
|
b.sel(ovf_lo.into(), sum_lo.into(), 0.into())
|
|
}
|
|
}
|
|
nir_op_unpack_32_2x16_split_x => {
|
|
b.prmt(srcs[0], 0.into(), [0, 1, 4, 4])
|
|
}
|
|
nir_op_unpack_32_2x16_split_y => {
|
|
b.prmt(srcs[0], 0.into(), [2, 3, 4, 4])
|
|
}
|
|
nir_op_unpack_64_2x32_split_x => {
|
|
let src0_x = srcs[0].as_ssa().unwrap()[0];
|
|
b.copy(src0_x.into())
|
|
}
|
|
nir_op_unpack_64_2x32_split_y => {
|
|
let src0_y = srcs[0].as_ssa().unwrap()[1];
|
|
b.copy(src0_y.into())
|
|
}
|
|
nir_op_unpack_half_2x16_split_x
|
|
| nir_op_unpack_half_2x16_split_y => {
|
|
assert!(alu.def.bit_size() == 32);
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
|
|
b.push_op(OpF2F {
|
|
dst: dst[0].into(),
|
|
src: srcs[0],
|
|
src_type: FloatType::F16,
|
|
dst_type: FloatType::F32,
|
|
rnd_mode: FRndMode::NearestEven,
|
|
ftz: false,
|
|
high: alu.op == nir_op_unpack_half_2x16_split_y,
|
|
integer_rnd: false,
|
|
});
|
|
|
|
dst
|
|
}
|
|
nir_op_ushr => {
|
|
let x = *srcs[0].as_ssa().unwrap();
|
|
let shift = srcs[1];
|
|
if alu.def.bit_size() == 64 {
|
|
// For 64-bit shifts, we have to use clamp mode so we need
|
|
// to mask the shift in order satisfy NIR semantics.
|
|
let shift = b.lop2(LogicOp2::And, shift, 0x3f.into());
|
|
let dst = b.alloc_ssa(RegFile::GPR, 2);
|
|
b.push_op(OpShf {
|
|
dst: dst[0].into(),
|
|
low: x[0].into(),
|
|
high: x[1].into(),
|
|
shift: shift.into(),
|
|
right: true,
|
|
wrap: false,
|
|
data_type: IntType::U64,
|
|
dst_high: false,
|
|
});
|
|
b.push_op(OpShf {
|
|
dst: dst[1].into(),
|
|
low: x[0].into(),
|
|
high: x[1].into(),
|
|
shift: shift.into(),
|
|
right: true,
|
|
wrap: false,
|
|
data_type: IntType::U32,
|
|
dst_high: true,
|
|
});
|
|
dst
|
|
} else {
|
|
assert!(alu.def.bit_size() == 32);
|
|
b.shr(srcs[0], srcs[1], false)
|
|
}
|
|
}
|
|
nir_op_fddx | nir_op_fddx_coarse | nir_op_fddx_fine => {
|
|
// TODO: Real coarse derivatives
|
|
|
|
assert!(alu.def.bit_size() == 32);
|
|
let ftype = FloatType::F32;
|
|
let scratch = b.alloc_ssa(RegFile::GPR, 1);
|
|
|
|
b.push_op(OpShfl {
|
|
dst: scratch[0].into(),
|
|
in_bounds: Dst::None,
|
|
src: srcs[0],
|
|
lane: 1_u32.into(),
|
|
c: (0x3_u32 | 0x1c_u32 << 8).into(),
|
|
op: ShflOp::Bfly,
|
|
});
|
|
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
|
|
b.push_op(OpFSwzAdd {
|
|
dst: dst[0].into(),
|
|
srcs: [scratch[0].into(), srcs[0]],
|
|
ops: [
|
|
FSwzAddOp::SubLeft,
|
|
FSwzAddOp::SubRight,
|
|
FSwzAddOp::SubLeft,
|
|
FSwzAddOp::SubRight,
|
|
],
|
|
rnd_mode: self.float_ctl[ftype].rnd_mode,
|
|
ftz: self.float_ctl[ftype].ftz,
|
|
});
|
|
|
|
dst
|
|
}
|
|
nir_op_fddy | nir_op_fddy_coarse | nir_op_fddy_fine => {
|
|
// TODO: Real coarse derivatives
|
|
|
|
assert!(alu.def.bit_size() == 32);
|
|
let ftype = FloatType::F32;
|
|
let scratch = b.alloc_ssa(RegFile::GPR, 1);
|
|
|
|
b.push_op(OpShfl {
|
|
dst: scratch[0].into(),
|
|
in_bounds: Dst::None,
|
|
src: srcs[0],
|
|
lane: 2_u32.into(),
|
|
c: (0x3_u32 | 0x1c_u32 << 8).into(),
|
|
op: ShflOp::Bfly,
|
|
});
|
|
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
|
|
b.push_op(OpFSwzAdd {
|
|
dst: dst[0].into(),
|
|
srcs: [scratch[0].into(), srcs[0]],
|
|
ops: [
|
|
FSwzAddOp::SubLeft,
|
|
FSwzAddOp::SubLeft,
|
|
FSwzAddOp::SubRight,
|
|
FSwzAddOp::SubRight,
|
|
],
|
|
rnd_mode: self.float_ctl[ftype].rnd_mode,
|
|
ftz: self.float_ctl[ftype].ftz,
|
|
});
|
|
|
|
dst
|
|
}
|
|
_ => panic!("Unsupported ALU instruction: {}", alu.info().name()),
|
|
};
|
|
self.set_dst(&alu.def, dst);
|
|
}
|
|
|
|
// Lowers a NIR texture instruction to the corresponding NAK texture op
// (OpTex/OpTld/OpTld4/OpTxd/OpTmml/OpTxq).
//
// By the time we get here, a NIR lowering pass has already packed all of
// the texture sources into one or two backend vectors
// (nir_tex_src_backend1/2) and stashed per-op information in
// tex.backend_flags (a bit-cast nak_nir_tex_flags).
fn parse_tex(&mut self, b: &mut impl SSABuilder, tex: &nir_tex_instr) {
    // Translate the GLSL sampler dimension (+ arrayness) to a hardware
    // texture dimension.
    let dim = match tex.sampler_dim {
        GLSL_SAMPLER_DIM_1D => {
            if tex.is_array {
                TexDim::Array1D
            } else {
                TexDim::_1D
            }
        }
        GLSL_SAMPLER_DIM_2D => {
            if tex.is_array {
                TexDim::Array2D
            } else {
                TexDim::_2D
            }
        }
        GLSL_SAMPLER_DIM_3D => {
            assert!(!tex.is_array);
            TexDim::_3D
        }
        GLSL_SAMPLER_DIM_CUBE => {
            if tex.is_array {
                TexDim::ArrayCube
            } else {
                TexDim::Cube
            }
        }
        // Buffer textures are fetched as 1D.
        GLSL_SAMPLER_DIM_BUF => TexDim::_1D,
        // Multisampled images are addressed as 2D; the sample index is
        // part of the packed backend sources.
        GLSL_SAMPLER_DIM_MS => {
            if tex.is_array {
                TexDim::Array2D
            } else {
                TexDim::_2D
            }
        }
        _ => panic!("Unsupported texture dimension: {}", tex.sampler_dim),
    };

    // Only the pre-packed backend sources are expected at this point.
    let srcs = tex.srcs_as_slice();
    assert!(srcs[0].src_type == nir_tex_src_backend1);
    if srcs.len() > 1 {
        assert!(srcs.len() == 2);
        assert!(srcs[1].src_type == nir_tex_src_backend2);
    }

    // backend_flags carries a bit-packed nak_nir_tex_flags.
    let flags: nak_nir_tex_flags =
        unsafe { std::mem::transmute_copy(&tex.backend_flags) };

    // Only allocate registers for components NIR actually reads.  For
    // sparse ops, the last NIR component is the residency code which is
    // produced by the fault predicate below, not by the texture data
    // destination, so drop it from the hardware write mask.
    let mask = tex.def.components_read();
    let mut mask = u8::try_from(mask).unwrap();
    if flags.is_sparse() {
        mask &= !(1 << (tex.def.num_components - 1));
    }

    let dst_comps = u8::try_from(mask.count_ones()).unwrap();
    let dst = b.alloc_ssa(RegFile::GPR, dst_comps);

    // On Volta and later, the destination is split in two
    let mut dsts = [Dst::None; 2];
    if dst_comps > 2 && b.sm() >= 70 {
        dsts[0] = SSARef::try_from(&dst[0..2]).unwrap().into();
        dsts[1] = SSARef::try_from(&dst[2..]).unwrap().into();
    } else {
        dsts[0] = dst.into();
    }

    // Sparse ops additionally produce a fault predicate (set on a
    // residency failure).
    let fault = if flags.is_sparse() {
        b.alloc_ssa(RegFile::Pred, 1).into()
    } else {
        Dst::None
    };

    if tex.op == nir_texop_hdr_dim_nv {
        // Internal NAK query op: texture dimensions.
        let src = self.get_src(&srcs[0].src);
        assert!(fault.is_none());
        b.push_op(OpTxq {
            dsts: dsts,
            src: src,
            query: TexQuery::Dimension,
            mask: mask,
        });
    } else if tex.op == nir_texop_tex_type_nv {
        // Internal NAK query op: texture type.
        let src = self.get_src(&srcs[0].src);
        assert!(fault.is_none());
        b.push_op(OpTxq {
            dsts: dsts,
            src: src,
            query: TexQuery::TextureType,
            mask: mask,
        });
    } else {
        // Actual sampling/fetch ops: decode LOD and offset modes from
        // the backend flags.
        let lod_mode = match flags.lod_mode() {
            NAK_NIR_LOD_MODE_AUTO => TexLodMode::Auto,
            NAK_NIR_LOD_MODE_ZERO => TexLodMode::Zero,
            NAK_NIR_LOD_MODE_BIAS => TexLodMode::Bias,
            NAK_NIR_LOD_MODE_LOD => TexLodMode::Lod,
            NAK_NIR_LOD_MODE_CLAMP => TexLodMode::Clamp,
            NAK_NIR_LOD_MODE_BIAS_CLAMP => TexLodMode::BiasClamp,
            _ => panic!("Invalid LOD mode"),
        };

        let offset_mode = match flags.offset_mode() {
            NAK_NIR_OFFSET_MODE_NONE => Tld4OffsetMode::None,
            NAK_NIR_OFFSET_MODE_AOFFI => Tld4OffsetMode::AddOffI,
            // Per-pixel offsets are only valid for tg4 (OpTld4).
            NAK_NIR_OFFSET_MODE_PER_PX => Tld4OffsetMode::PerPx,
            _ => panic!("Invalid offset mode"),
        };

        // Both packed source vectors (backend2 may be an undef/empty
        // source when only one vector is needed).
        let srcs = [self.get_src(&srcs[0].src), self.get_src(&srcs[1].src)];

        if tex.op == nir_texop_txd {
            // Explicit-derivative sample.  Derivatives imply no extra
            // LOD control, and txd has no shadow-compare form.
            assert!(lod_mode == TexLodMode::Auto);
            assert!(offset_mode != Tld4OffsetMode::PerPx);
            assert!(!flags.has_z_cmpr());
            b.push_op(OpTxd {
                dsts: dsts,
                fault,
                srcs: srcs,
                dim: dim,
                offset: offset_mode == Tld4OffsetMode::AddOffI,
                mask: mask,
            });
        } else if tex.op == nir_texop_lod {
            // LOD query.
            assert!(offset_mode == Tld4OffsetMode::None);
            b.push_op(OpTmml {
                dsts: dsts,
                srcs: srcs,
                dim: dim,
                mask: mask,
            });
        } else if tex.op == nir_texop_txf || tex.op == nir_texop_txf_ms {
            // Texel fetch (optionally multisampled).
            assert!(offset_mode != Tld4OffsetMode::PerPx);
            b.push_op(OpTld {
                dsts: dsts,
                fault,
                srcs: srcs,
                dim: dim,
                lod_mode: lod_mode,
                is_ms: tex.op == nir_texop_txf_ms,
                offset: offset_mode == Tld4OffsetMode::AddOffI,
                mask: mask,
            });
        } else if tex.op == nir_texop_tg4 {
            // Gather4: fetches one component from four texels.
            b.push_op(OpTld4 {
                dsts: dsts,
                fault,
                srcs: srcs,
                dim: dim,
                comp: tex.component().try_into().unwrap(),
                offset_mode: offset_mode,
                z_cmpr: flags.has_z_cmpr(),
                mask: mask,
            });
        } else {
            // Everything else is a plain filtered sample.
            assert!(offset_mode != Tld4OffsetMode::PerPx);
            b.push_op(OpTex {
                dsts: dsts,
                fault,
                srcs: srcs,
                dim: dim,
                lod_mode: lod_mode,
                z_cmpr: flags.has_z_cmpr(),
                offset: offset_mode == Tld4OffsetMode::AddOffI,
                mask: mask,
            });
        }
    }

    // Re-expand the densely packed hardware destination back into the
    // full NIR component layout:
    //  - the sparse residency component becomes a 0/1 value selected
    //    from the fault predicate,
    //  - unread components are filled with zero,
    //  - read components come from the packed dst in order.
    let mut di = 0_usize;
    let mut nir_dst = Vec::new();
    for i in 0..tex.def.num_components() {
        if flags.is_sparse() && i == tex.def.num_components - 1 {
            let Dst::SSA(fault) = fault else {
                panic!("No fault value for sparse op");
            };
            nir_dst.push(b.sel(fault.into(), 0.into(), 1.into())[0]);
        } else if mask & (1 << i) == 0 {
            nir_dst.push(b.copy(0.into())[0]);
        } else {
            nir_dst.push(dst[di]);
            di += 1;
        }
    }
    self.set_ssa(tex.def.as_def(), nir_dst);
}
|
|
|
|
fn get_atomic_type(&self, intrin: &nir_intrinsic_instr) -> AtomType {
|
|
let bit_size = intrin.def.bit_size();
|
|
match intrin.atomic_op() {
|
|
nir_atomic_op_iadd => AtomType::U(bit_size),
|
|
nir_atomic_op_imin => AtomType::I(bit_size),
|
|
nir_atomic_op_umin => AtomType::U(bit_size),
|
|
nir_atomic_op_imax => AtomType::I(bit_size),
|
|
nir_atomic_op_umax => AtomType::U(bit_size),
|
|
nir_atomic_op_iand => AtomType::U(bit_size),
|
|
nir_atomic_op_ior => AtomType::U(bit_size),
|
|
nir_atomic_op_ixor => AtomType::U(bit_size),
|
|
nir_atomic_op_xchg => AtomType::U(bit_size),
|
|
nir_atomic_op_fadd => AtomType::F(bit_size),
|
|
nir_atomic_op_fmin => AtomType::F(bit_size),
|
|
nir_atomic_op_fmax => AtomType::F(bit_size),
|
|
nir_atomic_op_cmpxchg => AtomType::U(bit_size),
|
|
_ => panic!("Unsupported NIR atomic op"),
|
|
}
|
|
}
|
|
|
|
fn get_atomic_op(&self, intrin: &nir_intrinsic_instr) -> AtomOp {
|
|
match intrin.atomic_op() {
|
|
nir_atomic_op_iadd => AtomOp::Add,
|
|
nir_atomic_op_imin => AtomOp::Min,
|
|
nir_atomic_op_umin => AtomOp::Min,
|
|
nir_atomic_op_imax => AtomOp::Max,
|
|
nir_atomic_op_umax => AtomOp::Max,
|
|
nir_atomic_op_iand => AtomOp::And,
|
|
nir_atomic_op_ior => AtomOp::Or,
|
|
nir_atomic_op_ixor => AtomOp::Xor,
|
|
nir_atomic_op_xchg => AtomOp::Exch,
|
|
nir_atomic_op_fadd => AtomOp::Add,
|
|
nir_atomic_op_fmin => AtomOp::Min,
|
|
nir_atomic_op_fmax => AtomOp::Max,
|
|
nir_atomic_op_cmpxchg => AtomOp::CmpExch,
|
|
_ => panic!("Unsupported NIR atomic op"),
|
|
}
|
|
}
|
|
|
|
fn get_eviction_priority(
|
|
&mut self,
|
|
access: gl_access_qualifier,
|
|
) -> MemEvictionPriority {
|
|
if self.info.sm >= 70 && access & ACCESS_NON_TEMPORAL != 0 {
|
|
MemEvictionPriority::First
|
|
} else {
|
|
MemEvictionPriority::Normal
|
|
}
|
|
}
|
|
|
|
fn get_image_dim(&mut self, intrin: &nir_intrinsic_instr) -> ImageDim {
|
|
let is_array = intrin.image_array();
|
|
let image_dim = intrin.image_dim();
|
|
match intrin.image_dim() {
|
|
GLSL_SAMPLER_DIM_1D => {
|
|
if is_array {
|
|
ImageDim::_1DArray
|
|
} else {
|
|
ImageDim::_1D
|
|
}
|
|
}
|
|
GLSL_SAMPLER_DIM_2D => {
|
|
if is_array {
|
|
ImageDim::_2DArray
|
|
} else {
|
|
ImageDim::_2D
|
|
}
|
|
}
|
|
GLSL_SAMPLER_DIM_3D => {
|
|
assert!(!is_array);
|
|
ImageDim::_3D
|
|
}
|
|
GLSL_SAMPLER_DIM_CUBE => ImageDim::_2DArray,
|
|
GLSL_SAMPLER_DIM_BUF => {
|
|
assert!(!is_array);
|
|
ImageDim::_1DBuffer
|
|
}
|
|
_ => panic!("Unsupported image dimension: {}", image_dim),
|
|
}
|
|
}
|
|
|
|
fn get_image_coord(
|
|
&mut self,
|
|
intrin: &nir_intrinsic_instr,
|
|
dim: ImageDim,
|
|
) -> Src {
|
|
let vec = self.get_ssa(intrin.get_src(1).as_def());
|
|
// let sample = self.get_src(&srcs[2]);
|
|
let comps = usize::from(dim.coord_comps());
|
|
SSARef::try_from(&vec[0..comps]).unwrap().into()
|
|
}
|
|
|
|
fn parse_intrinsic(
|
|
&mut self,
|
|
b: &mut impl SSABuilder,
|
|
intrin: &nir_intrinsic_instr,
|
|
) {
|
|
let srcs = intrin.srcs_as_slice();
|
|
match intrin.intrinsic {
|
|
nir_intrinsic_al2p_nv => {
|
|
let offset = self.get_src(&srcs[0]);
|
|
let addr = u16::try_from(intrin.base()).unwrap();
|
|
|
|
let flags = intrin.flags();
|
|
let flags: nak_nir_attr_io_flags =
|
|
unsafe { std::mem::transmute_copy(&flags) };
|
|
|
|
let access = AttrAccess {
|
|
addr: addr,
|
|
comps: 1,
|
|
patch: flags.patch(),
|
|
output: flags.output(),
|
|
phys: false,
|
|
};
|
|
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpAL2P {
|
|
dst: dst.into(),
|
|
offset: offset,
|
|
access: access,
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_ald_nv | nir_intrinsic_ast_nv => {
|
|
let addr = u16::try_from(intrin.base()).unwrap();
|
|
let base = u16::try_from(intrin.range_base()).unwrap();
|
|
let range = u16::try_from(intrin.range()).unwrap();
|
|
let range = base..(base + range);
|
|
|
|
let flags = intrin.flags();
|
|
let flags: nak_nir_attr_io_flags =
|
|
unsafe { std::mem::transmute_copy(&flags) };
|
|
assert!(!flags.patch() || !flags.phys());
|
|
|
|
if let ShaderIoInfo::Vtg(io) = &mut self.info.io {
|
|
if flags.patch() {
|
|
match &mut self.info.stage {
|
|
ShaderStageInfo::TessellationInit(stage) => {
|
|
assert!(flags.output());
|
|
stage.per_patch_attribute_count = max(
|
|
stage.per_patch_attribute_count,
|
|
(range.end / 4).try_into().unwrap(),
|
|
);
|
|
}
|
|
ShaderStageInfo::Tessellation => (),
|
|
_ => panic!("Patch I/O not supported"),
|
|
}
|
|
} else {
|
|
if flags.output() {
|
|
if intrin.intrinsic == nir_intrinsic_ast_nv {
|
|
io.mark_store_req(range.clone());
|
|
}
|
|
io.mark_attrs_written(range);
|
|
} else {
|
|
io.mark_attrs_read(range);
|
|
}
|
|
}
|
|
} else {
|
|
panic!("Must be a VTG stage");
|
|
}
|
|
|
|
let access = AttrAccess {
|
|
addr: addr,
|
|
comps: intrin.num_components,
|
|
patch: flags.patch(),
|
|
output: flags.output(),
|
|
phys: flags.phys(),
|
|
};
|
|
|
|
if intrin.intrinsic == nir_intrinsic_ald_nv {
|
|
let vtx = self.get_src(&srcs[0]);
|
|
let offset = self.get_src(&srcs[1]);
|
|
|
|
assert!(intrin.def.bit_size() == 32);
|
|
let dst = b.alloc_ssa(RegFile::GPR, access.comps);
|
|
b.push_op(OpALd {
|
|
dst: dst.into(),
|
|
vtx: vtx,
|
|
offset: offset,
|
|
access: access,
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
} else if intrin.intrinsic == nir_intrinsic_ast_nv {
|
|
assert!(srcs[0].bit_size() == 32);
|
|
let data = self.get_src(&srcs[0]);
|
|
let vtx = self.get_src(&srcs[1]);
|
|
let offset = self.get_src(&srcs[2]);
|
|
|
|
b.push_op(OpASt {
|
|
data: data,
|
|
vtx: vtx,
|
|
offset: offset,
|
|
access: access,
|
|
});
|
|
} else {
|
|
panic!("Invalid VTG I/O intrinsic");
|
|
}
|
|
}
|
|
nir_intrinsic_ballot => {
|
|
assert!(srcs[0].bit_size() == 1);
|
|
let src = self.get_src(&srcs[0]);
|
|
|
|
assert!(intrin.def.bit_size() == 32);
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
|
|
b.push_op(OpVote {
|
|
op: VoteOp::Any,
|
|
ballot: dst.into(),
|
|
vote: Dst::None,
|
|
pred: src,
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_bar_break_nv => {
|
|
let src = self.get_src(&srcs[0]);
|
|
let bar_in = b.bmov_to_bar(src);
|
|
let cond = self.get_src(&srcs[1]);
|
|
|
|
let bar_out = b.alloc_ssa(RegFile::Bar, 1);
|
|
b.push_op(OpBreak {
|
|
bar_out: bar_out.into(),
|
|
bar_in: bar_in.into(),
|
|
cond: cond.into(),
|
|
});
|
|
|
|
self.set_dst(&intrin.def, b.bmov_to_gpr(bar_out.into()));
|
|
}
|
|
nir_intrinsic_bar_set_nv => {
|
|
let label = self.label_alloc.alloc();
|
|
let old = self.bar_label.insert(intrin.def.index, label);
|
|
assert!(old.is_none());
|
|
|
|
let bar_clear = b.alloc_ssa(RegFile::Bar, 1);
|
|
b.push_op(OpBClear {
|
|
dst: bar_clear.into(),
|
|
});
|
|
|
|
let bar_out = b.alloc_ssa(RegFile::Bar, 1);
|
|
b.push_op(OpBSSy {
|
|
bar_out: bar_out.into(),
|
|
bar_in: bar_clear.into(),
|
|
cond: SrcRef::True.into(),
|
|
target: label,
|
|
});
|
|
|
|
self.set_dst(&intrin.def, b.bmov_to_gpr(bar_out.into()));
|
|
}
|
|
nir_intrinsic_bar_sync_nv => {
|
|
let src = self.get_src(&srcs[0]);
|
|
|
|
let bar = b.bmov_to_bar(src);
|
|
b.push_op(OpBSync {
|
|
bar: bar.into(),
|
|
cond: SrcRef::True.into(),
|
|
});
|
|
|
|
let bar_set_idx = &srcs[1].as_def().index;
|
|
if let Some(label) = self.bar_label.get(bar_set_idx) {
|
|
b.push_op(OpNop {
|
|
label: Some(*label),
|
|
});
|
|
}
|
|
}
|
|
nir_intrinsic_bindless_image_atomic
|
|
| nir_intrinsic_bindless_image_atomic_swap => {
|
|
let handle = self.get_src(&srcs[0]);
|
|
let dim = self.get_image_dim(intrin);
|
|
let coord = self.get_image_coord(intrin, dim);
|
|
// let sample = self.get_src(&srcs[2]);
|
|
let atom_type = self.get_atomic_type(intrin);
|
|
let atom_op = self.get_atomic_op(intrin);
|
|
|
|
assert!(
|
|
intrin.def.bit_size() == 32 || intrin.def.bit_size() == 64
|
|
);
|
|
assert!(intrin.def.num_components() == 1);
|
|
let dst = b.alloc_ssa(RegFile::GPR, intrin.def.bit_size() / 32);
|
|
|
|
let data = if intrin.intrinsic
|
|
== nir_intrinsic_bindless_image_atomic_swap
|
|
{
|
|
if intrin.def.bit_size() == 64 {
|
|
SSARef::from([
|
|
self.get_ssa(srcs[3].as_def())[0],
|
|
self.get_ssa(srcs[3].as_def())[1],
|
|
self.get_ssa(srcs[4].as_def())[0],
|
|
self.get_ssa(srcs[4].as_def())[1],
|
|
])
|
|
.into()
|
|
} else {
|
|
SSARef::from([
|
|
self.get_ssa(srcs[3].as_def())[0],
|
|
self.get_ssa(srcs[4].as_def())[0],
|
|
])
|
|
.into()
|
|
}
|
|
} else {
|
|
self.get_src(&srcs[3])
|
|
};
|
|
|
|
b.push_op(OpSuAtom {
|
|
dst: dst.into(),
|
|
fault: Dst::None,
|
|
handle: handle,
|
|
coord: coord,
|
|
data: data,
|
|
atom_op: atom_op,
|
|
atom_type: atom_type,
|
|
image_dim: dim,
|
|
mem_order: MemOrder::Strong(MemScope::System),
|
|
mem_eviction_priority: self
|
|
.get_eviction_priority(intrin.access()),
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_bindless_image_load => {
|
|
let handle = self.get_src(&srcs[0]);
|
|
let dim = self.get_image_dim(intrin);
|
|
let coord = self.get_image_coord(intrin, dim);
|
|
// let sample = self.get_src(&srcs[2]);
|
|
|
|
let comps = intrin.num_components;
|
|
assert!(intrin.def.bit_size() == 32);
|
|
assert!(comps == 1 || comps == 2 || comps == 4);
|
|
|
|
let dst = b.alloc_ssa(RegFile::GPR, comps);
|
|
|
|
b.push_op(OpSuLd {
|
|
dst: dst.into(),
|
|
fault: Dst::None,
|
|
image_dim: dim,
|
|
mem_order: MemOrder::Strong(MemScope::System),
|
|
mem_eviction_priority: self
|
|
.get_eviction_priority(intrin.access()),
|
|
mask: (1 << comps) - 1,
|
|
handle: handle,
|
|
coord: coord,
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_bindless_image_sparse_load => {
|
|
let handle = self.get_src(&srcs[0]);
|
|
let dim = self.get_image_dim(intrin);
|
|
let coord = self.get_image_coord(intrin, dim);
|
|
// let sample = self.get_src(&srcs[2]);
|
|
|
|
let comps = intrin.num_components;
|
|
assert!(intrin.def.bit_size() == 32);
|
|
assert!(comps == 5);
|
|
|
|
let dst = b.alloc_ssa(RegFile::GPR, comps - 1);
|
|
let fault = b.alloc_ssa(RegFile::Pred, 1);
|
|
|
|
b.push_op(OpSuLd {
|
|
dst: dst.into(),
|
|
fault: fault.into(),
|
|
image_dim: dim,
|
|
mem_order: MemOrder::Strong(MemScope::System),
|
|
mem_eviction_priority: self
|
|
.get_eviction_priority(intrin.access()),
|
|
mask: (1 << (comps - 1)) - 1,
|
|
handle: handle,
|
|
coord: coord,
|
|
});
|
|
|
|
let mut final_dst = Vec::new();
|
|
for i in 0..usize::from(comps) - 1 {
|
|
final_dst.push(dst[i]);
|
|
}
|
|
final_dst.push(b.sel(fault.into(), 0.into(), 1.into())[0]);
|
|
|
|
self.set_ssa(&intrin.def, final_dst);
|
|
}
|
|
nir_intrinsic_bindless_image_store => {
|
|
let handle = self.get_src(&srcs[0]);
|
|
let dim = self.get_image_dim(intrin);
|
|
let coord = self.get_image_coord(intrin, dim);
|
|
// let sample = self.get_src(&srcs[2]);
|
|
let data = self.get_src(&srcs[3]);
|
|
|
|
let comps = intrin.num_components;
|
|
assert!(srcs[3].bit_size() == 32);
|
|
assert!(comps == 1 || comps == 2 || comps == 4);
|
|
|
|
b.push_op(OpSuSt {
|
|
image_dim: dim,
|
|
mem_order: MemOrder::Strong(MemScope::System),
|
|
mem_eviction_priority: self
|
|
.get_eviction_priority(intrin.access()),
|
|
mask: (1 << comps) - 1,
|
|
handle: handle,
|
|
coord: coord,
|
|
data: data,
|
|
});
|
|
}
|
|
nir_intrinsic_copy_fs_outputs_nv => {
|
|
let ShaderIoInfo::Fragment(info) = &mut self.info.io else {
|
|
panic!(
|
|
"copy_fs_outputs_nv is only allowed in fragment shaders"
|
|
);
|
|
};
|
|
|
|
for i in 0..32 {
|
|
if !self.fs_out_regs[i].is_none() {
|
|
info.writes_color |= 1 << i;
|
|
}
|
|
}
|
|
let mask_idx = (NAK_FS_OUT_SAMPLE_MASK / 4) as usize;
|
|
info.writes_sample_mask = !self.fs_out_regs[mask_idx].is_none();
|
|
let depth_idx = (NAK_FS_OUT_DEPTH / 4) as usize;
|
|
info.writes_depth = !self.fs_out_regs[depth_idx].is_none();
|
|
|
|
let mut srcs = Vec::new();
|
|
for i in 0..8 {
|
|
// Even though the mask is per-component, the actual output
|
|
// space is per-output vec4s.
|
|
if info.writes_color & (0xf << (i * 4)) != 0 {
|
|
for c in 0..4 {
|
|
let reg = self.fs_out_regs[i * 4 + c];
|
|
if reg.is_none() {
|
|
srcs.push(b.undef().into());
|
|
} else {
|
|
srcs.push(reg.into());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// These always come together for some reason
|
|
if info.writes_sample_mask || info.writes_depth {
|
|
if info.writes_sample_mask {
|
|
srcs.push(self.fs_out_regs[mask_idx].into());
|
|
} else {
|
|
srcs.push(b.undef().into());
|
|
}
|
|
if info.writes_depth {
|
|
srcs.push(self.fs_out_regs[depth_idx].into());
|
|
}
|
|
}
|
|
|
|
b.push_op(OpFSOut { srcs: srcs });
|
|
}
|
|
nir_intrinsic_demote | nir_intrinsic_discard => {
|
|
if let ShaderIoInfo::Fragment(info) = &mut self.info.io {
|
|
info.uses_kill = true;
|
|
} else {
|
|
panic!("OpKill is only available in fragment shaders");
|
|
}
|
|
b.push_op(OpKill {});
|
|
}
|
|
nir_intrinsic_demote_if | nir_intrinsic_discard_if => {
|
|
if let ShaderIoInfo::Fragment(info) = &mut self.info.io {
|
|
info.uses_kill = true;
|
|
} else {
|
|
panic!("OpKill is only available in fragment shaders");
|
|
}
|
|
let cond = self.get_ssa(srcs[0].as_def())[0];
|
|
b.predicate(cond.into()).push_op(OpKill {});
|
|
}
|
|
nir_intrinsic_global_atomic => {
|
|
let bit_size = intrin.def.bit_size();
|
|
let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
|
|
let data = self.get_src(&srcs[1]);
|
|
let atom_type = self.get_atomic_type(intrin);
|
|
let atom_op = self.get_atomic_op(intrin);
|
|
|
|
assert!(intrin.def.num_components() == 1);
|
|
let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));
|
|
|
|
b.push_op(OpAtom {
|
|
dst: dst.into(),
|
|
addr: addr,
|
|
cmpr: 0.into(),
|
|
data: data,
|
|
atom_op: atom_op,
|
|
atom_type: atom_type,
|
|
addr_offset: offset,
|
|
mem_space: MemSpace::Global(MemAddrType::A64),
|
|
mem_order: MemOrder::Strong(MemScope::System),
|
|
mem_eviction_priority: MemEvictionPriority::Normal, // Note: no intrinic access
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_global_atomic_swap => {
|
|
assert!(intrin.atomic_op() == nir_atomic_op_cmpxchg);
|
|
let bit_size = intrin.def.bit_size();
|
|
let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
|
|
let cmpr = self.get_src(&srcs[1]);
|
|
let data = self.get_src(&srcs[2]);
|
|
let atom_type = AtomType::U(bit_size);
|
|
|
|
assert!(intrin.def.num_components() == 1);
|
|
let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));
|
|
|
|
b.push_op(OpAtom {
|
|
dst: dst.into(),
|
|
addr: addr,
|
|
cmpr: cmpr,
|
|
data: data,
|
|
atom_op: AtomOp::CmpExch,
|
|
atom_type: atom_type,
|
|
addr_offset: offset,
|
|
mem_space: MemSpace::Global(MemAddrType::A64),
|
|
mem_order: MemOrder::Strong(MemScope::System),
|
|
mem_eviction_priority: MemEvictionPriority::Normal, // Note: no intrinic access
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_ipa_nv => {
|
|
let addr = u16::try_from(intrin.base()).unwrap();
|
|
|
|
let flags = intrin.flags();
|
|
let flags: nak_nir_ipa_flags =
|
|
unsafe { std::mem::transmute_copy(&flags) };
|
|
|
|
let mode = match flags.interp_mode() {
|
|
NAK_INTERP_MODE_PERSPECTIVE => PixelImap::Perspective,
|
|
NAK_INTERP_MODE_SCREEN_LINEAR => PixelImap::ScreenLinear,
|
|
NAK_INTERP_MODE_CONSTANT => PixelImap::Constant,
|
|
_ => panic!("Unsupported interp mode"),
|
|
};
|
|
|
|
let freq = match flags.interp_freq() {
|
|
NAK_INTERP_FREQ_PASS => InterpFreq::Pass,
|
|
NAK_INTERP_FREQ_PASS_MUL_W => InterpFreq::PassMulW,
|
|
NAK_INTERP_FREQ_CONSTANT => InterpFreq::Constant,
|
|
NAK_INTERP_FREQ_STATE => InterpFreq::State,
|
|
_ => panic!("Invalid interp freq"),
|
|
};
|
|
|
|
let loc = match flags.interp_loc() {
|
|
NAK_INTERP_LOC_DEFAULT => InterpLoc::Default,
|
|
NAK_INTERP_LOC_CENTROID => InterpLoc::Centroid,
|
|
NAK_INTERP_LOC_OFFSET => InterpLoc::Offset,
|
|
_ => panic!("Invalid interp loc"),
|
|
};
|
|
|
|
let inv_w = if freq == InterpFreq::PassMulW {
|
|
self.get_src(&srcs[0])
|
|
} else {
|
|
0.into()
|
|
};
|
|
|
|
let offset = if loc == InterpLoc::Offset {
|
|
self.get_src(&srcs[1])
|
|
} else {
|
|
0.into()
|
|
};
|
|
|
|
let ShaderIoInfo::Fragment(io) = &mut self.info.io else {
|
|
panic!("OpIpa is only used for fragment shaders");
|
|
};
|
|
|
|
io.mark_attr_read(addr, mode);
|
|
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpIpa {
|
|
dst: dst.into(),
|
|
addr: addr,
|
|
freq: freq,
|
|
loc: loc,
|
|
inv_w: inv_w,
|
|
offset: offset,
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_isberd_nv => {
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpIsberd {
|
|
dst: dst.into(),
|
|
idx: self.get_src(&srcs[0]),
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_load_barycentric_at_offset_nv => (),
|
|
nir_intrinsic_load_barycentric_centroid => (),
|
|
nir_intrinsic_load_barycentric_pixel => (),
|
|
nir_intrinsic_load_barycentric_sample => (),
|
|
nir_intrinsic_load_global | nir_intrinsic_load_global_constant => {
|
|
let size_B =
|
|
(intrin.def.bit_size() / 8) * intrin.def.num_components();
|
|
assert!(u32::from(size_B) <= intrin.align());
|
|
let order =
|
|
if intrin.intrinsic == nir_intrinsic_load_global_constant {
|
|
MemOrder::Constant
|
|
} else {
|
|
MemOrder::Strong(MemScope::System)
|
|
};
|
|
let access = MemAccess {
|
|
mem_type: MemType::from_size(size_B, false),
|
|
space: MemSpace::Global(MemAddrType::A64),
|
|
order: order,
|
|
eviction_priority: self
|
|
.get_eviction_priority(intrin.access()),
|
|
};
|
|
let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
|
|
let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));
|
|
|
|
b.push_op(OpLd {
|
|
dst: dst.into(),
|
|
addr: addr,
|
|
offset: offset,
|
|
access: access,
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_ldtram_nv => {
|
|
let ShaderIoInfo::Fragment(io) = &mut self.info.io else {
|
|
panic!("ldtram_nv is only used for fragment shaders");
|
|
};
|
|
|
|
assert!(
|
|
intrin.def.bit_size() == 32
|
|
&& intrin.def.num_components == 2
|
|
);
|
|
|
|
let flags = intrin.flags();
|
|
let use_c = flags != 0;
|
|
|
|
let addr = u16::try_from(intrin.base()).unwrap();
|
|
|
|
io.mark_barycentric_attr_in(addr);
|
|
|
|
let dst = b.alloc_ssa(RegFile::GPR, 2);
|
|
b.push_op(OpLdTram {
|
|
dst: dst.into(),
|
|
addr,
|
|
use_c,
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_load_sample_id => {
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpPixLd {
|
|
dst: dst.into(),
|
|
val: PixVal::MyIndex,
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_load_sample_mask_in => {
|
|
if let ShaderIoInfo::Fragment(info) = &mut self.info.io {
|
|
info.reads_sample_mask = true;
|
|
} else {
|
|
panic!(
|
|
"sample_mask_in is only available in fragment shaders"
|
|
);
|
|
}
|
|
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpPixLd {
|
|
dst: dst.into(),
|
|
val: PixVal::CovMask,
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_load_tess_coord_xy => {
|
|
// Loading gl_TessCoord in tessellation evaluation shaders is
|
|
// weird. It's treated as a per-vertex output which is indexed
|
|
// by LANEID.
|
|
match &self.info.stage {
|
|
ShaderStageInfo::Tessellation => (),
|
|
_ => panic!(
|
|
"load_tess_coord is only available in tessellation \
|
|
shaders"
|
|
),
|
|
};
|
|
|
|
assert!(intrin.def.bit_size() == 32);
|
|
assert!(intrin.def.num_components() == 2);
|
|
|
|
let vtx = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpS2R {
|
|
dst: vtx.into(),
|
|
idx: 0,
|
|
});
|
|
|
|
let access = AttrAccess {
|
|
addr: NAK_ATTR_TESS_COORD,
|
|
comps: 2,
|
|
patch: false,
|
|
output: true,
|
|
phys: false,
|
|
};
|
|
|
|
// This is recorded as a patch output in parse_shader() because
|
|
// the hardware requires it be in the SPH, whether we use it or
|
|
// not.
|
|
|
|
let dst = b.alloc_ssa(RegFile::GPR, access.comps);
|
|
b.push_op(OpALd {
|
|
dst: dst.into(),
|
|
vtx: vtx.into(),
|
|
offset: 0.into(),
|
|
access: access,
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_load_scratch => {
|
|
let size_B =
|
|
(intrin.def.bit_size() / 8) * intrin.def.num_components();
|
|
assert!(u32::from(size_B) <= intrin.align());
|
|
let access = MemAccess {
|
|
mem_type: MemType::from_size(size_B, false),
|
|
space: MemSpace::Local,
|
|
order: MemOrder::Strong(MemScope::CTA),
|
|
eviction_priority: MemEvictionPriority::Normal,
|
|
};
|
|
let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
|
|
let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));
|
|
|
|
b.push_op(OpLd {
|
|
dst: dst.into(),
|
|
addr: addr,
|
|
offset: offset,
|
|
access: access,
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_load_shared => {
|
|
let size_B =
|
|
(intrin.def.bit_size() / 8) * intrin.def.num_components();
|
|
assert!(u32::from(size_B) <= intrin.align());
|
|
let access = MemAccess {
|
|
mem_type: MemType::from_size(size_B, false),
|
|
space: MemSpace::Shared,
|
|
order: MemOrder::Strong(MemScope::CTA),
|
|
eviction_priority: MemEvictionPriority::Normal,
|
|
};
|
|
let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
|
|
let offset = offset + intrin.base();
|
|
let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));
|
|
|
|
b.push_op(OpLd {
|
|
dst: dst.into(),
|
|
addr: addr,
|
|
offset: offset,
|
|
access: access,
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_load_sysval_nv => {
|
|
let idx = u8::try_from(intrin.base()).unwrap();
|
|
debug_assert!(intrin.def.num_components == 1);
|
|
debug_assert!(
|
|
intrin.def.bit_size == 32 || intrin.def.bit_size == 64
|
|
);
|
|
let comps = intrin.def.bit_size / 32;
|
|
let dst = b.alloc_ssa(RegFile::GPR, comps);
|
|
if idx == NAK_SV_CLOCK || idx == NAK_SV_CLOCK + 1 {
|
|
debug_assert!(idx + comps <= NAK_SV_CLOCK + 2);
|
|
b.push_op(OpCS2R {
|
|
dst: dst.into(),
|
|
idx: idx,
|
|
});
|
|
} else {
|
|
debug_assert!(intrin.def.bit_size == 32);
|
|
b.push_op(OpS2R {
|
|
dst: dst.into(),
|
|
idx: idx,
|
|
});
|
|
}
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_load_ubo => {
|
|
let size_B =
|
|
(intrin.def.bit_size() / 8) * intrin.def.num_components();
|
|
let idx = &srcs[0];
|
|
|
|
let (off, off_imm) = self.get_io_addr_offset(&srcs[1], 16);
|
|
let (off, off_imm) =
|
|
if let Ok(off_imm_u16) = u16::try_from(off_imm) {
|
|
(off, off_imm_u16)
|
|
} else {
|
|
(self.get_src(&srcs[1]), 0)
|
|
};
|
|
|
|
let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));
|
|
|
|
if let Some(idx_imm) = idx.as_uint() {
|
|
let idx_imm: u8 = idx_imm.try_into().unwrap();
|
|
let cb = CBufRef {
|
|
buf: CBuf::Binding(idx_imm),
|
|
offset: off_imm,
|
|
};
|
|
if off.is_zero() {
|
|
for (i, comp) in dst.iter().enumerate() {
|
|
let i = u16::try_from(i).unwrap();
|
|
b.copy_to((*comp).into(), cb.offset(i * 4).into());
|
|
}
|
|
} else {
|
|
b.push_op(OpLdc {
|
|
dst: dst.into(),
|
|
cb: cb.into(),
|
|
offset: off,
|
|
mode: LdcMode::Indexed,
|
|
mem_type: MemType::from_size(size_B, false),
|
|
});
|
|
}
|
|
} else {
|
|
// In the IndexedSegmented mode, the hardware computes the
|
|
// actual index and offset as follows:
|
|
//
|
|
// idx = imm_idx + reg[31:16]
|
|
// offset = imm_offset + reg[15:0]
|
|
// ldc c[idx][offset]
|
|
//
|
|
// So pack the index and offset accordingly
|
|
let idx = self.get_src(idx);
|
|
let off_idx = b.prmt(off, idx, [0, 1, 4, 5]);
|
|
let cb = CBufRef {
|
|
buf: CBuf::Binding(0),
|
|
offset: off_imm,
|
|
};
|
|
b.push_op(OpLdc {
|
|
dst: dst.into(),
|
|
cb: cb.into(),
|
|
offset: off_idx.into(),
|
|
mode: LdcMode::IndexedSegmented,
|
|
mem_type: MemType::from_size(size_B, false),
|
|
});
|
|
}
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_barrier => {
|
|
let modes = intrin.memory_modes();
|
|
let semantics = intrin.memory_semantics();
|
|
if (modes & nir_var_mem_global) != 0
|
|
&& (semantics & NIR_MEMORY_RELEASE) != 0
|
|
{
|
|
b.push_op(OpCCtl {
|
|
op: CCtlOp::WBAll,
|
|
mem_space: MemSpace::Global(MemAddrType::A64),
|
|
addr: 0.into(),
|
|
addr_offset: 0,
|
|
});
|
|
}
|
|
match intrin.execution_scope() {
|
|
SCOPE_NONE => (),
|
|
SCOPE_WORKGROUP => {
|
|
assert!(
|
|
self.nir.info.stage() == MESA_SHADER_COMPUTE
|
|
|| self.nir.info.stage() == MESA_SHADER_KERNEL
|
|
);
|
|
self.info.num_barriers = 1;
|
|
b.push_op(OpBar {});
|
|
}
|
|
_ => panic!("Unhandled execution scope"),
|
|
}
|
|
if intrin.memory_scope() != SCOPE_NONE {
|
|
let mem_scope = match intrin.memory_scope() {
|
|
SCOPE_INVOCATION | SCOPE_SUBGROUP => MemScope::CTA,
|
|
SCOPE_WORKGROUP | SCOPE_QUEUE_FAMILY | SCOPE_DEVICE => {
|
|
MemScope::GPU
|
|
}
|
|
_ => panic!("Unhandled memory scope"),
|
|
};
|
|
b.push_op(OpMemBar { scope: mem_scope });
|
|
}
|
|
if (modes & nir_var_mem_global) != 0
|
|
&& (semantics & NIR_MEMORY_ACQUIRE) != 0
|
|
{
|
|
b.push_op(OpCCtl {
|
|
op: CCtlOp::IVAll,
|
|
mem_space: MemSpace::Global(MemAddrType::A64),
|
|
addr: 0.into(),
|
|
addr_offset: 0,
|
|
});
|
|
}
|
|
}
|
|
nir_intrinsic_quad_broadcast
|
|
| nir_intrinsic_read_invocation
|
|
| nir_intrinsic_shuffle
|
|
| nir_intrinsic_shuffle_down
|
|
| nir_intrinsic_shuffle_up
|
|
| nir_intrinsic_shuffle_xor => {
|
|
assert!(srcs[0].bit_size() == 32);
|
|
assert!(srcs[0].num_components() == 1);
|
|
let data = self.get_src(&srcs[0]);
|
|
|
|
assert!(srcs[1].bit_size() == 32);
|
|
let idx = self.get_src(&srcs[1]);
|
|
|
|
assert!(intrin.def.bit_size() == 32);
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
|
|
b.push_op(OpShfl {
|
|
dst: dst.into(),
|
|
in_bounds: Dst::None,
|
|
src: data,
|
|
lane: idx,
|
|
c: match intrin.intrinsic {
|
|
nir_intrinsic_quad_broadcast => 0x1c_03.into(),
|
|
nir_intrinsic_shuffle_up => 0.into(),
|
|
_ => 0x1f.into(),
|
|
},
|
|
op: match intrin.intrinsic {
|
|
nir_intrinsic_shuffle_down => ShflOp::Down,
|
|
nir_intrinsic_shuffle_up => ShflOp::Up,
|
|
nir_intrinsic_shuffle_xor => ShflOp::Bfly,
|
|
_ => ShflOp::Idx,
|
|
},
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_quad_swap_horizontal
|
|
| nir_intrinsic_quad_swap_vertical
|
|
| nir_intrinsic_quad_swap_diagonal => {
|
|
assert!(srcs[0].bit_size() == 32);
|
|
assert!(srcs[0].num_components() == 1);
|
|
let data = self.get_src(&srcs[0]);
|
|
|
|
assert!(intrin.def.bit_size() == 32);
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
b.push_op(OpShfl {
|
|
dst: dst.into(),
|
|
in_bounds: Dst::None,
|
|
src: data,
|
|
lane: match intrin.intrinsic {
|
|
nir_intrinsic_quad_swap_horizontal => 1_u32.into(),
|
|
nir_intrinsic_quad_swap_vertical => 2_u32.into(),
|
|
nir_intrinsic_quad_swap_diagonal => 3_u32.into(),
|
|
op => panic!("Unknown quad intrinsic {}", op),
|
|
},
|
|
c: 0x1c_03.into(),
|
|
op: ShflOp::Bfly,
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_shared_atomic => {
|
|
let bit_size = intrin.def.bit_size();
|
|
let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
|
|
let data = self.get_src(&srcs[1]);
|
|
let atom_type = self.get_atomic_type(intrin);
|
|
let atom_op = self.get_atomic_op(intrin);
|
|
|
|
assert!(intrin.def.num_components() == 1);
|
|
let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));
|
|
|
|
b.push_op(OpAtom {
|
|
dst: dst.into(),
|
|
addr: addr,
|
|
cmpr: 0.into(),
|
|
data: data,
|
|
atom_op: atom_op,
|
|
atom_type: atom_type,
|
|
addr_offset: offset,
|
|
mem_space: MemSpace::Shared,
|
|
mem_order: MemOrder::Strong(MemScope::CTA),
|
|
mem_eviction_priority: MemEvictionPriority::Normal,
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_shared_atomic_swap => {
|
|
assert!(intrin.atomic_op() == nir_atomic_op_cmpxchg);
|
|
let bit_size = intrin.def.bit_size();
|
|
let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
|
|
let cmpr = self.get_src(&srcs[1]);
|
|
let data = self.get_src(&srcs[2]);
|
|
let atom_type = AtomType::U(bit_size);
|
|
|
|
assert!(intrin.def.num_components() == 1);
|
|
let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));
|
|
|
|
b.push_op(OpAtom {
|
|
dst: dst.into(),
|
|
addr: addr,
|
|
cmpr: cmpr,
|
|
data: data,
|
|
atom_op: AtomOp::CmpExch,
|
|
atom_type: atom_type,
|
|
addr_offset: offset,
|
|
mem_space: MemSpace::Shared,
|
|
mem_order: MemOrder::Strong(MemScope::CTA),
|
|
mem_eviction_priority: MemEvictionPriority::Normal,
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_ssa_bar_nv => {
|
|
let src = self.get_src(&srcs[0]);
|
|
b.push_op(OpSrcBar { src });
|
|
}
|
|
nir_intrinsic_store_global => {
|
|
let data = self.get_src(&srcs[0]);
|
|
let size_B =
|
|
(srcs[0].bit_size() / 8) * srcs[0].num_components();
|
|
assert!(u32::from(size_B) <= intrin.align());
|
|
let access = MemAccess {
|
|
mem_type: MemType::from_size(size_B, false),
|
|
space: MemSpace::Global(MemAddrType::A64),
|
|
order: MemOrder::Strong(MemScope::System),
|
|
eviction_priority: self
|
|
.get_eviction_priority(intrin.access()),
|
|
};
|
|
let (addr, offset) = self.get_io_addr_offset(&srcs[1], 24);
|
|
|
|
b.push_op(OpSt {
|
|
addr: addr,
|
|
data: data,
|
|
offset: offset,
|
|
access: access,
|
|
});
|
|
}
|
|
nir_intrinsic_fs_out_nv => {
|
|
let data = self.get_ssa(srcs[0].as_def());
|
|
assert!(data.len() == 1);
|
|
let data = data[0];
|
|
|
|
let addr = u16::try_from(intrin.base()).unwrap();
|
|
assert!(addr % 4 == 0);
|
|
|
|
self.fs_out_regs[usize::from(addr / 4)] = data;
|
|
}
|
|
nir_intrinsic_store_scratch => {
|
|
let data = self.get_src(&srcs[0]);
|
|
let size_B =
|
|
(srcs[0].bit_size() / 8) * srcs[0].num_components();
|
|
assert!(u32::from(size_B) <= intrin.align());
|
|
let access = MemAccess {
|
|
mem_type: MemType::from_size(size_B, false),
|
|
space: MemSpace::Local,
|
|
order: MemOrder::Strong(MemScope::CTA),
|
|
eviction_priority: MemEvictionPriority::Normal,
|
|
};
|
|
let (addr, offset) = self.get_io_addr_offset(&srcs[1], 24);
|
|
|
|
b.push_op(OpSt {
|
|
addr: addr,
|
|
data: data,
|
|
offset: offset,
|
|
access: access,
|
|
});
|
|
}
|
|
nir_intrinsic_store_shared => {
|
|
let data = self.get_src(&srcs[0]);
|
|
let size_B =
|
|
(srcs[0].bit_size() / 8) * srcs[0].num_components();
|
|
assert!(u32::from(size_B) <= intrin.align());
|
|
let access = MemAccess {
|
|
mem_type: MemType::from_size(size_B, false),
|
|
space: MemSpace::Shared,
|
|
order: MemOrder::Strong(MemScope::CTA),
|
|
eviction_priority: MemEvictionPriority::Normal,
|
|
};
|
|
let (addr, offset) = self.get_io_addr_offset(&srcs[1], 24);
|
|
let offset = offset + intrin.base();
|
|
|
|
b.push_op(OpSt {
|
|
addr: addr,
|
|
data: data,
|
|
offset: offset,
|
|
access: access,
|
|
});
|
|
}
|
|
nir_intrinsic_emit_vertex_nv | nir_intrinsic_end_primitive_nv => {
|
|
assert!(intrin.def.bit_size() == 32);
|
|
assert!(intrin.def.num_components() == 1);
|
|
|
|
let dst = b.alloc_ssa(RegFile::GPR, 1);
|
|
let handle = self.get_src(&srcs[0]);
|
|
let stream_id = intrin.stream_id();
|
|
|
|
b.push_op(OpOut {
|
|
dst: dst.into(),
|
|
handle: handle,
|
|
stream: stream_id.into(),
|
|
out_type: if intrin.intrinsic
|
|
== nir_intrinsic_emit_vertex_nv
|
|
{
|
|
OutType::Emit
|
|
} else {
|
|
OutType::Cut
|
|
},
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
|
|
nir_intrinsic_final_primitive_nv => {
|
|
let handle = self.get_src(&srcs[0]);
|
|
|
|
if self.info.sm >= 70 {
|
|
b.push_op(OpOutFinal { handle: handle });
|
|
}
|
|
}
|
|
nir_intrinsic_vote_all
|
|
| nir_intrinsic_vote_any
|
|
| nir_intrinsic_vote_ieq => {
|
|
assert!(srcs[0].bit_size() == 1);
|
|
let src = self.get_src(&srcs[0]);
|
|
|
|
assert!(intrin.def.bit_size() == 1);
|
|
let dst = b.alloc_ssa(RegFile::Pred, 1);
|
|
|
|
b.push_op(OpVote {
|
|
op: match intrin.intrinsic {
|
|
nir_intrinsic_vote_all => VoteOp::All,
|
|
nir_intrinsic_vote_any => VoteOp::Any,
|
|
nir_intrinsic_vote_ieq => VoteOp::Eq,
|
|
_ => panic!("Unknown vote intrinsic"),
|
|
},
|
|
ballot: Dst::None,
|
|
vote: dst.into(),
|
|
pred: src,
|
|
});
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
nir_intrinsic_is_sparse_texels_resident => {
|
|
let src = self.get_src(&srcs[0]);
|
|
let dst = b.isetp(IntCmpType::I32, IntCmpOp::Ne, src, 0.into());
|
|
self.set_dst(&intrin.def, dst);
|
|
}
|
|
_ => panic!(
|
|
"Unsupported intrinsic instruction: {}",
|
|
intrin.info().name()
|
|
),
|
|
}
|
|
}
|
|
|
|
fn parse_load_const(
|
|
&mut self,
|
|
b: &mut impl SSABuilder,
|
|
load_const: &nir_load_const_instr,
|
|
) {
|
|
let values = &load_const.values();
|
|
|
|
let mut dst = Vec::new();
|
|
match load_const.def.bit_size {
|
|
1 => {
|
|
for c in 0..load_const.def.num_components {
|
|
let imm_b1 = unsafe { values[usize::from(c)].b };
|
|
dst.push(b.copy(imm_b1.into())[0]);
|
|
}
|
|
}
|
|
8 => {
|
|
for dw in 0..load_const.def.num_components.div_ceil(4) {
|
|
let mut imm_u32 = 0;
|
|
for b in 0..4 {
|
|
let c = dw * 4 + b;
|
|
if c < load_const.def.num_components {
|
|
let imm_u8 = unsafe { values[usize::from(c)].u8_ };
|
|
imm_u32 |= u32::from(imm_u8) << b * 8;
|
|
}
|
|
}
|
|
dst.push(b.copy(imm_u32.into())[0]);
|
|
}
|
|
}
|
|
16 => {
|
|
for dw in 0..load_const.def.num_components.div_ceil(2) {
|
|
let mut imm_u32 = 0;
|
|
for w in 0..2 {
|
|
let c = dw * 2 + w;
|
|
if c < load_const.def.num_components {
|
|
let imm_u16 =
|
|
unsafe { values[usize::from(c)].u16_ };
|
|
imm_u32 |= u32::from(imm_u16) << w * 16;
|
|
}
|
|
}
|
|
dst.push(b.copy(imm_u32.into())[0]);
|
|
}
|
|
}
|
|
32 => {
|
|
for c in 0..load_const.def.num_components {
|
|
let imm_u32 = unsafe { values[usize::from(c)].u32_ };
|
|
dst.push(b.copy(imm_u32.into())[0]);
|
|
}
|
|
}
|
|
64 => {
|
|
for c in 0..load_const.def.num_components {
|
|
let imm_u64 = unsafe { values[c as usize].u64_ };
|
|
dst.push(b.copy((imm_u64 as u32).into())[0]);
|
|
dst.push(b.copy(((imm_u64 >> 32) as u32).into())[0]);
|
|
}
|
|
}
|
|
_ => panic!("Unknown bit size: {}", load_const.def.bit_size),
|
|
}
|
|
|
|
self.set_ssa(&load_const.def, dst);
|
|
}
|
|
|
|
fn parse_undef(
|
|
&mut self,
|
|
b: &mut impl SSABuilder,
|
|
undef: &nir_undef_instr,
|
|
) {
|
|
let dst = alloc_ssa_for_nir(b, &undef.def);
|
|
for c in &dst {
|
|
b.push_op(OpUndef { dst: (*c).into() });
|
|
}
|
|
self.set_ssa(&undef.def, dst);
|
|
}
|
|
|
|
fn emit_jump(
|
|
&mut self,
|
|
b: &mut impl SSABuilder,
|
|
nb: &nir_block,
|
|
target: &nir_block,
|
|
) {
|
|
if target.index == self.end_block_id {
|
|
b.push_op(OpExit {});
|
|
} else {
|
|
self.cfg.add_edge(nb.index, target.index);
|
|
b.push_op(OpBra {
|
|
target: self.get_block_label(target),
|
|
});
|
|
}
|
|
}
|
|
|
|
fn emit_pred_jump(
|
|
&mut self,
|
|
b: &mut impl SSABuilder,
|
|
nb: &nir_block,
|
|
pred: Pred,
|
|
target: &nir_block,
|
|
fallthrough: &nir_block,
|
|
) {
|
|
// The fall-through edge has to come first
|
|
self.cfg.add_edge(nb.index, fallthrough.index);
|
|
let op = if target.index == self.end_block_id {
|
|
Op::Exit(OpExit {})
|
|
} else {
|
|
self.cfg.add_edge(nb.index, target.index);
|
|
Op::Bra(OpBra {
|
|
target: self.get_block_label(target),
|
|
})
|
|
};
|
|
b.predicate(pred).push_op(op);
|
|
}
|
|
|
|
/// Translates one NIR basic block into a NAK [`BasicBlock`] and records it
/// in the CFG under construction.
///
/// Emission order within the block is:
///  1. an optional barrier preamble on the entry block when shared memory
///     is in use,
///  2. phi destinations for phis at the top of the NIR block,
///  3. the block's instructions,
///  4. phi sources for phis in successor blocks,
///  5. the terminating jump (predicated or not).
fn parse_block(
    &mut self,
    ssa_alloc: &mut SSAValueAllocator,
    phi_map: &mut PhiAllocMap,
    nb: &nir_block,
) {
    let mut b = SSAInstrBuilder::new(self.info.sm, ssa_alloc);

    if nb.index == 0 && self.nir.info.shared_size > 0 {
        // The blob seems to always do a BSYNC before accessing shared
        // memory.  Perhaps this is to ensure that our allocation is
        // actually available and not in use by another thread?
        let label = self.label_alloc.alloc();
        let bar_clear = b.alloc_ssa(RegFile::Bar, 1);

        b.push_op(OpBClear {
            dst: bar_clear.into(),
        });

        let bar = b.alloc_ssa(RegFile::Bar, 1);
        b.push_op(OpBSSy {
            bar_out: bar.into(),
            bar_in: bar_clear.into(),
            cond: SrcRef::True.into(),
            target: label,
        });

        b.push_op(OpBSync {
            bar: bar.into(),
            cond: SrcRef::True.into(),
        });

        b.push_op(OpNop { label: Some(label) });
    }

    // NIR keeps all phis at the start of a block, so we can stop scanning
    // at the first non-phi.  All phi destinations for this block are
    // gathered into a single OpPhiDsts.
    let mut phi = OpPhiDsts::new();
    for ni in nb.iter_instr_list() {
        if ni.type_ == nir_instr_type_phi {
            let np = ni.as_phi().unwrap();
            let dst = alloc_ssa_for_nir(&mut b, np.def.as_def());
            for (i, dst) in dst.iter().enumerate() {
                let phi_id = phi_map.get_phi_id(np, i.try_into().unwrap());
                phi.dsts.push(phi_id, (*dst).into());
            }
            self.set_ssa(np.def.as_def(), dst);
        } else {
            break;
        }
    }

    if !phi.dsts.is_empty() {
        b.push_op(phi);
    }

    let mut goto = None;
    for ni in nb.iter_instr_list() {
        if DEBUG.annotate() {
            // Attach the (whitespace-collapsed) NIR instruction text as an
            // annotation so NAK output can be matched back to its source.
            let annotation = self
                .nir_instr_printer
                .instr_to_string(ni)
                .split_whitespace()
                .collect::<Vec<_>>()
                .join(" ");
            b.push_op(OpAnnotate {
                annotation: format!("generated by \"{}\"", annotation,),
            });
        }

        match ni.type_ {
            nir_instr_type_alu => {
                self.parse_alu(&mut b, ni.as_alu().unwrap())
            }
            nir_instr_type_jump => {
                // Gotos terminate the block.  Remember the jump and emit
                // it after the successor phi sources are pushed below.
                let jump = ni.as_jump().unwrap();
                if jump.type_ == nir_jump_goto
                    || jump.type_ == nir_jump_goto_if
                {
                    goto = Some(jump);
                }
            }
            nir_instr_type_tex => {
                self.parse_tex(&mut b, ni.as_tex().unwrap())
            }
            nir_instr_type_intrinsic => {
                self.parse_intrinsic(&mut b, ni.as_intrinsic().unwrap())
            }
            nir_instr_type_load_const => {
                self.parse_load_const(&mut b, ni.as_load_const().unwrap())
            }
            nir_instr_type_undef => {
                self.parse_undef(&mut b, ni.as_undef().unwrap())
            }
            // Phi destinations were already handled above
            nir_instr_type_phi => (),
            _ => panic!("Unsupported instruction type"),
        }
    }

    // For each successor, gather the phi sources that this block feeds into
    // a single OpPhiSrcs emitted at the end of this block.
    let succ = nb.successors();
    for sb in succ {
        let sb = match sb {
            Some(b) => b,
            None => continue,
        };

        let mut phi = OpPhiSrcs::new();

        for i in sb.iter_instr_list() {
            // Phis are at the front of the successor; stop at the first
            // non-phi instruction.
            let np = match i.as_phi() {
                Some(phi) => phi,
                None => break,
            };

            for ps in np.iter_srcs() {
                if ps.pred().index == nb.index {
                    let src = *self.get_src(&ps.src).as_ssa().unwrap();
                    for (i, src) in src.iter().enumerate() {
                        let phi_id =
                            phi_map.get_phi_id(np, i.try_into().unwrap());
                        phi.srcs.push(phi_id, (*src).into());
                    }
                    break;
                }
            }
        }

        if !phi.srcs.is_empty() {
            b.push_op(phi);
        }
    }

    if let Some(goto) = goto {
        let target = goto.target().unwrap();
        if goto.type_ == nir_jump_goto {
            // Unconditional goto
            self.emit_jump(&mut b, nb, target);
        } else {
            let cond = self.get_ssa(goto.condition.as_def())[0];
            let else_target = goto.else_target().unwrap();

            /* Next block in the NIR CF list */
            let next_block = nb.cf_node.next().unwrap().as_block().unwrap();

            // Whichever goto target is the next block in the CF list
            // becomes the fall-through edge; the other target gets the
            // predicated branch.  Block identity is compared by address.
            if else_target as *const _ == next_block as *const _ {
                self.emit_pred_jump(
                    &mut b,
                    nb,
                    // This is the branch to jump to the else
                    cond.into(),
                    target,
                    else_target,
                );
            } else if target as *const _ == next_block as *const _ {
                self.emit_pred_jump(
                    &mut b,
                    nb,
                    Pred::from(cond).bnot(),
                    else_target,
                    target,
                );
            } else {
                panic!(
                    "One of the two goto targets must be the next block in \
                     the NIR CF list"
                );
            }
        }
    } else {
        if let Some(ni) = nb.following_if() {
            let cond = self.get_ssa(ni.condition.as_def())[0];
            self.emit_pred_jump(
                &mut b,
                nb,
                // This is the branch to jump to the else
                Pred::from(cond).bnot(),
                ni.first_else_block(),
                ni.first_then_block(),
            );
        } else {
            // Straight-line control flow: exactly one successor
            assert!(succ[1].is_none());
            let s0 = succ[0].unwrap();
            self.emit_jump(&mut b, nb, s0);
        }
    }

    let mut bb = BasicBlock::new(self.get_block_label(nb));
    bb.instrs.append(&mut b.as_vec());
    self.cfg.add_node(nb.index, bb);
}
|
|
|
|
fn parse_if(
|
|
&mut self,
|
|
ssa_alloc: &mut SSAValueAllocator,
|
|
phi_map: &mut PhiAllocMap,
|
|
ni: &nir_if,
|
|
) {
|
|
self.parse_cf_list(ssa_alloc, phi_map, ni.iter_then_list());
|
|
self.parse_cf_list(ssa_alloc, phi_map, ni.iter_else_list());
|
|
}
|
|
|
|
fn parse_loop(
|
|
&mut self,
|
|
ssa_alloc: &mut SSAValueAllocator,
|
|
phi_map: &mut PhiAllocMap,
|
|
nl: &nir_loop,
|
|
) {
|
|
self.parse_cf_list(ssa_alloc, phi_map, nl.iter_body());
|
|
}
|
|
|
|
fn parse_cf_list(
|
|
&mut self,
|
|
ssa_alloc: &mut SSAValueAllocator,
|
|
phi_map: &mut PhiAllocMap,
|
|
list: ExecListIter<nir_cf_node>,
|
|
) {
|
|
for node in list {
|
|
match node.type_ {
|
|
nir_cf_node_block => {
|
|
let nb = node.as_block().unwrap();
|
|
self.parse_block(ssa_alloc, phi_map, nb);
|
|
}
|
|
nir_cf_node_if => {
|
|
let ni = node.as_if().unwrap();
|
|
self.parse_if(ssa_alloc, phi_map, ni);
|
|
}
|
|
nir_cf_node_loop => {
|
|
let nl = node.as_loop().unwrap();
|
|
self.parse_loop(ssa_alloc, phi_map, nl);
|
|
}
|
|
_ => panic!("Invalid inner CF node type"),
|
|
}
|
|
}
|
|
}
|
|
|
|
pub fn parse_function_impl(&mut self, nfi: &nir_function_impl) -> Function {
|
|
let mut ssa_alloc = SSAValueAllocator::new();
|
|
let end_nb = nfi.end_block();
|
|
self.end_block_id = end_nb.index;
|
|
|
|
let mut phi_alloc = PhiAllocator::new();
|
|
let mut phi_map = PhiAllocMap::new(&mut phi_alloc);
|
|
|
|
self.parse_cf_list(&mut ssa_alloc, &mut phi_map, nfi.iter_body());
|
|
|
|
let cfg = std::mem::take(&mut self.cfg).as_cfg();
|
|
assert!(cfg.len() > 0);
|
|
for i in 0..cfg.len() {
|
|
if cfg[i].falls_through() {
|
|
assert!(cfg.succ_indices(i)[0] == i + 1);
|
|
}
|
|
}
|
|
|
|
Function {
|
|
ssa_alloc: ssa_alloc,
|
|
phi_alloc: phi_alloc,
|
|
blocks: cfg,
|
|
}
|
|
}
|
|
|
|
pub fn parse_shader(mut self) -> Shader {
|
|
let mut functions = Vec::new();
|
|
for nf in self.nir.iter_functions() {
|
|
if let Some(nfi) = nf.get_impl() {
|
|
let f = self.parse_function_impl(nfi);
|
|
functions.push(f);
|
|
}
|
|
}
|
|
|
|
// Tessellation evaluation shaders MUST claim to read gl_TessCoord or
|
|
// the hardware will throw an SPH error.
|
|
if matches!(self.info.stage, ShaderStageInfo::Tessellation) {
|
|
match &mut self.info.io {
|
|
ShaderIoInfo::Vtg(io) => {
|
|
let tc = NAK_ATTR_TESS_COORD;
|
|
io.mark_attrs_written(tc..(tc + 8));
|
|
}
|
|
_ => panic!("Tessellation must have ShaderIoInfo::Vtg"),
|
|
}
|
|
}
|
|
|
|
Shader {
|
|
info: self.info,
|
|
functions: functions,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Translates a NIR shader into a NAK [`Shader`] targeting SM version `sm`.
///
/// This is the public entry point for the NIR-to-NAK front-end; it drives
/// the full per-function translation via `ShaderFromNir`.
pub fn nak_shader_from_nir(ns: &nir_shader, sm: u8) -> Shader {
    ShaderFromNir::new(ns, sm).parse_shader()
}
|