intel/vec4: Add support for masking pushed data

This is the vec4 equivalent of d0d039a4d3, required for proper UBO
pushing in vertex stages for Vulkan on HSW.  Sadly, the implementation
requires us to do everything in ALIGN1 mode and the vec4 instruction
scheduler doesn't understand HW_GRF <-> UNIFORM interference so it's
easier to do the whole thing in the generator.  We add an instruction
to the top of the program which just means "emit the blob" and all the
magic happens in codegen.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10571>
This commit is contained in:
Jason Ekstrand 2021-05-02 13:54:01 -05:00 committed by Marge Bot
parent a881f2295f
commit 89fd196f6b
5 changed files with 73 additions and 0 deletions

View File

@ -574,6 +574,7 @@ enum opcode {
VEC4_OPCODE_SET_LOW_32BIT,
VEC4_OPCODE_SET_HIGH_32BIT,
VEC4_OPCODE_MOV_FOR_SCRATCH,
VEC4_OPCODE_ZERO_OOB_PUSH_REGS,
FS_OPCODE_DDX_COARSE,
FS_OPCODE_DDX_FINE,

View File

@ -344,6 +344,7 @@ namespace {
case VEC4_OPCODE_PICK_HIGH_32BIT:
case VEC4_OPCODE_SET_LOW_32BIT:
case VEC4_OPCODE_SET_HIGH_32BIT:
case VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
case GS_OPCODE_SET_DWORD_2:
case GS_OPCODE_SET_WRITE_OFFSET:
case GS_OPCODE_SET_VERTEX_COUNT:

View File

@ -423,6 +423,8 @@ brw_instruction_name(const struct intel_device_info *devinfo, enum opcode op)
return "set_high_32bit";
case VEC4_OPCODE_MOV_FOR_SCRATCH:
return "mov_for_scratch";
case VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
return "zero_oob_push_regs";
case FS_OPCODE_DDX_COARSE:
return "ddx_coarse";
@ -1136,6 +1138,7 @@ backend_instruction::has_side_effects() const
case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
case RT_OPCODE_TRACE_RAY_LOGICAL:
case VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
return true;
default:
return eot;

View File

@ -2698,6 +2698,20 @@ vec4_visitor::run()
setup_push_ranges();
if (prog_data->base.zero_push_reg) {
/* push_reg_mask_param is in uint32 params and UNIFORM is in vec4s */
const unsigned mask_param = stage_prog_data->push_reg_mask_param;
src_reg mask = src_reg(dst_reg(UNIFORM, mask_param / 4));
assert(mask_param % 2 == 0); /* Should be 64-bit-aligned */
mask.swizzle = BRW_SWIZZLE4((mask_param + 0) % 4,
(mask_param + 1) % 4,
(mask_param + 0) % 4,
(mask_param + 1) % 4);
emit(VEC4_OPCODE_ZERO_OOB_PUSH_REGS,
dst_reg(VGRF, alloc.allocate(3)), mask);
}
emit_prolog();
emit_nir_code();

View File

@ -1464,6 +1464,57 @@ generate_mov_indirect(struct brw_codegen *p,
}
}
/**
 * Emit code that masks the pushed-constant registers flagged in
 * prog_data->zero_push_reg.
 *
 * For every bit i set in prog_data->zero_push_reg, the GRF at
 * dispatch_grf_start_reg + i is ANDed with a dword that is all-ones when
 * bit i of the 64-bit mask in \p bit_mask_in is set and zero otherwise, so
 * registers whose mask bit is clear end up zeroed.
 *
 * \param p            Code generator state; the default instruction state is
 *                     saved on entry and restored on exit.
 * \param prog_data    Supplies zero_push_reg (must be non-zero) and
 *                     dispatch_grf_start_reg.
 * \param scratch      GRF temporary.  The first register holds the 16
 *                     word-sized intermediates and the following two hold the
 *                     16 dword-sized masks, i.e. three registers in total.
 * \param bit_mask_in  GRF holding the 64-bit mask; swizzle components 0 and 1
 *                     must select two consecutive dwords.
 */
static void
generate_zero_oob_push_regs(struct brw_codegen *p,
                            struct brw_stage_prog_data *prog_data,
                            struct brw_reg scratch,
                            struct brw_reg bit_mask_in)
{
   const uint64_t want_zero = prog_data->zero_push_reg;
   assert(want_zero);

   /* Turn the swizzled source into a plain word-typed region that starts at
    * the low dword of the mask so it can be addressed by byte offset below.
    */
   assert(bit_mask_in.file == BRW_GENERAL_REGISTER_FILE);
   assert(BRW_GET_SWZ(bit_mask_in.swizzle, 1) ==
          BRW_GET_SWZ(bit_mask_in.swizzle, 0) + 1);
   bit_mask_in.subnr += BRW_GET_SWZ(bit_mask_in.swizzle, 0) * 4;
   bit_mask_in.type = BRW_REGISTER_TYPE_W;

   /* Scratch should be 3 registers in the GRF */
   assert(scratch.file == BRW_GENERAL_REGISTER_FILE);
   scratch = vec8(scratch);
   struct brw_reg mask_w16 = retype(scratch, BRW_REGISTER_TYPE_W);
   struct brw_reg mask_d16 = retype(byte_offset(scratch, REG_SIZE),
                                    BRW_REGISTER_TYPE_D);

   const unsigned push_start = prog_data->dispatch_grf_start_reg;

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   for (unsigned i = 0; i < 64; i++) {
      /* At the start of each 16-bit chunk that contains at least one
       * register of interest, expand that chunk of the mask into 16 dwords.
       */
      if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
         /* Words 8..15 = chunk << {7, 6, ..., 0}: moves bits 8..15 of the
          * chunk into the sign bit of each word.
          */
         brw_set_default_exec_size(p, BRW_EXECUTE_8);
         brw_SHL(p, suboffset(mask_w16, 8),
                 vec1(byte_offset(bit_mask_in, i / 8)),
                 brw_imm_v(0x01234567));
         /* Words 0..7 = words 8..15 << 8: same for bits 0..7. */
         brw_SHL(p, mask_w16, suboffset(mask_w16, 8), brw_imm_w(8));

         /* Replicate each word's sign bit across a full dword. */
         brw_set_default_exec_size(p, BRW_EXECUTE_16);
         brw_ASR(p, mask_d16, mask_w16, brw_imm_w(15));
      }

      if (want_zero & BITFIELD64_BIT(i)) {
         struct brw_reg push_reg =
            retype(brw_vec8_grf(push_start + i, 0), BRW_REGISTER_TYPE_D);

         /* mask_d16 only ever holds the 16 dwords of the current chunk, so
          * it must be indexed relative to the chunk.  Indexing by the
          * absolute bit number would read past the scratch allocation for
          * every register from the 17th onwards.
          */
         brw_set_default_exec_size(p, BRW_EXECUTE_8);
         brw_AND(p, push_reg, push_reg, vec1(suboffset(mask_d16, i % 16)));
      }
   }

   brw_pop_insn_state(p);
}
static void
generate_code(struct brw_codegen *p,
const struct brw_compiler *compiler,
@ -2065,6 +2116,9 @@ generate_code(struct brw_codegen *p,
break;
}
case VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
generate_zero_oob_push_regs(p, &prog_data->base, dst, src[0]);
case TCS_OPCODE_URB_WRITE:
generate_tcs_urb_write(p, inst, src[0]);
send_count++;