freedreno/a6xx: Add EARLYPREAMBLE flag to all a6xx_sp_xs_ctrl_reg0
Each shader stage has its own "early preamble" flag. Early preamble is likely an optimization to hide some of the latency when loading UBOs into consts in the preamble. Early preamble has the following limitations: - Only shared, a1, and const regs can be used (accessing other regs would result in a GPU fault); - No cat5/cat6, only the stc/ldc variants work; - Values written to shared regs are not accessible by the rest of the shader; - Instructions before shps are also considered to be part of the early preamble. Note: for all shaders from d3d11 games, the blob produced preambles compatible with early preamble mode. Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15901>
This commit is contained in:
parent
c54555c496
commit
5d377f435b
|
@ -153,6 +153,7 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
|
|||
A6XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT(i->max_reg + 1) |
|
||||
A6XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT(i->max_half_reg + 1) |
|
||||
COND(v->mergedregs, A6XX_SP_CS_CTRL_REG0_MERGEDREGS) |
|
||||
COND(ir3_kernel->info.early_preamble, A6XX_SP_CS_CTRL_REG0_EARLYPREAMBLE) |
|
||||
A6XX_SP_CS_CTRL_REG0_BRANCHSTACK(ir3_shader_branchstack_hw(v)));
|
||||
|
||||
OUT_PKT4(ring, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
@localsize 1, 1, 1
|
||||
@buf 4 ; g[0]
|
||||
@invocationid(r0.x) ; r0.xyz
|
||||
@const(c0.x) 0.0, 0.0, 0.0, 0.0
|
||||
@earlypreamble
|
||||
|
||||
shps #l_preamble_end
|
||||
getone #l_preamble_end
|
||||
|
||||
mov.u32u32 r48.x, 1
|
||||
mov.u32u32 r48.y, 2
|
||||
mov.u32u32 r48.z, 3
|
||||
mov.u32u32 r48.w, 4
|
||||
(rpt5)nop
|
||||
stc.u32 c[0], r48.x, 4
|
||||
|
||||
(sy)(ss)shpe
|
||||
|
||||
l_preamble_end:
|
||||
(jp)nop
|
||||
|
||||
(rpt3)mov.u32u32 r1.x, (r)c0.x
|
||||
(rpt5)nop
|
||||
stib.b.untyped.1d.u32.4.imm r1.x, r0.x, 0
|
||||
end
|
|
@ -37,6 +37,7 @@ struct ir3_kernel_info {
|
|||
/* driver-param / replaced uniforms: */
|
||||
unsigned numwg;
|
||||
unsigned wgid;
|
||||
unsigned early_preamble;
|
||||
};
|
||||
|
||||
struct ir3_shader;
|
||||
|
|
|
@ -107,6 +107,7 @@ static int parse_reg(const char *str)
|
|||
"@out" return TOKEN(T_A_OUT);
|
||||
"@tex" return TOKEN(T_A_TEX);
|
||||
"@pvtmem" return TOKEN(T_A_PVTMEM);
|
||||
"@earlypreamble" return TOKEN(T_A_EARLYPREAMBLE);
|
||||
"(sy)" return TOKEN(T_SY);
|
||||
"(ss)" return TOKEN(T_SS);
|
||||
"(absneg)" return TOKEN(T_ABSNEG);
|
||||
|
|
|
@ -334,6 +334,7 @@ static void print_token(FILE *file, int type, YYSTYPE value)
|
|||
%token <tok> T_A_OUT
|
||||
%token <tok> T_A_TEX
|
||||
%token <tok> T_A_PVTMEM
|
||||
%token <tok> T_A_EARLYPREAMBLE
|
||||
/* todo, re-add @sampler/@uniform/@varying if needed someday */
|
||||
|
||||
/* src register flags */
|
||||
|
@ -701,6 +702,7 @@ header: localsize_header
|
|||
| out_header
|
||||
| tex_header
|
||||
| pvtmem_header
|
||||
| earlypreamble_header
|
||||
|
||||
const_val: T_FLOAT { $$ = fui($1); }
|
||||
| T_INT { $$ = $1; }
|
||||
|
@ -767,6 +769,8 @@ branchstack_header: T_A_BRANCHSTACK const_val { variant->branchstack = $2; }
|
|||
|
||||
pvtmem_header: T_A_PVTMEM const_val { variant->pvtmem_size = $2; }
|
||||
|
||||
earlypreamble_header: T_A_EARLYPREAMBLE { info->early_preamble = 1; }
|
||||
|
||||
/* Stubs for now */
|
||||
in_header: T_A_IN '(' T_REGISTER ')' T_IDENTIFIER '(' T_IDENTIFIER '=' integer ')' { }
|
||||
|
||||
|
|
|
@ -2887,8 +2887,22 @@ to upconvert to 32b float internally?
|
|||
GS must have the same mergedregs setting as VS.
|
||||
-->
|
||||
<bitfield name="MERGEDREGS" pos="20" type="boolean"/>
|
||||
<!-- ??? (blob has it set) -->
|
||||
<bitfield name="UNK21" pos="21" type="boolean"/>
|
||||
<!--
|
||||
Creates a separate preamble-only thread?
|
||||
|
||||
Early preamble has the following limitations:
|
||||
- Only shared, a1, and const regs can be used
|
||||
(accessing other regs would result in GPU fault);
|
||||
- No cat5/cat6, only stc/ldc variants are working;
|
||||
- Values written to shared regs are not accessible by the rest
|
||||
of the shader;
|
||||
- Instructions before shps are also considered to be a part of
|
||||
early preamble;
|
||||
|
||||
Note: for all shaders from d3d11 games, the blob produced preambles
|
||||
compatible with early preamble mode.
|
||||
-->
|
||||
<bitfield name="EARLYPREAMBLE" pos="21" type="boolean"/>
|
||||
</reg32>
|
||||
<!-- bitmask of true/false conditions for VS brac.N instructions,
|
||||
bit N corresponds to brac.N -->
|
||||
|
@ -3001,11 +3015,8 @@ to upconvert to 32b float internally?
|
|||
<reg32 offset="0xa825" name="SP_VS_PVT_MEM_HW_STACK_OFFSET" type="a6xx_sp_xs_pvt_mem_hw_stack_offset"/>
|
||||
|
||||
<reg32 offset="0xa830" name="SP_HS_CTRL_REG0" type="a6xx_sp_xs_ctrl_reg0">
|
||||
<!--
|
||||
There is no mergedregs bit, that comes from the VS.
|
||||
No idea what this bit does here.
|
||||
-->
|
||||
<bitfield name="UNK20" pos="20" type="boolean"/>
|
||||
<!-- There is no mergedregs bit, that comes from the VS. -->
|
||||
<bitfield name="EARLYPREAMBLE" pos="20" type="boolean"/>
|
||||
</reg32>
|
||||
<!--
|
||||
Total size of local storage in dwords divided by the wave size.
|
||||
|
@ -3029,7 +3040,7 @@ to upconvert to 32b float internally?
|
|||
|
||||
<reg32 offset="0xa840" name="SP_DS_CTRL_REG0" type="a6xx_sp_xs_ctrl_reg0">
|
||||
<!-- There is no mergedregs bit, that comes from the VS. -->
|
||||
<bitfield name="UNK20" pos="20" type="boolean"/> <!-- something preamble-related -->
|
||||
<bitfield name="EARLYPREAMBLE" pos="20" type="boolean"/>
|
||||
</reg32>
|
||||
<reg32 offset="0xa841" name="SP_DS_BRANCH_COND" type="hex"/>
|
||||
|
||||
|
@ -3064,11 +3075,8 @@ to upconvert to 32b float internally?
|
|||
<reg32 offset="0xa865" name="SP_DS_PVT_MEM_HW_STACK_OFFSET" type="a6xx_sp_xs_pvt_mem_hw_stack_offset"/>
|
||||
|
||||
<reg32 offset="0xa870" name="SP_GS_CTRL_REG0" type="a6xx_sp_xs_ctrl_reg0">
|
||||
<!--
|
||||
There is no mergedregs bit, that comes from the VS.
|
||||
No idea what this bit does here.
|
||||
-->
|
||||
<bitfield name="UNK20" pos="20" type="boolean"/>
|
||||
<!-- There is no mergedregs bit, that comes from the VS. -->
|
||||
<bitfield name="EARLYPREAMBLE" pos="20" type="boolean"/>
|
||||
</reg32>
|
||||
<reg32 offset="0xa871" name="SP_GS_PRIM_SIZE" low="0" high="7" type="uint">
|
||||
<doc>
|
||||
|
@ -3137,7 +3145,8 @@ to upconvert to 32b float internally?
|
|||
<bitfield name="UNK24" pos="24" type="boolean"/>
|
||||
<bitfield name="UNK25" pos="25" type="boolean"/>
|
||||
<bitfield name="PIXLODENABLE" pos="26" type="boolean"/>
|
||||
<bitfield name="UNK27" low="27" high="28"/>
|
||||
<bitfield name="UNK27" pos="27" type="boolean"/>
|
||||
<bitfield name="EARLYPREAMBLE" pos="28" type="boolean"/>
|
||||
<bitfield name="MERGEDREGS" pos="31" type="boolean"/>
|
||||
</reg32>
|
||||
<reg32 offset="0xa981" name="SP_FS_BRANCH_COND" type="hex"/>
|
||||
|
@ -3249,8 +3258,7 @@ to upconvert to 32b float internally?
|
|||
<bitfield name="UNK21" pos="21" type="boolean"/>
|
||||
<!-- has a small impact on performance, not clear what it does -->
|
||||
<bitfield name="UNK22" pos="22" type="boolean"/>
|
||||
<!-- creates a separate prolog-only thread? -->
|
||||
<bitfield name="SEPARATEPROLOG" pos="23" type="boolean"/>
|
||||
<bitfield name="EARLYPREAMBLE" pos="23" type="boolean"/>
|
||||
<bitfield name="MERGEDREGS" pos="31" type="boolean"/>
|
||||
</reg32>
|
||||
|
||||
|
|
Loading…
Reference in New Issue