gallium: Generate SSE code to swizzle and unswizzle vs inputs and outputs.
Change SSE_SWIZZLES #define to 0 to disable it.
This commit is contained in:
parent
7f5e9d3f07
commit
58d3dff0d3
|
@ -109,9 +109,10 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
|
|||
struct draw_context *draw = fpme->draw;
|
||||
struct draw_vertex_shader *shader = draw->vertex_shader;
|
||||
unsigned opt = fpme->opt;
|
||||
unsigned alloc_count = align_int( fetch_count, 4 );
|
||||
|
||||
struct vertex_header *pipeline_verts =
|
||||
(struct vertex_header *)MALLOC(fpme->vertex_size * fetch_count);
|
||||
(struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count);
|
||||
|
||||
if (!pipeline_verts) {
|
||||
/* Not much we can do here - just skip the rendering.
|
||||
|
|
|
@ -47,14 +47,29 @@
|
|||
#include "tgsi/util/tgsi_parse.h"
|
||||
|
||||
#define SSE_MAX_VERTICES 4
|
||||
#define SSE_SWIZZLES 1
|
||||
|
||||
/*
 * Signature of the run-time generated vertex shader entry point.
 *
 * Two variants exist, selected by SSE_SWIZZLES:
 *  - non-zero: the generated code also converts the vertex data between the
 *    array-of-structures buffers (aos_input / aos_output) and the
 *    tgsi_exec_vector arrays, so the extra pointer/count/stride arguments
 *    are passed in.
 *  - zero: the caller performs that conversion in C and only the five
 *    original arguments are passed.
 *
 * The order of the parameters matters: the code generator addresses them by
 * position (tgsi_emit_sse2 passes argument indices 5, 0, 6, 7 and 8, 1, 9, 10
 * to its swizzle helpers), so reordering here requires a matching change there.
 */
#if SSE_SWIZZLES
|
||||
typedef void (XSTDCALL *codegen_function) (
|
||||
/* argument 0 */
const struct tgsi_exec_vector *input,
|
||||
/* argument 1 */
struct tgsi_exec_vector *output,
|
||||
/* argument 2 */
float (*constant)[4],
|
||||
/* argument 3 */
struct tgsi_exec_vector *temporary,
|
||||
/* argument 4 */
float (*immediates)[4],
|
||||
/* argument 5: source vertex attributes, four floats per attribute */
const float (*aos_input)[4],
|
||||
/* argument 6: number of input attributes to convert */
uint num_inputs,
|
||||
/* argument 7: byte distance between consecutive input vertices */
uint input_stride,
|
||||
/* argument 8: destination vertex attributes, four floats per attribute */
float (*aos_output)[4],
|
||||
/* argument 9: number of output attributes to convert */
uint num_outputs,
|
||||
/* argument 10: byte distance between consecutive output vertices */
uint output_stride );
|
||||
#else
|
||||
typedef void (XSTDCALL *codegen_function) (
|
||||
const struct tgsi_exec_vector *input,
|
||||
struct tgsi_exec_vector *output,
|
||||
float (*constant)[4],
|
||||
struct tgsi_exec_vector *temporary,
|
||||
float (*immediates)[4] );
|
||||
|
||||
#endif
|
||||
|
||||
struct draw_sse_vertex_shader {
|
||||
struct draw_vertex_shader base;
|
||||
|
@ -91,12 +106,31 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
|
|||
{
|
||||
struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base;
|
||||
struct tgsi_exec_machine *machine = shader->machine;
|
||||
unsigned int i, j;
|
||||
unsigned slot;
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
|
||||
unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i);
|
||||
|
||||
#if SSE_SWIZZLES
|
||||
/* run compiled shader
|
||||
*/
|
||||
shader->func(machine->Inputs,
|
||||
machine->Outputs,
|
||||
(float (*)[4])constants,
|
||||
machine->Temps,
|
||||
shader->immediates,
|
||||
input,
|
||||
base->info.num_inputs,
|
||||
input_stride,
|
||||
output,
|
||||
base->info.num_outputs,
|
||||
output_stride );
|
||||
|
||||
input = (const float (*)[4])((const char *)input + input_stride * max_vertices);
|
||||
output = (float (*)[4])((char *)output + output_stride * max_vertices);
|
||||
#else
|
||||
unsigned int j, slot;
|
||||
|
||||
/* Swizzle inputs.
|
||||
*/
|
||||
for (j = 0; j < max_vertices; j++) {
|
||||
|
@ -105,10 +139,10 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
|
|||
machine->Inputs[slot].xyzw[1].f[j] = input[slot][1];
|
||||
machine->Inputs[slot].xyzw[2].f[j] = input[slot][2];
|
||||
machine->Inputs[slot].xyzw[3].f[j] = input[slot][3];
|
||||
}
|
||||
}
|
||||
|
||||
input = (const float (*)[4])((const char *)input + input_stride);
|
||||
}
|
||||
}
|
||||
|
||||
/* run compiled shader
|
||||
*/
|
||||
|
@ -118,7 +152,6 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
|
|||
machine->Temps,
|
||||
shader->immediates);
|
||||
|
||||
|
||||
/* Unswizzle all output results.
|
||||
*/
|
||||
for (j = 0; j < max_vertices; j++) {
|
||||
|
@ -127,10 +160,11 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
|
|||
output[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
|
||||
output[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
|
||||
output[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
|
||||
}
|
||||
}
|
||||
|
||||
output = (float (*)[4])((char *)output + output_stride);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -176,7 +210,7 @@ draw_create_vs_sse(struct draw_context *draw,
|
|||
x86_init_func( &vs->sse2_program );
|
||||
|
||||
if (!tgsi_emit_sse2( (struct tgsi_token *) vs->base.state.tokens,
|
||||
&vs->sse2_program, vs->immediates ))
|
||||
&vs->sse2_program, vs->immediates, SSE_SWIZZLES ))
|
||||
goto fail;
|
||||
|
||||
vs->func = (codegen_function) x86_get_func( &vs->sse2_program );
|
||||
|
|
|
@ -853,6 +853,20 @@ void sse_shufps( struct x86_function *p,
|
|||
emit_1ub(p, shuf);
|
||||
}
|
||||
|
||||
/**
 * Append an UNPCKHPS dst, src instruction to the function being assembled.
 *
 * \param p    function whose instruction stream receives the bytes
 * \param dst  destination operand (also the first source)
 * \param src  second source operand
 */
void sse_unpckhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
|
||||
{
|
||||
DUMP_RR( dst, src );
|
||||
/* Two-byte opcode: escape byte followed by 0x15.
 * NOTE(review): assumes X86_TWOB is the 0x0f escape, giving 0F 15 - confirm.
 */
emit_2ub( p, X86_TWOB, 0x15 );
|
||||
/* Operand encoding follows the opcode. */
emit_modrm( p, dst, src );
|
||||
}
|
||||
|
||||
/**
 * Append an UNPCKLPS dst, src instruction to the function being assembled.
 *
 * \param p    function whose instruction stream receives the bytes
 * \param dst  destination operand (also the first source)
 * \param src  second source operand
 */
void sse_unpcklps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
|
||||
{
|
||||
DUMP_RR( dst, src );
|
||||
/* Two-byte opcode: escape byte followed by 0x14.
 * NOTE(review): assumes X86_TWOB is the 0x0f escape, giving 0F 14 - confirm.
 */
emit_2ub( p, X86_TWOB, 0x14 );
|
||||
/* Operand encoding follows the opcode. */
emit_modrm( p, dst, src );
|
||||
}
|
||||
|
||||
void sse_cmpps( struct x86_function *p,
|
||||
struct x86_reg dst,
|
||||
struct x86_reg src,
|
||||
|
|
|
@ -203,6 +203,8 @@ void sse_rsqrtps( struct x86_function *p, struct x86_reg dst, struct x86_reg src
|
|||
void sse_rsqrtss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
|
||||
void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
|
||||
unsigned char shuf );
|
||||
void sse_unpckhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
|
||||
void sse_unpcklps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
|
||||
void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src );
|
||||
void sse2_punpcklbw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
|
||||
|
||||
|
|
|
@ -1788,7 +1788,6 @@ emit_instruction(
|
|||
break;
|
||||
|
||||
case TGSI_OPCODE_RET:
|
||||
case TGSI_OPCODE_END:
|
||||
#ifdef WIN32
|
||||
emit_retw( func, 16 );
|
||||
#else
|
||||
|
@ -1796,6 +1795,9 @@ emit_instruction(
|
|||
#endif
|
||||
break;
|
||||
|
||||
case TGSI_OPCODE_END:
|
||||
break;
|
||||
|
||||
case TGSI_OPCODE_SSG:
|
||||
return 0;
|
||||
break;
|
||||
|
@ -2027,6 +2029,127 @@ emit_declaration(
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Emit x86/SSE code that converts vertex attributes from array-of-structures
 * (AoS) layout to structure-of-arrays (SoA) layout, four vertices at a time.
 *
 * The uint parameters are not data: they are the positions of the generated
 * function's own arguments, which the emitted code loads at run time through
 * get_argument().
 *
 * \param func    function being assembled
 * \param aos     index of the argument holding the AoS source pointer
 * \param soa     index of the argument holding the SoA destination pointer
 * \param num     index of the argument holding the attribute count
 * \param stride  index of the argument holding the byte distance between
 *                consecutive vertices in the AoS buffer
 *
 * For every attribute the emitted loop reads 16 bytes from each of four
 * consecutive vertices and writes 64 bytes to the SoA side.
 *
 * NOTE(review): the emitted loop tests the count only after the body
 * (dec + jump-if-not-equal), so a run-time count of 0 would execute the body
 * and not terminate at the first test - confirm this is never reached with
 * zero attributes.
 * NOTE(review): four vertices are always read, whatever the number of valid
 * vertices - confirm the AoS buffer is sized for that.
 */
static void aos_to_soa( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
|
||||
{
|
||||
struct x86_reg soa_input;
|
||||
struct x86_reg aos_input;
|
||||
struct x86_reg num_inputs;
|
||||
struct x86_reg temp;
|
||||
unsigned char *inner_loop;
|
||||
|
||||
/* Fixed register assignment used by the emitted code. */
soa_input = x86_make_reg( file_REG32, reg_AX );
|
||||
aos_input = x86_make_reg( file_REG32, reg_BX );
|
||||
num_inputs = x86_make_reg( file_REG32, reg_CX );
|
||||
temp = x86_make_reg( file_REG32, reg_DX );
|
||||
|
||||
/* Save EBX */
|
||||
x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
|
||||
|
||||
/* Load pointers and count from the generated function's arguments.
 * NOTE(review): the "+ 1" presumably skips the stack slot taken by the EBX
 * push above - verify against get_argument().
 */
x86_mov( func, soa_input, get_argument( soa + 1 ) );
|
||||
x86_mov( func, aos_input, get_argument( aos + 1 ) );
|
||||
x86_mov( func, num_inputs, get_argument( num + 1 ) );
|
||||
|
||||
/* Top of the per-attribute loop. */
inner_loop = x86_get_label( func );
|
||||
|
||||
/* temp = vertex stride; reloaded every iteration because temp is reused as
 * a scratch constant at the bottom of the loop.
 */
x86_mov( func, temp, get_argument( stride + 1 ) );
|
||||
/* Remember the attribute's address in vertex 0 while walking the vertices. */
x86_push( func, aos_input );
|
||||
/* Vertex 0: xy -> low half of xmm0, zw -> low half of xmm3. */
sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
|
||||
sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
|
||||
x86_add( func, aos_input, temp );
|
||||
/* Vertex 1: xy -> high half of xmm0, zw -> high half of xmm3. */
sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
|
||||
sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
|
||||
x86_add( func, aos_input, temp );
|
||||
/* Vertex 2: xy -> low half of xmm1, zw -> low half of xmm4. */
sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
|
||||
sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
|
||||
x86_add( func, aos_input, temp );
|
||||
/* Vertex 3: xy -> high half of xmm1, zw -> high half of xmm4. */
sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
|
||||
sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
|
||||
/* Back to vertex 0. */
x86_pop( func, aos_input );
|
||||
|
||||
/* Transpose: xmm0/xmm1 hold x0 y0 x1 y1 / x2 y2 x3 y3 and xmm3/xmm4 the
 * same for z/w.  Shuffle 0x88 picks elements 0 and 2 of each operand,
 * 0xdd picks elements 1 and 3, giving X in xmm0, Y in xmm2, Z in xmm3 and
 * W in xmm5.  The copies are needed because shufps overwrites its
 * destination.
 */
sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
|
||||
sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
|
||||
sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
|
||||
sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
|
||||
sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
|
||||
sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
|
||||
|
||||
/* Store the four channel rows (X, Y, Z, W), 16 bytes each. */
sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
|
||||
sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
|
||||
sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
|
||||
sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
|
||||
|
||||
/* Advance to next input */
|
||||
/* AoS side moves on by one 16-byte attribute... */
x86_mov_reg_imm( func, temp, 16 );
|
||||
x86_add( func, aos_input, temp );
|
||||
/* ...SoA side by the 64 bytes just written. */
x86_mov_reg_imm( func, temp, 64 );
|
||||
x86_add( func, soa_input, temp );
|
||||
/* Count down and loop while the counter is non-zero. */
x86_dec( func, num_inputs );
|
||||
x86_jcc( func, cc_NE, inner_loop );
|
||||
|
||||
/* Restore EBX */
|
||||
x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
|
||||
}
|
||||
|
||||
/**
 * Emit x86/SSE code that converts vertex attributes from structure-of-arrays
 * (SoA) layout back to array-of-structures (AoS) layout, four vertices at a
 * time.  This is the inverse of aos_to_soa().
 *
 * The uint parameters are not data: they are the positions of the generated
 * function's own arguments, which the emitted code loads at run time through
 * get_argument().
 *
 * \param func    function being assembled
 * \param aos     index of the argument holding the AoS destination pointer
 * \param soa     index of the argument holding the SoA source pointer
 * \param num     index of the argument holding the attribute count
 * \param stride  index of the argument holding the byte distance between
 *                consecutive vertices in the AoS buffer
 *
 * For every attribute the emitted loop reads 64 bytes from the SoA side and
 * writes 16 bytes into each of four consecutive vertices.
 *
 * NOTE(review): the emitted loop tests the count only after the body
 * (dec + jump-if-not-equal), so a run-time count of 0 would execute the body
 * and not terminate at the first test - confirm this is never reached with
 * zero attributes.
 * NOTE(review): four vertices are always written, whatever the number of
 * valid vertices - confirm the AoS buffer is sized for that.
 */
static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
|
||||
{
|
||||
struct x86_reg soa_output;
|
||||
struct x86_reg aos_output;
|
||||
struct x86_reg num_outputs;
|
||||
struct x86_reg temp;
|
||||
unsigned char *inner_loop;
|
||||
|
||||
/* Fixed register assignment used by the emitted code. */
soa_output = x86_make_reg( file_REG32, reg_AX );
|
||||
aos_output = x86_make_reg( file_REG32, reg_BX );
|
||||
num_outputs = x86_make_reg( file_REG32, reg_CX );
|
||||
temp = x86_make_reg( file_REG32, reg_DX );
|
||||
|
||||
/* Save EBX */
|
||||
x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
|
||||
|
||||
/* Load pointers and count from the generated function's arguments.
 * NOTE(review): the "+ 1" presumably skips the stack slot taken by the EBX
 * push above - verify against get_argument().
 */
x86_mov( func, soa_output, get_argument( soa + 1 ) );
|
||||
x86_mov( func, aos_output, get_argument( aos + 1 ) );
|
||||
x86_mov( func, num_outputs, get_argument( num + 1 ) );
|
||||
|
||||
/* Top of the per-attribute loop. */
inner_loop = x86_get_label( func );
|
||||
|
||||
/* Load the four channel rows: X -> xmm0, Y -> xmm1, Z -> xmm3, W -> xmm4. */
sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
|
||||
sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
|
||||
sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
|
||||
sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
|
||||
|
||||
/* Interleave pairs of rows.  The copies are needed because the unpack
 * instructions overwrite their destination.  Result:
 *   xmm0 = x0 y0 x1 y1    xmm2 = x2 y2 x3 y3
 *   xmm3 = z0 w0 z1 w1    xmm5 = z2 w2 z3 w3
 */
sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
|
||||
sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
|
||||
sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
|
||||
sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
|
||||
sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
|
||||
sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
|
||||
|
||||
/* temp = vertex stride; reloaded every iteration because temp is reused as
 * a scratch constant at the bottom of the loop.
 */
x86_mov( func, temp, get_argument( stride + 1 ) );
|
||||
/* Remember the attribute's address in vertex 0 while walking the vertices. */
x86_push( func, aos_output );
|
||||
/* Vertex 0: xy from the low half of xmm0, zw from the low half of xmm3. */
sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
|
||||
sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
|
||||
x86_add( func, aos_output, temp );
|
||||
/* Vertex 1: xy from the high half of xmm0, zw from the high half of xmm3. */
sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
|
||||
sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
|
||||
x86_add( func, aos_output, temp );
|
||||
/* Vertex 2: xy from the low half of xmm2, zw from the low half of xmm5. */
sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
|
||||
sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
|
||||
x86_add( func, aos_output, temp );
|
||||
/* Vertex 3: xy from the high half of xmm2, zw from the high half of xmm5. */
sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
|
||||
sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
|
||||
/* Back to vertex 0. */
x86_pop( func, aos_output );
|
||||
|
||||
/* Advance to next output */
|
||||
/* AoS side moves on by one 16-byte attribute... */
x86_mov_reg_imm( func, temp, 16 );
|
||||
x86_add( func, aos_output, temp );
|
||||
/* ...SoA side by the 64 bytes just consumed. */
x86_mov_reg_imm( func, temp, 64 );
|
||||
x86_add( func, soa_output, temp );
|
||||
/* Count down and loop while the counter is non-zero. */
x86_dec( func, num_outputs );
|
||||
x86_jcc( func, cc_NE, inner_loop );
|
||||
|
||||
/* Restore EBX */
|
||||
x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
|
||||
}
|
||||
|
||||
/**
|
||||
* Translate a TGSI vertex/fragment shader to SSE2 code.
|
||||
|
@ -2048,7 +2171,8 @@ unsigned
|
|||
tgsi_emit_sse2(
|
||||
const struct tgsi_token *tokens,
|
||||
struct x86_function *func,
|
||||
float (*immediates)[4])
|
||||
float (*immediates)[4],
|
||||
boolean do_swizzles )
|
||||
{
|
||||
struct tgsi_parse_context parse;
|
||||
boolean instruction_phase = FALSE;
|
||||
|
@ -2089,6 +2213,9 @@ tgsi_emit_sse2(
|
|||
else {
|
||||
assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
|
||||
|
||||
if (do_swizzles)
|
||||
aos_to_soa( func, 5, 0, 6, 7 );
|
||||
|
||||
x86_mov(
|
||||
func,
|
||||
get_input_base(),
|
||||
|
@ -2176,6 +2303,17 @@ tgsi_emit_sse2(
|
|||
}
|
||||
}
|
||||
|
||||
if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
|
||||
if (do_swizzles)
|
||||
soa_to_aos( func, 8, 1, 9, 10 );
|
||||
}
|
||||
|
||||
#ifdef WIN32
|
||||
emit_retw( func, 16 );
|
||||
#else
|
||||
emit_ret( func );
|
||||
#endif
|
||||
|
||||
tgsi_parse_free( &parse );
|
||||
|
||||
return ok;
|
||||
|
|
|
@ -12,8 +12,8 @@ unsigned
|
|||
tgsi_emit_sse2(
|
||||
const struct tgsi_token *tokens,
|
||||
struct x86_function *function,
|
||||
float (*immediates)[4]
|
||||
);
|
||||
float (*immediates)[4],
|
||||
boolean do_swizzles );
|
||||
|
||||
#if defined __cplusplus
|
||||
}
|
||||
|
|
|
@ -133,7 +133,7 @@ softpipe_create_fs_sse(struct softpipe_context *softpipe,
|
|||
x86_init_func( &shader->sse2_program );
|
||||
|
||||
if (!tgsi_emit_sse2( templ->tokens, &shader->sse2_program,
|
||||
shader->immediates)) {
|
||||
shader->immediates, FALSE )) {
|
||||
FREE(shader);
|
||||
return NULL;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue