From 5b4c43d98556c5a4806757513bcb196a724518c5 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Sun, 5 Sep 2010 13:17:43 +0100 Subject: [PATCH] llvmpipe: use llvm for attribute interpolant calculation Basically no change relative to hard-coded version, but this will be useful for other changes later. --- src/gallium/drivers/llvmpipe/SConscript | 5 +- src/gallium/drivers/llvmpipe/lp_bld_interp.h | 26 +- src/gallium/drivers/llvmpipe/lp_context.c | 3 + src/gallium/drivers/llvmpipe/lp_context.h | 10 +- src/gallium/drivers/llvmpipe/lp_flush.h | 1 + src/gallium/drivers/llvmpipe/lp_limits.h | 10 + src/gallium/drivers/llvmpipe/lp_setup.c | 19 +- src/gallium/drivers/llvmpipe/lp_setup.h | 29 +- src/gallium/drivers/llvmpipe/lp_setup_coef.c | 279 ------- src/gallium/drivers/llvmpipe/lp_setup_coef.h | 64 -- .../drivers/llvmpipe/lp_setup_coef_intrin.c | 228 ------ .../drivers/llvmpipe/lp_setup_context.h | 12 +- src/gallium/drivers/llvmpipe/lp_setup_line.c | 20 +- src/gallium/drivers/llvmpipe/lp_setup_point.c | 13 +- src/gallium/drivers/llvmpipe/lp_setup_tri.c | 42 +- src/gallium/drivers/llvmpipe/lp_state.h | 3 + .../drivers/llvmpipe/lp_state_derived.c | 69 +- src/gallium/drivers/llvmpipe/lp_state_fs.c | 80 +- src/gallium/drivers/llvmpipe/lp_state_fs.h | 4 + src/gallium/drivers/llvmpipe/lp_state_setup.c | 768 ++++++++++++++++++ src/gallium/drivers/llvmpipe/lp_state_setup.h | 81 ++ .../llvmpipe/lp_state_setup_fallback.c | 265 ++++++ 22 files changed, 1315 insertions(+), 716 deletions(-) delete mode 100644 src/gallium/drivers/llvmpipe/lp_setup_coef.c delete mode 100644 src/gallium/drivers/llvmpipe/lp_setup_coef.h delete mode 100644 src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c create mode 100644 src/gallium/drivers/llvmpipe/lp_state_setup.c create mode 100644 src/gallium/drivers/llvmpipe/lp_state_setup.h create mode 100644 src/gallium/drivers/llvmpipe/lp_state_setup_fallback.c diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript 
index 650435f0f19..6ddce659206 100644 --- a/src/gallium/drivers/llvmpipe/SConscript +++ b/src/gallium/drivers/llvmpipe/SConscript @@ -61,16 +61,17 @@ llvmpipe = env.ConvenienceLibrary( 'lp_scene_queue.c', 'lp_screen.c', 'lp_setup.c', + 'lp_setup_debug.c', 'lp_setup_line.c', 'lp_setup_point.c', 'lp_setup_tri.c', - 'lp_setup_coef.c', - 'lp_setup_coef_intrin.c', 'lp_setup_vbuf.c', 'lp_state_blend.c', 'lp_state_clip.c', 'lp_state_derived.c', 'lp_state_fs.c', + 'lp_state_setup.c', + 'lp_state_setup_fallback.c', 'lp_state_gs.c', 'lp_state_rasterizer.c', 'lp_state_sampler.c', diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h index 3054030f739..37479fca9dc 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_interp.h +++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h @@ -46,7 +46,31 @@ #include "tgsi/tgsi_exec.h" -#include "lp_setup.h" +/** + * Describes how to compute the interpolation coefficients (a0, dadx, dady) + * from the vertices passed into our triangle/line/point functions by the + * draw module. + * + * Vertices are treated as an array of float[4] values, indexed by + * src_index. + * + * LP_INTERP_COLOR is translated to either LP_INTERP_CONSTANT or + * LINEAR depending on flatshade state. 
+ */ +enum lp_interp { + LP_INTERP_CONSTANT, + LP_INTERP_COLOR, + LP_INTERP_LINEAR, + LP_INTERP_PERSPECTIVE, + LP_INTERP_POSITION, + LP_INTERP_FACING +}; + +struct lp_shader_input { + ushort interp:4; /* enum lp_interp */ + ushort usage_mask:4; /* bitmask of TGSI_WRITEMASK_x flags */ + ushort src_index:8; /* where to find values in incoming vertices */ +}; struct lp_build_interp_soa_context diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c index 39f2c6085ef..763432ed712 100644 --- a/src/gallium/drivers/llvmpipe/lp_context.c +++ b/src/gallium/drivers/llvmpipe/lp_context.c @@ -82,6 +82,8 @@ static void llvmpipe_destroy( struct pipe_context *pipe ) } } + lp_delete_setup_variants(llvmpipe); + align_free( llvmpipe ); } @@ -108,6 +110,7 @@ llvmpipe_create_context( struct pipe_screen *screen, void *priv ) memset(llvmpipe, 0, sizeof *llvmpipe); make_empty_list(&llvmpipe->fs_variants_list); + make_empty_list(&llvmpipe->setup_variants_list); llvmpipe->pipe.winsys = screen->winsys; llvmpipe->pipe.screen = screen; diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h index 34fa20e204a..db09c95b272 100644 --- a/src/gallium/drivers/llvmpipe/lp_context.h +++ b/src/gallium/drivers/llvmpipe/lp_context.h @@ -39,6 +39,7 @@ #include "lp_jit.h" #include "lp_setup.h" #include "lp_state_fs.h" +#include "lp_state_setup.h" struct llvmpipe_vbuf_render; @@ -48,6 +49,7 @@ struct lp_fragment_shader; struct lp_vertex_shader; struct lp_blend_state; struct lp_setup_context; +struct lp_setup_variant; struct lp_velems_state; struct llvmpipe_context { @@ -105,12 +107,9 @@ struct llvmpipe_context { /** Which vertex shader output slot contains point size */ int psize_slot; - /** Fragment shader input interpolation info */ - unsigned num_inputs; - struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS]; - /** The tiling engine */ struct lp_setup_context *setup; + struct lp_setup_variant setup_variant; /** The 
primitive drawing context */ struct draw_context *draw; @@ -120,6 +119,9 @@ struct llvmpipe_context { struct lp_fs_variant_list_item fs_variants_list; unsigned nr_fs_variants; + + struct lp_setup_variant_list_item setup_variants_list; + unsigned nr_setup_variants; }; diff --git a/src/gallium/drivers/llvmpipe/lp_flush.h b/src/gallium/drivers/llvmpipe/lp_flush.h index bb538b2bd83..3626ce4a86c 100644 --- a/src/gallium/drivers/llvmpipe/lp_flush.h +++ b/src/gallium/drivers/llvmpipe/lp_flush.h @@ -32,6 +32,7 @@ struct pipe_context; struct pipe_fence_handle; +struct pipe_resource; void llvmpipe_flush(struct pipe_context *pipe, diff --git a/src/gallium/drivers/llvmpipe/lp_limits.h b/src/gallium/drivers/llvmpipe/lp_limits.h index d1c431475d8..2538164ffaa 100644 --- a/src/gallium/drivers/llvmpipe/lp_limits.h +++ b/src/gallium/drivers/llvmpipe/lp_limits.h @@ -72,4 +72,14 @@ */ #define LP_MAX_SHADER_VARIANTS 1024 +/** + * Max number of setup variants that will be kept around. + * + * These are determined by the combination of the fragment shader + * input signature and a small amount of rasterization state (eg + * flatshading). It is likely that many active fragment shaders will + * share the same setup variant. 
+ */ +#define LP_MAX_SETUP_VARIANTS 64 + #endif /* LP_LIMITS_H */ diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c index 6674d281d1e..3854fd70af7 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.c +++ b/src/gallium/drivers/llvmpipe/lp_setup.c @@ -500,14 +500,12 @@ lp_setup_set_point_state( struct lp_setup_context *setup, } void -lp_setup_set_fs_inputs( struct lp_setup_context *setup, - const struct lp_shader_input *input, - unsigned nr ) +lp_setup_set_setup_variant( struct lp_setup_context *setup, + const struct lp_setup_variant *variant) { - LP_DBG(DEBUG_SETUP, "%s %p %u\n", __FUNCTION__, (void *) input, nr); - - memcpy( setup->fs.input, input, nr * sizeof input[0] ); - setup->fs.nr_inputs = nr; + LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); + + setup->setup.variant = variant; } void @@ -863,6 +861,13 @@ lp_setup_update_state( struct lp_setup_context *setup, setup->psize = lp->psize_slot; assert(lp->dirty == 0); + + assert(lp->setup_variant.key.size == + setup->setup.variant->key.size); + + assert(memcmp(&lp->setup_variant.key, + &setup->setup.variant->key, + setup->setup.variant->key.size) == 0); } if (update_scene) diff --git a/src/gallium/drivers/llvmpipe/lp_setup.h b/src/gallium/drivers/llvmpipe/lp_setup.h index b94061b7d49..19078ebbcb4 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.h +++ b/src/gallium/drivers/llvmpipe/lp_setup.h @@ -33,28 +33,6 @@ struct draw_context; struct vertex_info; -enum lp_interp { - LP_INTERP_CONSTANT, - LP_INTERP_LINEAR, - LP_INTERP_PERSPECTIVE, - LP_INTERP_POSITION, - LP_INTERP_FACING -}; - - -/** - * Describes how to compute the interpolation coefficients (a0, dadx, dady) - * from the vertices passed into our triangle/line/point functions by the - * draw module. - * - * Vertices are treated as an array of float[4] values, indexed by - * src_index. 
- */ -struct lp_shader_input { - enum lp_interp interp; /* how to interpolate values */ - unsigned src_index; /* where to find values in incoming vertices */ - unsigned usage_mask; /* bitmask of TGSI_WRITEMASK_x flags */ -}; struct pipe_resource; struct pipe_query; @@ -66,7 +44,7 @@ struct lp_fragment_shader_variant; struct lp_jit_context; struct llvmpipe_query; struct pipe_fence_handle; - +struct lp_setup_variant; struct lp_setup_context * lp_setup_create( struct pipe_context *pipe, @@ -110,9 +88,8 @@ lp_setup_set_point_state( struct lp_setup_context *setup, uint sprite); void -lp_setup_set_fs_inputs( struct lp_setup_context *setup, - const struct lp_shader_input *interp, - unsigned nr ); +lp_setup_set_setup_variant( struct lp_setup_context *setup, + const struct lp_setup_variant *variant ); void lp_setup_set_fs_variant( struct lp_setup_context *setup, diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef.c b/src/gallium/drivers/llvmpipe/lp_setup_coef.c deleted file mode 100644 index 8dc2688ddb6..00000000000 --- a/src/gallium/drivers/llvmpipe/lp_setup_coef.c +++ /dev/null @@ -1,279 +0,0 @@ -/************************************************************************** - * - * Copyright 2010, VMware. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -/* - * Binning code for triangles - */ - -#include "util/u_math.h" -#include "util/u_memory.h" -#include "lp_perf.h" -#include "lp_setup_context.h" -#include "lp_setup_coef.h" -#include "lp_rast.h" -#include "lp_state_fs.h" - -#if !defined(PIPE_ARCH_SSE) - -/** - * Compute a0 for a constant-valued coefficient (GL_FLAT shading). - */ -static void constant_coef( struct lp_rast_shader_inputs *inputs, - unsigned slot, - const float value, - unsigned i ) -{ - inputs->a0[slot][i] = value; - inputs->dadx[slot][i] = 0.0f; - inputs->dady[slot][i] = 0.0f; -} - - - -static void linear_coef( struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot, - unsigned vert_attr, - unsigned i) -{ - float a0 = info->v0[vert_attr][i]; - float a1 = info->v1[vert_attr][i]; - float a2 = info->v2[vert_attr][i]; - - float da01 = a0 - a1; - float da20 = a2 - a0; - float dadx = (da01 * info->dy20_ooa - info->dy01_ooa * da20); - float dady = (da20 * info->dx01_ooa - info->dx20_ooa * da01); - - inputs->dadx[slot][i] = dadx; - inputs->dady[slot][i] = dady; - - /* calculate a0 as the value which would be sampled for the - * fragment at (0,0), taking into account that we want to sample at - * pixel centers, in other words (0.5, 0.5). 
- * - * this is neat but unfortunately not a good way to do things for - * triangles with very large values of dadx or dady as it will - * result in the subtraction and re-addition from a0 of a very - * large number, which means we'll end up loosing a lot of the - * fractional bits and precision from a0. the way to fix this is - * to define a0 as the sample at a pixel center somewhere near vmin - * instead - i'll switch to this later. - */ - inputs->a0[slot][i] = a0 - (dadx * info->x0_center + - dady * info->y0_center); -} - - -/** - * Compute a0, dadx and dady for a perspective-corrected interpolant, - * for a triangle. - * We basically multiply the vertex value by 1/w before computing - * the plane coefficients (a0, dadx, dady). - * Later, when we compute the value at a particular fragment position we'll - * divide the interpolated value by the interpolated W at that fragment. - */ -static void perspective_coef( struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot, - unsigned vert_attr, - unsigned i) -{ - /* premultiply by 1/w (v[0][3] is always 1/w): - */ - float a0 = info->v0[vert_attr][i] * info->v0[0][3]; - float a1 = info->v1[vert_attr][i] * info->v1[0][3]; - float a2 = info->v2[vert_attr][i] * info->v2[0][3]; - float da01 = a0 - a1; - float da20 = a2 - a0; - float dadx = da01 * info->dy20_ooa - info->dy01_ooa * da20; - float dady = da20 * info->dx01_ooa - info->dx20_ooa * da01; - - inputs->dadx[slot][i] = dadx; - inputs->dady[slot][i] = dady; - inputs->a0[slot][i] = a0 - (dadx * info->x0_center + - dady * info->y0_center); -} - - -/** - * Special coefficient setup for gl_FragCoord. - * X and Y are trivial - * Z and W are copied from position_coef which should have already been computed. - * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask. 
- */ -static void -setup_fragcoord_coef(struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot, - unsigned usage_mask) -{ - /*X*/ - if (usage_mask & TGSI_WRITEMASK_X) { - inputs->a0[slot][0] = 0.0; - inputs->dadx[slot][0] = 1.0; - inputs->dady[slot][0] = 0.0; - } - - /*Y*/ - if (usage_mask & TGSI_WRITEMASK_Y) { - inputs->a0[slot][1] = 0.0; - inputs->dadx[slot][1] = 0.0; - inputs->dady[slot][1] = 1.0; - } - - /*Z*/ - if (usage_mask & TGSI_WRITEMASK_Z) { - linear_coef(inputs, info, slot, 0, 2); - } - - /*W*/ - if (usage_mask & TGSI_WRITEMASK_W) { - linear_coef(inputs, info, slot, 0, 3); - } -} - - -/** - * Setup the fragment input attribute with the front-facing value. - * \param frontface is the triangle front facing? - */ -static void setup_facing_coef( struct lp_rast_shader_inputs *inputs, - unsigned slot, - boolean frontface, - unsigned usage_mask) -{ - /* convert TRUE to 1.0 and FALSE to -1.0 */ - if (usage_mask & TGSI_WRITEMASK_X) - constant_coef( inputs, slot, 2.0f * frontface - 1.0f, 0 ); - - if (usage_mask & TGSI_WRITEMASK_Y) - constant_coef( inputs, slot, 0.0f, 1 ); /* wasted */ - - if (usage_mask & TGSI_WRITEMASK_Z) - constant_coef( inputs, slot, 0.0f, 2 ); /* wasted */ - - if (usage_mask & TGSI_WRITEMASK_W) - constant_coef( inputs, slot, 0.0f, 3 ); /* wasted */ -} - - -/** - * Compute the tri->coef[] array dadx, dady, a0 values. 
- */ -void lp_setup_tri_coef( struct lp_setup_context *setup, - struct lp_rast_shader_inputs *inputs, - const float (*v0)[4], - const float (*v1)[4], - const float (*v2)[4], - boolean frontfacing) -{ - unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ; - unsigned slot; - unsigned i; - struct lp_tri_info info; - float dx01 = v0[0][0] - v1[0][0]; - float dy01 = v0[0][1] - v1[0][1]; - float dx20 = v2[0][0] - v0[0][0]; - float dy20 = v2[0][1] - v0[0][1]; - float oneoverarea = 1.0f / (dx01 * dy20 - dx20 * dy01); - - info.v0 = v0; - info.v1 = v1; - info.v2 = v2; - info.frontfacing = frontfacing; - info.x0_center = v0[0][0] - setup->pixel_offset; - info.y0_center = v0[0][1] - setup->pixel_offset; - info.dx01_ooa = dx01 * oneoverarea; - info.dx20_ooa = dx20 * oneoverarea; - info.dy01_ooa = dy01 * oneoverarea; - info.dy20_ooa = dy20 * oneoverarea; - - - /* setup interpolation for all the remaining attributes: - */ - for (slot = 0; slot < setup->fs.nr_inputs; slot++) { - unsigned vert_attr = setup->fs.input[slot].src_index; - unsigned usage_mask = setup->fs.input[slot].usage_mask; - - switch (setup->fs.input[slot].interp) { - case LP_INTERP_CONSTANT: - if (setup->flatshade_first) { - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) - constant_coef(inputs, slot+1, info.v0[vert_attr][i], i); - } - else { - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) - constant_coef(inputs, slot+1, info.v2[vert_attr][i], i); - } - break; - - case LP_INTERP_LINEAR: - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) - linear_coef(inputs, &info, slot+1, vert_attr, i); - break; - - case LP_INTERP_PERSPECTIVE: - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) - perspective_coef(inputs, &info, slot+1, vert_attr, i); - fragcoord_usage_mask |= TGSI_WRITEMASK_W; - break; - - case LP_INTERP_POSITION: - /* - * The generated pixel interpolators will pick up the coeffs from - * slot 0, so all need to ensure that the usage mask is covers all 
- * usages. - */ - fragcoord_usage_mask |= usage_mask; - break; - - case LP_INTERP_FACING: - setup_facing_coef(inputs, slot+1, info.frontfacing, usage_mask); - break; - - default: - assert(0); - } - } - - /* The internal position input is in slot zero: - */ - setup_fragcoord_coef(inputs, &info, 0, fragcoord_usage_mask); -} - -#else -extern void lp_setup_coef_dummy(void); -void lp_setup_coef_dummy(void) -{ -} - -#endif diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef.h b/src/gallium/drivers/llvmpipe/lp_setup_coef.h deleted file mode 100644 index 87a3255ccc6..00000000000 --- a/src/gallium/drivers/llvmpipe/lp_setup_coef.h +++ /dev/null @@ -1,64 +0,0 @@ -/************************************************************************** - * - * Copyright 2010 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - **************************************************************************/ - - -/** - * The setup code is concerned with point/line/triangle setup and - * putting commands/data into the bins. - */ - - -#ifndef LP_SETUP_COEF_H -#define LP_SETUP_COEF_H - - -struct lp_tri_info { - - float x0_center; - float y0_center; - - /* turn these into an aligned float[4] */ - float dy01_ooa; - float dy20_ooa; - float dx01_ooa; - float dx20_ooa; - - const float (*v0)[4]; - const float (*v1)[4]; - const float (*v2)[4]; - - boolean frontfacing; /* remove eventually */ -}; - -void lp_setup_tri_coef( struct lp_setup_context *setup, - struct lp_rast_shader_inputs *inputs, - const float (*v0)[4], - const float (*v1)[4], - const float (*v2)[4], - boolean frontfacing); - -#endif diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c b/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c deleted file mode 100644 index 3742fd672b2..00000000000 --- a/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c +++ /dev/null @@ -1,228 +0,0 @@ -/************************************************************************** - * - * Copyright 2010 VMware. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -/* - * Binning code for triangles - */ - -#include "util/u_math.h" -#include "util/u_memory.h" -#include "lp_perf.h" -#include "lp_setup_context.h" -#include "lp_setup_coef.h" -#include "lp_rast.h" - -#if defined(PIPE_ARCH_SSE) -#include <emmintrin.h> - - -static void constant_coef4( struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot, - const float *attr) -{ - *(__m128 *)inputs->a0[slot] = *(__m128 *)attr; - *(__m128 *)inputs->dadx[slot] = _mm_set1_ps(0.0); - *(__m128 *)inputs->dady[slot] = _mm_set1_ps(0.0); -} - - - -/** - * Setup the fragment input attribute with the front-facing value. - * \param frontface is the triangle front facing? - */ -static void setup_facing_coef( struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot ) -{ - /* XXX: just pass frontface directly to the shader, don't bother - * treating it as an input. - */ - __m128 a0 = _mm_setr_ps(info->frontfacing ? 
1.0 : -1.0, - 0, 0, 0); - - *(__m128 *)inputs->a0[slot] = a0; - *(__m128 *)inputs->dadx[slot] = _mm_set1_ps(0.0); - *(__m128 *)inputs->dady[slot] = _mm_set1_ps(0.0); -} - - - -static void calc_coef4( struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot, - __m128 a0, - __m128 a1, - __m128 a2) -{ - __m128 da01 = _mm_sub_ps(a0, a1); - __m128 da20 = _mm_sub_ps(a2, a0); - - __m128 da01_dy20_ooa = _mm_mul_ps(da01, _mm_set1_ps(info->dy20_ooa)); - __m128 da20_dy01_ooa = _mm_mul_ps(da20, _mm_set1_ps(info->dy01_ooa)); - __m128 dadx = _mm_sub_ps(da01_dy20_ooa, da20_dy01_ooa); - - __m128 da01_dx20_ooa = _mm_mul_ps(da01, _mm_set1_ps(info->dx20_ooa)); - __m128 da20_dx01_ooa = _mm_mul_ps(da20, _mm_set1_ps(info->dx01_ooa)); - __m128 dady = _mm_sub_ps(da20_dx01_ooa, da01_dx20_ooa); - - __m128 dadx_x0 = _mm_mul_ps(dadx, _mm_set1_ps(info->x0_center)); - __m128 dady_y0 = _mm_mul_ps(dady, _mm_set1_ps(info->y0_center)); - __m128 attr_v0 = _mm_add_ps(dadx_x0, dady_y0); - __m128 attr_0 = _mm_sub_ps(a0, attr_v0); - - *(__m128 *)inputs->a0[slot] = attr_0; - *(__m128 *)inputs->dadx[slot] = dadx; - *(__m128 *)inputs->dady[slot] = dady; -} - - -static void linear_coef( struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot, - unsigned vert_attr) -{ - __m128 a0 = *(const __m128 *)info->v0[vert_attr]; - __m128 a1 = *(const __m128 *)info->v1[vert_attr]; - __m128 a2 = *(const __m128 *)info->v2[vert_attr]; - - calc_coef4(inputs, info, slot, a0, a1, a2); -} - - - -/** - * Compute a0, dadx and dady for a perspective-corrected interpolant, - * for a triangle. - * We basically multiply the vertex value by 1/w before computing - * the plane coefficients (a0, dadx, dady). - * Later, when we compute the value at a particular fragment position we'll - * divide the interpolated value by the interpolated W at that fragment. 
- */ -static void perspective_coef( struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot, - unsigned vert_attr) -{ - /* premultiply by 1/w (v[0][3] is always 1/w): - */ - __m128 a0 = *(const __m128 *)info->v0[vert_attr]; - __m128 a1 = *(const __m128 *)info->v1[vert_attr]; - __m128 a2 = *(const __m128 *)info->v2[vert_attr]; - - __m128 a0_oow = _mm_mul_ps(a0, _mm_set1_ps(info->v0[0][3])); - __m128 a1_oow = _mm_mul_ps(a1, _mm_set1_ps(info->v1[0][3])); - __m128 a2_oow = _mm_mul_ps(a2, _mm_set1_ps(info->v2[0][3])); - - calc_coef4(inputs, info, slot, a0_oow, a1_oow, a2_oow); -} - - - - - -/** - * Compute the inputs-> dadx, dady, a0 values. - */ -void lp_setup_tri_coef( struct lp_setup_context *setup, - struct lp_rast_shader_inputs *inputs, - const float (*v0)[4], - const float (*v1)[4], - const float (*v2)[4], - boolean frontfacing) -{ - unsigned slot; - struct lp_tri_info info; - float dx01 = v0[0][0] - v1[0][0]; - float dy01 = v0[0][1] - v1[0][1]; - float dx20 = v2[0][0] - v0[0][0]; - float dy20 = v2[0][1] - v0[0][1]; - float oneoverarea = 1.0f / (dx01 * dy20 - dx20 * dy01); - - info.v0 = v0; - info.v1 = v1; - info.v2 = v2; - info.frontfacing = frontfacing; - info.x0_center = v0[0][0] - setup->pixel_offset; - info.y0_center = v0[0][1] - setup->pixel_offset; - info.dx01_ooa = dx01 * oneoverarea; - info.dx20_ooa = dx20 * oneoverarea; - info.dy01_ooa = dy01 * oneoverarea; - info.dy20_ooa = dy20 * oneoverarea; - - - /* The internal position input is in slot zero: - */ - linear_coef(inputs, &info, 0, 0); - - /* setup interpolation for all the remaining attributes: - */ - for (slot = 0; slot < setup->fs.nr_inputs; slot++) { - unsigned vert_attr = setup->fs.input[slot].src_index; - - switch (setup->fs.input[slot].interp) { - case LP_INTERP_CONSTANT: - if (setup->flatshade_first) { - constant_coef4(inputs, &info, slot+1, info.v0[vert_attr]); - } - else { - constant_coef4(inputs, &info, slot+1, info.v2[vert_attr]); - } - break; - - case 
LP_INTERP_LINEAR: - linear_coef(inputs, &info, slot+1, vert_attr); - break; - - case LP_INTERP_PERSPECTIVE: - perspective_coef(inputs, &info, slot+1, vert_attr); - break; - - case LP_INTERP_POSITION: - /* - * The generated pixel interpolators will pick up the coeffs from - * slot 0. - */ - break; - - case LP_INTERP_FACING: - setup_facing_coef(inputs, &info, slot+1); - break; - - default: - assert(0); - } - } -} - -#else -extern void lp_setup_coef_dummy(void); -void lp_setup_coef_dummy(void) -{ -} -#endif diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h index 80b356476ab..ff3b69afa83 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_context.h +++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h @@ -39,6 +39,7 @@ #include "lp_rast.h" #include "lp_tile_soa.h" /* for TILE_SIZE */ #include "lp_scene.h" +#include "lp_bld_interp.h" /* for struct lp_shader_input */ #include "draw/draw_vbuf.h" #include "util/u_rect.h" @@ -49,6 +50,8 @@ #define LP_SETUP_NEW_SCISSOR 0x08 +struct lp_setup_variant; + /** Max number of scenes */ #define MAX_SCENES 2 @@ -118,9 +121,6 @@ struct lp_setup_context } state; struct { - struct lp_shader_input input[PIPE_MAX_ATTRIBS]; - unsigned nr_inputs; - const struct lp_rast_state *stored; /**< what's in the scene */ struct lp_rast_state current; /**< currently set state */ struct pipe_resource *current_tex[PIPE_MAX_SAMPLERS]; @@ -139,6 +139,10 @@ struct lp_setup_context } blend_color; + struct { + const struct lp_setup_variant *variant; + } setup; + unsigned dirty; /**< bitmask of LP_SETUP_NEW_x bits */ void (*point)( struct lp_setup_context *, @@ -181,7 +185,7 @@ lp_setup_print_vertex(struct lp_setup_context *setup, struct lp_rast_triangle * lp_setup_alloc_triangle(struct lp_scene *scene, - unsigned nr_inputs, + unsigned num_inputs, unsigned nr_planes, unsigned *tri_size); diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c index 
9f090d1992e..928ffdc5cb5 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_line.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c @@ -35,6 +35,7 @@ #include "lp_setup_context.h" #include "lp_rast.h" #include "lp_state_fs.h" +#include "lp_state_setup.h" #define NUM_CHANNELS 4 @@ -162,19 +163,20 @@ static void setup_line_coefficients( struct lp_setup_context *setup, struct lp_rast_triangle *tri, struct lp_line_info *info) { + const struct lp_setup_variant_key *key = &setup->setup.variant->key; unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ; unsigned slot; /* setup interpolation for all the remaining attributes: */ - for (slot = 0; slot < setup->fs.nr_inputs; slot++) { - unsigned vert_attr = setup->fs.input[slot].src_index; - unsigned usage_mask = setup->fs.input[slot].usage_mask; + for (slot = 0; slot < key->num_inputs; slot++) { + unsigned vert_attr = key->inputs[slot].src_index; + unsigned usage_mask = key->inputs[slot].usage_mask; unsigned i; - switch (setup->fs.input[slot].interp) { + switch (key->inputs[slot].interp) { case LP_INTERP_CONSTANT: - if (setup->flatshade_first) { + if (key->flatshade_first) { for (i = 0; i < NUM_CHANNELS; i++) if (usage_mask & (1 << i)) constant_coef(setup, tri, slot+1, info->v1[vert_attr][i], i); @@ -235,14 +237,15 @@ print_line(struct lp_setup_context *setup, const float (*v1)[4], const float (*v2)[4]) { + const struct lp_setup_variant_key *key = &setup->setup.variant->key; uint i; debug_printf("llvmpipe line\n"); - for (i = 0; i < 1 + setup->fs.nr_inputs; i++) { + for (i = 0; i < 1 + key->num_inputs; i++) { debug_printf(" v1[%d]: %f %f %f %f\n", i, v1[i][0], v1[i][1], v1[i][2], v1[i][3]); } - for (i = 0; i < 1 + setup->fs.nr_inputs; i++) { + for (i = 0; i < 1 + key->num_inputs; i++) { debug_printf(" v2[%d]: %f %f %f %f\n", i, v2[i][0], v2[i][1], v2[i][2], v2[i][3]); } @@ -269,6 +272,7 @@ try_setup_line( struct lp_setup_context *setup, const float (*v2)[4]) { struct lp_scene *scene = setup->scene; + const struct 
lp_setup_variant_key *key = &setup->setup.variant->key; struct lp_rast_triangle *line; struct lp_line_info info; float width = MAX2(1.0, setup->line_width); @@ -548,7 +552,7 @@ try_setup_line( struct lp_setup_context *setup, u_rect_find_intersection(&setup->draw_region, &bbox); line = lp_setup_alloc_triangle(scene, - setup->fs.nr_inputs, + key->num_inputs, nr_planes, &tri_bytes); if (!line) diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c index 55389871518..c98966022ee 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_point.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c @@ -36,6 +36,7 @@ #include "lp_setup_context.h" #include "lp_rast.h" #include "lp_state_fs.h" +#include "lp_state_setup.h" #include "tgsi/tgsi_scan.h" #define NUM_CHANNELS 4 @@ -152,17 +153,18 @@ setup_point_coefficients( struct lp_setup_context *setup, struct lp_rast_triangle *point, const struct point_info *info) { + const struct lp_setup_variant_key *key = &setup->setup.variant->key; unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ; unsigned slot; /* setup interpolation for all the remaining attributes: */ - for (slot = 0; slot < setup->fs.nr_inputs; slot++) { - unsigned vert_attr = setup->fs.input[slot].src_index; - unsigned usage_mask = setup->fs.input[slot].usage_mask; + for (slot = 0; slot < key->num_inputs; slot++) { + unsigned vert_attr = key->inputs[slot].src_index; + unsigned usage_mask = key->inputs[slot].usage_mask; unsigned i; - switch (setup->fs.input[slot].interp) { + switch (key->inputs[slot].interp) { case LP_INTERP_POSITION: /* * The generated pixel interpolators will pick up the coeffs from @@ -215,6 +217,7 @@ try_setup_point( struct lp_setup_context *setup, const float (*v0)[4] ) { /* x/y positions in fixed point */ + const struct lp_setup_variant_key *key = &setup->setup.variant->key; const int sizeAttr = setup->psize; const float size = (setup->point_size_per_vertex && sizeAttr > 0) ? 
v0[sizeAttr][0] @@ -266,7 +269,7 @@ try_setup_point( struct lp_setup_context *setup, u_rect_find_intersection(&setup->draw_region, &bbox); point = lp_setup_alloc_triangle(scene, - setup->fs.nr_inputs, + key->num_inputs, nr_planes, &bytes); if (!point) diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c index 5090f82ab5f..43617a6b672 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -34,9 +34,9 @@ #include "util/u_rect.h" #include "lp_perf.h" #include "lp_setup_context.h" -#include "lp_setup_coef.h" #include "lp_rast.h" #include "lp_state_fs.h" +#include "lp_state_setup.h" #define NUM_CHANNELS 4 @@ -65,16 +65,16 @@ fixed_to_float(int a) * immediately after it. * The memory is allocated from the per-scene pool, not per-tile. * \param tri_size returns number of bytes allocated - * \param nr_inputs number of fragment shader inputs + * \param num_inputs number of fragment shader inputs * \return pointer to triangle space */ struct lp_rast_triangle * lp_setup_alloc_triangle(struct lp_scene *scene, - unsigned nr_inputs, + unsigned num_inputs, unsigned nr_planes, unsigned *tri_size) { - unsigned input_array_sz = NUM_CHANNELS * (nr_inputs + 1) * sizeof(float); + unsigned input_array_sz = NUM_CHANNELS * (num_inputs + 1) * sizeof(float); struct lp_rast_triangle *tri; unsigned tri_bytes, bytes; char *inputs; @@ -101,25 +101,26 @@ lp_setup_print_vertex(struct lp_setup_context *setup, const char *name, const float (*v)[4]) { + const struct lp_setup_variant_key *key = &setup->setup.variant->key; int i, j; debug_printf(" wpos (%s[0]) xyzw %f %f %f %f\n", name, v[0][0], v[0][1], v[0][2], v[0][3]); - for (i = 0; i < setup->fs.nr_inputs; i++) { - const float *in = v[setup->fs.input[i].src_index]; + for (i = 0; i < key->num_inputs; i++) { + const float *in = v[key->inputs[i].src_index]; debug_printf(" in[%d] (%s[%d]) %s%s%s%s ", i, - name, setup->fs.input[i].src_index, - 
(setup->fs.input[i].usage_mask & 0x1) ? "x" : " ", - (setup->fs.input[i].usage_mask & 0x2) ? "y" : " ", - (setup->fs.input[i].usage_mask & 0x4) ? "z" : " ", - (setup->fs.input[i].usage_mask & 0x8) ? "w" : " "); + name, key->inputs[i].src_index, + (key->inputs[i].usage_mask & 0x1) ? "x" : " ", + (key->inputs[i].usage_mask & 0x2) ? "y" : " ", + (key->inputs[i].usage_mask & 0x4) ? "z" : " ", + (key->inputs[i].usage_mask & 0x8) ? "w" : " "); for (j = 0; j < 4; j++) - if (setup->fs.input[i].usage_mask & (1<inputs[i].usage_mask & (1<scene; + const struct lp_setup_variant_key *key = &setup->setup.variant->key; struct lp_rast_triangle *tri; int x[3]; int y[3]; @@ -288,7 +290,7 @@ do_triangle_ccw(struct lp_setup_context *setup, u_rect_find_intersection(&setup->draw_region, &bbox); tri = lp_setup_alloc_triangle(scene, - setup->fs.nr_inputs, + key->num_inputs, nr_planes, &tri_bytes); if (!tri) @@ -328,13 +330,25 @@ do_triangle_ccw(struct lp_setup_context *setup, /* Setup parameter interpolants: */ - lp_setup_tri_coef( setup, &tri->inputs, v0, v1, v2, frontfacing ); + setup->setup.variant->jit_function( v0, + v1, + v2, + frontfacing, + tri->inputs.a0, + tri->inputs.dadx, + tri->inputs.dady, + &setup->setup.variant->key ); tri->inputs.facing = frontfacing ? 
1.0F : -1.0F; tri->inputs.disable = FALSE; tri->inputs.opaque = setup->fs.current.variant->opaque; tri->inputs.state = setup->fs.stored; + if (0) + lp_dump_setup_coef(&setup->setup.variant->key, + (const float (*)[4])tri->inputs.a0, + (const float (*)[4])tri->inputs.dadx, + (const float (*)[4])tri->inputs.dady); for (i = 0; i < 3; i++) { struct lp_rast_plane *plane = &tri->plane[i]; diff --git a/src/gallium/drivers/llvmpipe/lp_state.h b/src/gallium/drivers/llvmpipe/lp_state.h index 86313e1c484..7893e9cdc0c 100644 --- a/src/gallium/drivers/llvmpipe/lp_state.h +++ b/src/gallium/drivers/llvmpipe/lp_state.h @@ -97,6 +97,9 @@ llvmpipe_set_framebuffer_state(struct pipe_context *, void llvmpipe_update_fs(struct llvmpipe_context *lp); +void +llvmpipe_update_setup(struct llvmpipe_context *lp); + void llvmpipe_update_derived(struct llvmpipe_context *llvmpipe); diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c index edd723f65f2..de242aa93ca 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_derived.c +++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c @@ -50,12 +50,13 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe) { const struct lp_fragment_shader *lpfs = llvmpipe->fs; struct vertex_info *vinfo = &llvmpipe->vertex_info; - struct lp_shader_input *inputs = llvmpipe->inputs; unsigned vs_index; uint i; /* - * Match FS inputs against VS outputs, emitting the necessary attributes. + * Match FS inputs against VS outputs, emitting the necessary + * attributes. Could cache these structs and look them up with a + * combination of fragment shader, vertex shader ids. 
*/ vinfo->num_attribs = 0; @@ -74,64 +75,10 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe) vs_index = draw_find_shader_output(llvmpipe->draw, lpfs->info.input_semantic_name[i], lpfs->info.input_semantic_index[i]); - if (vs_index < 0) { - /* - * This can happen with sprite coordinates - the vertex - * shader doesn't need to provide an output as we generate - * them internally. However, lets keep pretending that there - * is something there to not confuse other code. - */ - vs_index = 0; - } - - /* This can be pre-computed, except for flatshade: - */ - inputs[i].usage_mask = lpfs->info.input_usage_mask[i]; - - switch (lpfs->info.input_interpolate[i]) { - case TGSI_INTERPOLATE_CONSTANT: - inputs[i].interp = LP_INTERP_CONSTANT; - break; - case TGSI_INTERPOLATE_LINEAR: - inputs[i].interp = LP_INTERP_LINEAR; - break; - case TGSI_INTERPOLATE_PERSPECTIVE: - inputs[i].interp = LP_INTERP_PERSPECTIVE; - break; - default: - assert(0); - break; - } - - switch (lpfs->info.input_semantic_name[i]) { - case TGSI_SEMANTIC_FACE: - inputs[i].interp = LP_INTERP_FACING; - break; - case TGSI_SEMANTIC_POSITION: - /* Position was already emitted above - */ - inputs[i].interp = LP_INTERP_POSITION; - inputs[i].src_index = 0; - continue; - case TGSI_SEMANTIC_COLOR: - /* Colors are linearly inputs[i].interpolated in the fragment shader - * even when flatshading is active. This just tells the - * setup module to use coefficients with ddx==0 and - * ddy==0. - */ - if (llvmpipe->rasterizer->flatshade) - inputs[i].interp = LP_INTERP_CONSTANT; - break; - - default: - break; - } /* * Emit the requested fs attribute for all but position. 
*/ - - inputs[i].src_index = vinfo->num_attribs; draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index); } @@ -145,15 +92,9 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe) draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); } - llvmpipe->num_inputs = lpfs->info.num_inputs; - draw_compute_vertex_size(vinfo); - lp_setup_set_vertex_info(llvmpipe->setup, vinfo); - lp_setup_set_fs_inputs(llvmpipe->setup, - inputs, - lpfs->info.num_inputs); } @@ -190,6 +131,10 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe ) LP_NEW_QUERY)) llvmpipe_update_fs( llvmpipe ); + if (llvmpipe->dirty & (LP_NEW_FS | + LP_NEW_RASTERIZER)) + llvmpipe_update_setup( llvmpipe ); + if (llvmpipe->dirty & LP_NEW_BLEND_COLOR) lp_setup_set_blend_color(llvmpipe->setup, &llvmpipe->blend_color); diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index e54dd9f0a3c..ad058e384ad 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -255,8 +255,7 @@ generate_quad_mask(LLVMBuilderRef builder, * \param partial_mask if 1, do mask_input testing */ static void -generate_fs(struct llvmpipe_context *lp, - struct lp_fragment_shader *shader, +generate_fs(struct lp_fragment_shader *shader, const struct lp_fragment_shader_variant_key *key, LLVMBuilderRef builder, struct lp_type type, @@ -468,13 +467,13 @@ generate_blend(const struct pipe_blend_state *blend, * 2x2 pixels. 
*/ static void -generate_fragment(struct llvmpipe_context *lp, +generate_fragment(struct llvmpipe_screen *screen, struct lp_fragment_shader *shader, struct lp_fragment_shader_variant *variant, unsigned partial_mask) { - struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen); const struct lp_fragment_shader_variant_key *key = &variant->key; + struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS]; char func_name[256]; struct lp_type fs_type; struct lp_type blend_type; @@ -507,6 +506,18 @@ generate_fragment(struct llvmpipe_context *lp, unsigned chan; unsigned cbuf; + /* Adjust color input interpolation according to flatshade state: + */ + memcpy(inputs, shader->inputs, shader->info.num_inputs * sizeof inputs[0]); + for (i = 0; i < shader->info.num_inputs; i++) { + if (inputs[i].interp == LP_INTERP_COLOR) { + if (key->flatshade) + inputs[i].interp = LP_INTERP_CONSTANT; + else + inputs[i].interp = LP_INTERP_LINEAR; + } + } + /* TODO: actually pick these based on the fs and color buffer * characteristics. */ @@ -558,7 +569,6 @@ generate_fragment(struct llvmpipe_context *lp, variant->function[partial_mask] = function; - /* XXX: need to propagate noalias down into color param now we are * passing a pointer-to-pointer? */ @@ -606,8 +616,8 @@ generate_fragment(struct llvmpipe_context *lp, * already included in the shader key. 
*/ lp_build_interp_soa_init(&interp, - lp->num_inputs, - lp->inputs, + shader->info.num_inputs, + inputs, builder, fs_type, a0_ptr, dadx_ptr, dady_ptr, x, y); @@ -626,7 +636,7 @@ generate_fragment(struct llvmpipe_context *lp, depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &index, 1, ""); - generate_fs(lp, shader, key, + generate_fs(shader, key, builder, fs_type, context_ptr, @@ -823,7 +833,7 @@ lp_debug_fs_variant(const struct lp_fragment_shader_variant *variant) } static struct lp_fragment_shader_variant * -generate_variant(struct llvmpipe_context *lp, +generate_variant(struct llvmpipe_screen *screen, struct lp_fragment_shader *shader, const struct lp_fragment_shader_variant_key *key) { @@ -869,11 +879,11 @@ generate_variant(struct llvmpipe_context *lp, lp_debug_fs_variant(variant); } - generate_fragment(lp, shader, variant, RAST_EDGE_TEST); + generate_fragment(screen, shader, variant, RAST_EDGE_TEST); if (variant->opaque) { /* Specialized shader, which doesn't need to read the color buffer. 
*/ - generate_fragment(lp, shader, variant, RAST_WHOLE); + generate_fragment(screen, shader, variant, RAST_WHOLE); } else { variant->jit_function[RAST_WHOLE] = variant->jit_function[RAST_EDGE_TEST]; } @@ -888,6 +898,7 @@ llvmpipe_create_fs_state(struct pipe_context *pipe, { struct lp_fragment_shader *shader; int nr_samplers; + int i; shader = CALLOC_STRUCT(lp_fragment_shader); if (!shader) @@ -907,6 +918,46 @@ llvmpipe_create_fs_state(struct pipe_context *pipe, shader->variant_key_size = Offset(struct lp_fragment_shader_variant_key, sampler[nr_samplers]); + for (i = 0; i < shader->info.num_inputs; i++) { + shader->inputs[i].usage_mask = shader->info.input_usage_mask[i]; + + switch (shader->info.input_interpolate[i]) { + case TGSI_INTERPOLATE_CONSTANT: + shader->inputs[i].interp = LP_INTERP_CONSTANT; + break; + case TGSI_INTERPOLATE_LINEAR: + shader->inputs[i].interp = LP_INTERP_LINEAR; + break; + case TGSI_INTERPOLATE_PERSPECTIVE: + shader->inputs[i].interp = LP_INTERP_PERSPECTIVE; + break; + default: + assert(0); + break; + } + + switch (shader->info.input_semantic_name[i]) { + case TGSI_SEMANTIC_COLOR: + /* Colors may be either linearly or constant interpolated in + * the fragment shader, but that information isn't available + * here. Mark color inputs and fix them up later. 
+ */ + shader->inputs[i].interp = LP_INTERP_COLOR; + break; + case TGSI_SEMANTIC_FACE: + shader->inputs[i].interp = LP_INTERP_FACING; + break; + case TGSI_SEMANTIC_POSITION: + /* Position was already emitted above + */ + shader->inputs[i].interp = LP_INTERP_POSITION; + shader->inputs[i].src_index = 0; + continue; + } + + shader->inputs[i].src_index = i+1; + } + if (LP_DEBUG & DEBUG_TGSI) { unsigned attrib; debug_printf("llvmpipe: Create fragment shader #%u %p:\n", shader->no, (void *) shader); @@ -1161,6 +1212,7 @@ make_variant_key(struct llvmpipe_context *lp, void llvmpipe_update_fs(struct llvmpipe_context *lp) { + struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen); struct lp_fragment_shader *shader = lp->fs; struct lp_fragment_shader_variant_key key; struct lp_fragment_shader_variant *variant = NULL; @@ -1201,7 +1253,7 @@ llvmpipe_update_fs(struct llvmpipe_context *lp) } t0 = os_time_get(); - variant = generate_variant(lp, shader, &key); + variant = generate_variant(screen, shader, &key); t1 = os_time_get(); dt = t1 - t0; @@ -1221,6 +1273,10 @@ llvmpipe_update_fs(struct llvmpipe_context *lp) + + + + void llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe) { diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h index 2914e7d7efd..f73c7801c00 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.h +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h @@ -34,6 +34,7 @@ #include "pipe/p_state.h" #include "tgsi/tgsi_scan.h" /* for tgsi_shader_info */ #include "gallivm/lp_bld_sample.h" /* for struct lp_sampler_static_state */ +#include "lp_bld_interp.h" /* for struct lp_shader_input */ struct tgsi_token; @@ -105,6 +106,9 @@ struct lp_fragment_shader unsigned no; unsigned variants_created; unsigned variants_cached; + + /** Fragment shader input interpolation info */ + struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS]; }; diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.c 
b/src/gallium/drivers/llvmpipe/lp_state_setup.c new file mode 100644 index 00000000000..aa9147a1a15 --- /dev/null +++ b/src/gallium/drivers/llvmpipe/lp_state_setup.c @@ -0,0 +1,768 @@ +/************************************************************************** + * + * Copyright 2010 VMware. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_simple_list.h" +#include "os/os_time.h" +#include "gallivm/lp_bld_debug.h" +#include "gallivm/lp_bld_init.h" +#include "gallivm/lp_bld_intr.h" +#include <llvm-c/Analysis.h> /* for LLVMVerifyFunction */ + +#include "lp_perf.h" +#include "lp_debug.h" +#include "lp_flush.h" +#include "lp_screen.h" +#include "lp_context.h" +#include "lp_setup_context.h" +#include "lp_rast.h" +#include "lp_state.h" +#include "lp_state_fs.h" +#include "lp_state_setup.h" + + + +/* currently organized to interpolate full float[4] attributes even + * when some elements are unused. Later, can pack vertex data more + * closely. + */ + + +struct lp_setup_args +{ + /* Function arguments: + */ + LLVMValueRef v0; + LLVMValueRef v1; + LLVMValueRef v2; + LLVMValueRef facing; /* boolean */ + LLVMValueRef a0; + LLVMValueRef dadx; + LLVMValueRef dady; + + /* Derived: + */ + LLVMValueRef x0_center; + LLVMValueRef y0_center; + LLVMValueRef dy20_ooa; + LLVMValueRef dy01_ooa; + LLVMValueRef dx20_ooa; + LLVMValueRef dx01_ooa; +}; + +static LLVMTypeRef type4f(void) +{ + return LLVMVectorType(LLVMFloatType(), 4); +} + + +/* Equivalent of _mm_setr_ps(a,b,c,d) + */ +static LLVMValueRef vec4f(LLVMBuilderRef bld, + LLVMValueRef a, LLVMValueRef b, LLVMValueRef c, LLVMValueRef d, + const char *name) +{ + LLVMValueRef i0 = LLVMConstInt(LLVMInt32Type(), 0, 0); + LLVMValueRef i1 = LLVMConstInt(LLVMInt32Type(), 1, 0); + LLVMValueRef i2 = LLVMConstInt(LLVMInt32Type(), 2, 0); + LLVMValueRef i3 = LLVMConstInt(LLVMInt32Type(), 3, 0); + + LLVMValueRef res = LLVMGetUndef(type4f()); + + res = LLVMBuildInsertElement(bld, res, a, i0, ""); + res = LLVMBuildInsertElement(bld, res, b, i1, ""); + res = LLVMBuildInsertElement(bld, res, c, i2, ""); + res = LLVMBuildInsertElement(bld, res, d, i3, name); + + return res; +} + +/* Equivalent of _mm_set1_ps(a) + */ +static LLVMValueRef
vec4f_from_scalar(LLVMBuilderRef bld, + LLVMValueRef a, + const char *name) +{ + LLVMValueRef res = LLVMGetUndef(type4f()); + int i; + + for(i = 0; i < 4; ++i) { + LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0); + res = LLVMBuildInsertElement(bld, res, a, index, i == 3 ? name : ""); + } + + return res; +} + +static void +store_coef(LLVMBuilderRef builder, + struct lp_setup_args *args, + unsigned slot, + LLVMValueRef a0, + LLVMValueRef dadx, + LLVMValueRef dady) +{ + LLVMValueRef idx = LLVMConstInt(LLVMInt32Type(), slot, 0); + + LLVMBuildStore(builder, + a0, + LLVMBuildGEP(builder, args->a0, &idx, 1, "")); + + LLVMBuildStore(builder, + dadx, + LLVMBuildGEP(builder, args->dadx, &idx, 1, "")); + + LLVMBuildStore(builder, + dady, + LLVMBuildGEP(builder, args->dady, &idx, 1, "")); +} + + + +static void +emit_constant_coef4( LLVMBuilderRef builder, + struct lp_setup_args *args, + unsigned slot, + LLVMValueRef vert, + unsigned attr) +{ + LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0); + LLVMValueRef zerovec = vec4f_from_scalar(builder, zero, "zero"); + LLVMValueRef idx = LLVMConstInt(LLVMInt32Type(), attr, 0); + LLVMValueRef attr_ptr = LLVMBuildGEP(builder, vert, &idx, 1, "attr_ptr"); + LLVMValueRef vert_attr = LLVMBuildLoad(builder, attr_ptr, "vert_attr"); + + store_coef(builder, args, slot, vert_attr, zerovec, zerovec); +} + + + +/** + * Setup the fragment input attribute with the front-facing value. + * \param frontface is the triangle front facing? 
+ */ +static void +emit_facing_coef( LLVMBuilderRef builder, + struct lp_setup_args *args, + unsigned slot ) +{ + LLVMValueRef a0_0 = args->facing; + LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0); + LLVMValueRef a0 = vec4f(builder, a0_0, zero, zero, zero, "facing"); + LLVMValueRef zerovec = vec4f_from_scalar(builder, zero, "zero"); + + store_coef(builder, args, slot, a0, zerovec, zerovec); +} + + +static LLVMValueRef +vert_attrib(LLVMBuilderRef b, + LLVMValueRef vert, + int attr, + int elem, + const char *name) +{ + LLVMValueRef idx[2]; + idx[0] = LLVMConstInt(LLVMInt32Type(), attr, 0); + idx[1] = LLVMConstInt(LLVMInt32Type(), elem, 0); + return LLVMBuildLoad(b, LLVMBuildGEP(b, vert, idx, 2, ""), name); +} + + + +static void +emit_coef4( LLVMBuilderRef b, + struct lp_setup_args *args, + unsigned slot, + LLVMValueRef a0, + LLVMValueRef a1, + LLVMValueRef a2) +{ + LLVMValueRef dy20_ooa = args->dy20_ooa; + LLVMValueRef dy01_ooa = args->dy01_ooa; + LLVMValueRef dx20_ooa = args->dx20_ooa; + LLVMValueRef dx01_ooa = args->dx01_ooa; + LLVMValueRef x0_center = args->x0_center; + LLVMValueRef y0_center = args->y0_center; + + /* XXX: using fsub, fmul on vector types -- does this work?? 
+ */ + LLVMValueRef da01 = LLVMBuildFSub(b, a0, a1, "da01"); + LLVMValueRef da20 = LLVMBuildFSub(b, a2, a0, "da20"); + + /* Calculate dadx (vec4f) + */ + LLVMValueRef da01_dy20_ooa = LLVMBuildFMul(b, da01, dy20_ooa, "da01_dy20_ooa"); + LLVMValueRef da20_dy01_ooa = LLVMBuildFMul(b, da20, dy01_ooa, "da20_dy01_ooa"); + LLVMValueRef dadx = LLVMBuildFSub(b, da01_dy20_ooa, da20_dy01_ooa, "dadx"); + + /* Calculate dady (vec4f) + */ + LLVMValueRef da01_dx20_ooa = LLVMBuildFMul(b, da01, dx20_ooa, "da01_dx20_ooa"); + LLVMValueRef da20_dx01_ooa = LLVMBuildFMul(b, da20, dx01_ooa, "da20_dx01_ooa"); + LLVMValueRef dady = LLVMBuildFSub(b, da20_dx01_ooa, da01_dx20_ooa, "dady"); + + /* Calculate a0 - the attribute value at the origin + */ + LLVMValueRef dadx_x0 = LLVMBuildFMul(b, dadx, x0_center, "dadx_x0"); + LLVMValueRef dady_y0 = LLVMBuildFMul(b, dady, y0_center, "dady_y0"); + LLVMValueRef attr_v0 = LLVMBuildFAdd(b, dadx_x0, dady_y0, "attr_v0"); + LLVMValueRef attr_0 = LLVMBuildFSub(b, a0, attr_v0, "attr_0"); + + store_coef(b, args, slot, attr_0, dadx, dady); +} + + +static void +emit_linear_coef( LLVMBuilderRef b, + struct lp_setup_args *args, + unsigned slot, + unsigned vert_attr) +{ + LLVMValueRef idx = LLVMConstInt(LLVMInt32Type(), vert_attr, 0); + + LLVMValueRef a0 = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v0, &idx, 1, ""), "v0a"); + LLVMValueRef a1 = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v1, &idx, 1, ""), "v1a"); + LLVMValueRef a2 = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v2, &idx, 1, ""), "v2a"); + + emit_coef4(b, args, slot, a0, a1, a2); +} + + + +/** + * Compute a0, dadx and dady for a perspective-corrected interpolant, + * for a triangle. + * We basically multiply the vertex value by 1/w before computing + * the plane coefficients (a0, dadx, dady). + * Later, when we compute the value at a particular fragment position we'll + * divide the interpolated value by the interpolated W at that fragment. 
+ */ +static void +emit_perspective_coef( LLVMBuilderRef b, + struct lp_setup_args *args, + unsigned slot, + unsigned vert_attr) +{ + /* premultiply by 1/w (v[0][3] is always 1/w): + */ + LLVMValueRef idx = LLVMConstInt(LLVMInt32Type(), vert_attr, 0); + + LLVMValueRef v0a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v0, &idx, 1, ""), "v0a"); + LLVMValueRef v1a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v1, &idx, 1, ""), "v1a"); + LLVMValueRef v2a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v2, &idx, 1, ""), "v2a"); + + LLVMValueRef v0_oow = vec4f_from_scalar(b, vert_attrib(b, args->v0, 0, 3, ""), "v0_oow"); + LLVMValueRef v1_oow = vec4f_from_scalar(b, vert_attrib(b, args->v1, 0, 3, ""), "v1_oow"); + LLVMValueRef v2_oow = vec4f_from_scalar(b, vert_attrib(b, args->v2, 0, 3, ""), "v2_oow"); + + LLVMValueRef v0_oow_v0a = LLVMBuildFMul(b, v0a, v0_oow, "v0_oow_v0a"); + LLVMValueRef v1_oow_v1a = LLVMBuildFMul(b, v1a, v1_oow, "v1_oow_v1a"); + LLVMValueRef v2_oow_v2a = LLVMBuildFMul(b, v2a, v2_oow, "v2_oow_v2a"); + + emit_coef4(b, args, slot, v0_oow_v0a, v1_oow_v1a, v2_oow_v2a); +} + + +static void +emit_position_coef( LLVMBuilderRef builder, + struct lp_setup_args *args, + int slot, int attrib ) +{ + emit_linear_coef(builder, args, slot, attrib); +} + + + + +/** + * Compute the inputs-> dadx, dady, a0 values. 
+ */ +static void +emit_tri_coef( LLVMBuilderRef builder, + const struct lp_setup_variant_key *key, + struct lp_setup_args *args ) +{ + unsigned slot; + + /* The internal position input is in slot zero: + */ + emit_position_coef(builder, args, 0, 0); + + /* setup interpolation for all the remaining attributes: + */ + for (slot = 0; slot < key->num_inputs; slot++) { + unsigned vert_attr = key->inputs[slot].src_index; + + switch (key->inputs[slot].interp) { + case LP_INTERP_CONSTANT: + if (key->flatshade_first) { + emit_constant_coef4(builder, args, slot+1, args->v0, vert_attr); + } + else { + emit_constant_coef4(builder, args, slot+1, args->v2, vert_attr); + } + break; + + case LP_INTERP_LINEAR: + emit_linear_coef(builder, args, slot+1, vert_attr); + break; + + case LP_INTERP_PERSPECTIVE: + emit_perspective_coef(builder, args, slot+1, vert_attr); + break; + + case LP_INTERP_POSITION: + /* + * The generated pixel interpolators will pick up the coeffs from + * slot 0. + */ + break; + + case LP_INTERP_FACING: + emit_facing_coef(builder, args, slot+1); + break; + + default: + assert(0); + } + } +} + + +/* XXX: This is generic code, share with fs/vs codegen: + */ +static lp_jit_setup_triangle +finalize_function(struct llvmpipe_screen *screen, + LLVMBuilderRef builder, + LLVMValueRef function) +{ + void *f; + + /* Verify the LLVM IR. If invalid, dump and abort */ +#ifdef DEBUG + if (LLVMVerifyFunction(function, LLVMPrintMessageAction)) { + if (1) + lp_debug_dump_value(function); + abort(); + } +#endif + + /* Apply optimizations to LLVM IR */ + LLVMRunFunctionPassManager(screen->pass, function); + + if (gallivm_debug & GALLIVM_DEBUG_IR) + { + /* Print the LLVM IR to stderr */ + lp_debug_dump_value(function); + debug_printf("\n"); + } + + /* + * Translate the LLVM IR into machine code. 
+ */ + f = LLVMGetPointerToGlobal(screen->engine, function); + + if (gallivm_debug & GALLIVM_DEBUG_ASM) + { + lp_disassemble(f); + } + + lp_func_delete_body(function); + + return f; +} + +/* XXX: Generic code: + */ +static void +lp_emit_emms(LLVMBuilderRef builder) +{ +#ifdef PIPE_ARCH_X86 + /* Avoid corrupting the FPU stack on 32bit OSes. */ + lp_build_intrinsic(builder, "llvm.x86.mmx.emms", LLVMVoidType(), NULL, 0); +#endif +} + + +/* XXX: generic code: + */ +static void +set_noalias(LLVMBuilderRef builder, + LLVMValueRef function, + const LLVMTypeRef *arg_types, + int nr_args) +{ + int i; + for(i = 0; i < Elements(arg_types); ++i) + if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) + LLVMAddAttribute(LLVMGetParam(function, i), + LLVMNoAliasAttribute); +} + +static void +init_args(LLVMBuilderRef b, + struct lp_setup_args *args, + const struct lp_setup_variant *variant) +{ + LLVMValueRef v0_x = vert_attrib(b, args->v0, 0, 0, "v0_x"); + LLVMValueRef v0_y = vert_attrib(b, args->v0, 0, 1, "v0_y"); + + LLVMValueRef v1_x = vert_attrib(b, args->v1, 0, 0, "v1_x"); + LLVMValueRef v1_y = vert_attrib(b, args->v1, 0, 1, "v1_y"); + + LLVMValueRef v2_x = vert_attrib(b, args->v2, 0, 0, "v2_x"); + LLVMValueRef v2_y = vert_attrib(b, args->v2, 0, 1, "v2_y"); + + LLVMValueRef pixel_center = LLVMConstReal(LLVMFloatType(), + variant->key.pixel_center_half ? 
0.5 : 0); + + LLVMValueRef x0_center = LLVMBuildFSub(b, v0_x, pixel_center, "x0_center" ); + LLVMValueRef y0_center = LLVMBuildFSub(b, v0_y, pixel_center, "y0_center" ); + + LLVMValueRef dx01 = LLVMBuildFSub(b, v0_x, v1_x, "dx01"); + LLVMValueRef dy01 = LLVMBuildFSub(b, v0_y, v1_y, "dy01"); + LLVMValueRef dx20 = LLVMBuildFSub(b, v2_x, v0_x, "dx20"); + LLVMValueRef dy20 = LLVMBuildFSub(b, v2_y, v0_y, "dy20"); + + LLVMValueRef one = LLVMConstReal(LLVMFloatType(), 1.0); + LLVMValueRef e = LLVMBuildFMul(b, dx01, dy20, "e"); + LLVMValueRef f = LLVMBuildFMul(b, dx20, dy01, "f"); + LLVMValueRef ooa = LLVMBuildFDiv(b, one, LLVMBuildFSub(b, e, f, ""), "ooa"); + + LLVMValueRef dy20_ooa = LLVMBuildFMul(b, dy20, ooa, "dy20_ooa"); + LLVMValueRef dy01_ooa = LLVMBuildFMul(b, dy01, ooa, "dy01_ooa"); + LLVMValueRef dx20_ooa = LLVMBuildFMul(b, dx20, ooa, "dx20_ooa"); + LLVMValueRef dx01_ooa = LLVMBuildFMul(b, dx01, ooa, "dx01_ooa"); + + args->dy20_ooa = vec4f_from_scalar(b, dy20_ooa, "dy20_ooa_4f"); + args->dy01_ooa = vec4f_from_scalar(b, dy01_ooa, "dy01_ooa_4f"); + + args->dx20_ooa = vec4f_from_scalar(b, dx20_ooa, "dx20_ooa_4f"); + args->dx01_ooa = vec4f_from_scalar(b, dx01_ooa, "dx01_ooa_4f"); + + args->x0_center = vec4f_from_scalar(b, x0_center, "x0_center_4f"); + args->y0_center = vec4f_from_scalar(b, y0_center, "y0_center_4f"); +} + +/** + * Generate the runtime callable function for the coefficient calculation. 
+ * + */ +static struct lp_setup_variant * +generate_setup_variant(struct llvmpipe_screen *screen, + struct lp_setup_variant_key *key) +{ + struct lp_setup_variant *variant = NULL; + struct lp_setup_args args; + char func_name[256]; + LLVMTypeRef vec4f_type; + LLVMTypeRef func_type; + LLVMTypeRef arg_types[8]; + LLVMBasicBlockRef block; + LLVMBuilderRef builder; + int64_t t0, t1; + + if (0) + goto fail; + + variant = CALLOC_STRUCT(lp_setup_variant); + if (variant == NULL) + goto fail; + + if (LP_DEBUG & DEBUG_COUNTERS) { + t0 = os_time_get(); + } + + memcpy(&variant->key, key, key->size); + variant->list_item_global.base = variant; + + util_snprintf(func_name, sizeof(func_name), "fs%u_setup%u", + 0, + variant->no); + + /* Currently always deal with full 4-wide vertex attributes from + * the vertices. + */ + + vec4f_type = LLVMVectorType(LLVMFloatType(), 4); + + arg_types[0] = LLVMPointerType(vec4f_type, 0); /* v0 */ + arg_types[1] = LLVMPointerType(vec4f_type, 0); /* v1 */ + arg_types[2] = LLVMPointerType(vec4f_type, 0); /* v2 */ + arg_types[3] = LLVMInt32Type(); /* facing */ + arg_types[4] = LLVMPointerType(vec4f_type, 0); /* a0, aligned */ + arg_types[5] = LLVMPointerType(vec4f_type, 0); /* dadx, aligned */ + arg_types[6] = LLVMPointerType(vec4f_type, 0); /* dady, aligned */ + arg_types[7] = LLVMPointerType(LLVMVoidType(), 0); /* key, unused */ + + func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0); + + variant->function = LLVMAddFunction(screen->module, func_name, func_type); + if (!variant->function) + goto fail; + + LLVMSetFunctionCallConv(variant->function, LLVMCCallConv); + + args.v0 = LLVMGetParam(variant->function, 0); + args.v1 = LLVMGetParam(variant->function, 1); + args.v2 = LLVMGetParam(variant->function, 2); + args.facing = LLVMGetParam(variant->function, 3); + args.a0 = LLVMGetParam(variant->function, 4); + args.dadx = LLVMGetParam(variant->function, 5); + args.dady = LLVMGetParam(variant->function, 6); + + 
lp_build_name(args.v0, "in_v0"); + lp_build_name(args.v1, "in_v1"); + lp_build_name(args.v2, "in_v2"); + lp_build_name(args.facing, "in_facing"); + lp_build_name(args.a0, "out_a0"); + lp_build_name(args.dadx, "out_dadx"); + lp_build_name(args.dady, "out_dady"); + + /* + * Function body + */ + block = LLVMAppendBasicBlock(variant->function, "entry"); + builder = LLVMCreateBuilder(); + LLVMPositionBuilderAtEnd(builder, block); + + set_noalias(builder, variant->function, arg_types, Elements(arg_types)); + init_args(builder, &args, variant); + emit_tri_coef(builder, &variant->key, &args); + + lp_emit_emms(builder); + LLVMBuildRetVoid(builder); + LLVMDisposeBuilder(builder); + + variant->jit_function = finalize_function(screen, builder, + variant->function); + if (!variant->jit_function) + goto fail; + + /* + * Update timing information: + */ + if (LP_DEBUG & DEBUG_COUNTERS) { + t1 = os_time_get(); + LP_COUNT_ADD(llvm_compile_time, t1 - t0); + LP_COUNT_ADD(nr_llvm_compiles, 1); + } + + return variant; + +fail: + if (variant) { + if (variant->function) { + if (variant->jit_function) + LLVMFreeMachineCodeForFunction(screen->engine, + variant->function); + LLVMDeleteFunction(variant->function); + } + FREE(variant); + } + + return NULL; +} + + + +static void +lp_make_setup_variant_key(struct llvmpipe_context *lp, + struct lp_setup_variant_key *key) +{ + struct lp_fragment_shader *fs = lp->fs; + unsigned i; + + assert(sizeof key->inputs[0] == sizeof(ushort)); + + key->num_inputs = fs->info.num_inputs; + key->flatshade_first = lp->rasterizer->flatshade_first; + key->pixel_center_half = lp->rasterizer->gl_rasterization_rules; + key->size = Offset(struct lp_setup_variant_key, + inputs[key->num_inputs]); + key->pad = 0; + + memcpy(key->inputs, fs->inputs, key->num_inputs * sizeof key->inputs[0]); + for (i = 0; i < key->num_inputs; i++) { + if (key->inputs[i].interp == LP_INTERP_COLOR) { + if (lp->rasterizer->flatshade) + key->inputs[i].interp = LP_INTERP_CONSTANT; + else + 
key->inputs[i].interp = LP_INTERP_LINEAR; + } + } + +} + + +static void +remove_setup_variant(struct llvmpipe_context *lp, + struct lp_setup_variant *variant) +{ + struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen); + + if (gallivm_debug & GALLIVM_DEBUG_IR) { + debug_printf("llvmpipe: del setup_variant #%u total %u\n", + variant->no, lp->nr_setup_variants); + } + + if (variant->function) { + if (variant->jit_function) + LLVMFreeMachineCodeForFunction(screen->engine, + variant->function); + LLVMDeleteFunction(variant->function); + } + + remove_from_list(&variant->list_item_global); + lp->nr_setup_variants--; + FREE(variant); +} + + + +/* When the number of setup variants exceeds a threshold, cull a + * fraction (currently a quarter) of them. + */ +static void +cull_setup_variants(struct llvmpipe_context *lp) +{ + struct pipe_context *pipe = &lp->pipe; + int i; + + /* + * XXX: we need to flush the context until we have some sort of reference + * counting in fragment shaders as they may still be binned + * Flushing alone might not be sufficient we need to wait on it too. + */ + llvmpipe_finish(pipe, __FUNCTION__); + + for (i = 0; i < LP_MAX_SETUP_VARIANTS / 4; i++) { + struct lp_setup_variant_list_item *item = last_elem(&lp->setup_variants_list); + remove_setup_variant(lp, item->base); + } +} + + +/** + * Update fragment/vertex shader linkage state. This is called just + * prior to drawing something when some fragment-related state has + * changed. 
+ */
+void
+llvmpipe_update_setup(struct llvmpipe_context *lp)
+{
+   struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
+
+   /* The key is built in place inside lp->setup_variant, which also
+    * serves as the fallback variant further down.
+    */
+   struct lp_setup_variant_key *key = &lp->setup_variant.key;
+   struct lp_setup_variant *variant = NULL;
+   struct lp_setup_variant_list_item *li;
+
+   lp_make_setup_variant_key(lp, key);
+
+   /* Look for an existing variant whose key matches byte-for-byte over
+    * the first key->size bytes.
+    */
+   foreach(li, &lp->setup_variants_list) {
+      if(li->base->key.size == key->size &&
+         memcmp(&li->base->key, key, key->size) == 0) {
+         variant = li->base;
+         break;
+      }
+   }
+
+   if (variant) {
+      /* Cache hit: keep the list ordered most-recently-used first. */
+      move_to_head(&lp->setup_variants_list, &variant->list_item_global);
+   }
+   else {
+      if (lp->nr_setup_variants >= LP_MAX_SETUP_VARIANTS) {
+         cull_setup_variants(lp);
+      }
+
+      variant = generate_setup_variant(screen, key);
+      if (variant) {
+         insert_at_head(&lp->setup_variants_list, &variant->list_item_global);
+         lp->nr_setup_variants++;
+      }
+      else {
+         /* Keep the old path around for debugging, and also perhaps
+          * in case malloc fails during compilation.
+          */
+         variant = &lp->setup_variant;
+         variant->jit_function = lp_setup_tri_fallback;
+      }
+   }
+
+   lp_setup_set_setup_variant(lp->setup,
+                              variant);
+}
+
+/**
+ * Destroy every setup variant on the context's variant list.
+ */
+void
+lp_delete_setup_variants(struct llvmpipe_context *lp)
+{
+   struct lp_setup_variant_list_item *li;
+   li = first_elem(&lp->setup_variants_list);
+   while(!at_end(&lp->setup_variants_list, li)) {
+      /* Fetch the successor first: remove_setup_variant() frees the
+       * variant that embeds li.
+       */
+      struct lp_setup_variant_list_item *next = next_elem(li);
+      remove_setup_variant(lp, li->base);
+      li = next;
+   }
+}
+
+/**
+ * Debug helper: print the interpolation coefficients of a triangle.
+ *
+ * Slot 0 of each array is printed as the position; slot 1 + n is
+ * printed as shader input n, restricted to the channels enabled in
+ * that input's usage_mask.
+ *
+ * \param key    variant key describing the inputs
+ * \param sa0    a0 coefficients, one float[4] per slot
+ * \param sdadx  dadx coefficients, one float[4] per slot
+ * \param sdady  dady coefficients, one float[4] per slot
+ */
+void
+lp_dump_setup_coef( const struct lp_setup_variant_key *key,
+                    const float (*sa0)[4],
+                    const float (*sdadx)[4],
+                    const float (*sdady)[4])
+{
+   /* unsigned so that the values match the %u conversions below */
+   unsigned i, slot;
+
+   for (i = 0; i < NUM_CHANNELS; i++) {
+      float a0   = sa0  [0][i];
+      float dadx = sdadx[0][i];
+      float dady = sdady[0][i];
+
+      debug_printf("POS.%c: a0 = %f, dadx = %f, dady = %f\n",
+                   "xyzw"[i],
+                   a0, dadx, dady);
+   }
+
+   for (slot = 0; slot < key->num_inputs; slot++) {
+      unsigned usage_mask = key->inputs[slot].usage_mask;
+      for (i = 0; i < NUM_CHANNELS; i++) {
+         if (usage_mask & (1u << i)) {
+            float a0   = sa0  [1 + slot][i];
+            float dadx = sdadx[1 + slot][i];
+            float dady = sdady[1 + slot][i];
+
+            debug_printf("IN[%u].%c: a0 = %f, dadx = %f, dady = %f\n",
+                         slot,
+                         "xyzw"[i],
+                         a0, dadx, dady);
+         }
+      }
+   }
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.h b/src/gallium/drivers/llvmpipe/lp_state_setup.h
new file mode 100644
index 00000000000..2b080fbc321
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.h
@@ -0,0 +1,81 @@
+#ifndef LP_STATE_SETUP_H
+#define LP_STATE_SETUP_H
+
+#include "lp_bld_interp.h"
+
+
+struct llvmpipe_context;
+struct lp_setup_variant;
+
+/* Doubly-linked list node; base points at the variant the node
+ * belongs to.
+ */
+struct lp_setup_variant_list_item
+{
+   struct lp_setup_variant *base;
+   struct lp_setup_variant_list_item *next, *prev;
+};
+
+
+/* Key identifying a setup variant.  Keys are compared with memcmp()
+ * over their first 'size' bytes, so every bit of the header must be
+ * written explicitly: the bit-fields below add up to exactly 32 bits
+ * (8 + 1 + 1 + 6 + 16) so that no unnamed padding is left between
+ * 'pad' and 'size'.
+ */
+struct lp_setup_variant_key {
+   unsigned num_inputs:8;
+   unsigned flatshade_first:1;
+   unsigned pixel_center_half:1;
+   unsigned pad:6;
+   unsigned size:16;
+   struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
+};
+
+
+/* Signature shared by the generated setup functions and the fallback. */
+typedef void (*lp_jit_setup_triangle)( const float (*v0)[4],
+                                       const float (*v1)[4],
+                                       const float (*v2)[4],
+                                       boolean front_facing,
+                                       float (*a0)[4],
+                                       float (*dadx)[4],
+                                       float (*dady)[4],
+                                       const struct lp_setup_variant_key *key );
+
+
+
+
+/* At this stage, for a given variant key, we create a
+ * draw_vertex_info struct telling the draw module how to format the
+ * vertices, and an llvm-generated function which calculates the
+ * attribute interpolants (a0, dadx, dady) from three of those
+ * vertices.
+ */
+struct lp_setup_variant {
+   struct lp_setup_variant_key key;
+
+   struct lp_setup_variant_list_item list_item_global;
+
+   /* XXX: this is a pointer to the LLVM IR.  Once jit_function is
+    * generated, we never need to use the IR again - need to find a
+    * way to release this data without destroying the generated
+    * assembly.
+    */
+   LLVMValueRef function;
+
+   /* The actual generated setup function:
+    */
+   lp_jit_setup_triangle jit_function;
+
+   unsigned no;
+};
+
+/* Non-LLVM implementation with the lp_jit_setup_triangle signature. */
+void lp_setup_tri_fallback( const float (*v0)[4],
+                            const float (*v1)[4],
+                            const float (*v2)[4],
+                            boolean front_facing,
+                            float (*a0)[4],
+                            float (*dadx)[4],
+                            float (*dady)[4],
+                            const struct lp_setup_variant_key *key );
+
+void lp_delete_setup_variants(struct llvmpipe_context *lp);
+
+void
+lp_dump_setup_coef( const struct lp_setup_variant_key *key,
+                    const float (*sa0)[4],
+                    const float (*sdadx)[4],
+                    const float (*sdady)[4]);
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup_fallback.c b/src/gallium/drivers/llvmpipe/lp_state_setup_fallback.c
new file mode 100644
index 00000000000..1922efcc88d
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup_fallback.c
@@ -0,0 +1,265 @@
+/**************************************************************************
+ *
+ * Copyright 2010, VMware.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/*
+ * Fallback (non-llvm) path for triangle setup.  Will remove once llvm
+ * is up and running.
+ *
+ * TODO: line/point setup.
+ */
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "lp_state_setup.h"
+
+
+
+#if defined(PIPE_ARCH_SSE)
+/* __m128 and the _mm_*_ps intrinsics used below */
+#include <xmmintrin.h>
+
+/* Per-triangle state shared by the coefficient helpers below. */
+struct setup_args {
+   float (*a0)[4];              /* aligned */
+   float (*dadx)[4];            /* aligned */
+   float (*dady)[4];            /* aligned */
+
+   float x0_center;
+   float y0_center;
+
+   /* turn these into an aligned float[4] */
+   float dy01_ooa;
+   float dy20_ooa;
+   float dx01_ooa;
+   float dx20_ooa;
+
+   const float (*v0)[4];        /* aligned */
+   const float (*v1)[4];        /* aligned */
+   const float (*v2)[4];        /* aligned */
+
+   boolean frontfacing;         /* remove eventually */
+};
+
+
+/**
+ * Store a constant interpolant: a0 is the four floats at \p attr and
+ * both gradients are zero.
+ *
+ * \param slot  output slot to write
+ * \param attr  four floats, read as one __m128 (so it must be suitably
+ *              aligned for that access)
+ */
+static void constant_coef4( struct setup_args *args,
+                            unsigned slot,
+                            const float *attr)
+{
+   *(__m128 *)args->a0[slot]   = *(const __m128 *)attr;
+   *(__m128 *)args->dadx[slot] = _mm_setzero_ps();
+   *(__m128 *)args->dady[slot] = _mm_setzero_ps();
+}
+
+
+
+/**
+ * Setup the fragment input attribute with the front-facing value:
+ * a0.x is 1.0 when args->frontfacing is set and -1.0 otherwise; the
+ * other channels and both gradients are zero.
+ */
+static void setup_facing_coef( struct setup_args *args,
+                               unsigned slot )
+{
+   /* XXX: just pass frontface directly to the shader, don't bother
+    * treating it as an input.
+    */
+   __m128 a0 = _mm_setr_ps(args->frontfacing ? 1.0 : -1.0,
+                           0, 0, 0);
+
+   *(__m128 *)args->a0[slot]   = a0;
+   *(__m128 *)args->dadx[slot] = _mm_setzero_ps();
+   *(__m128 *)args->dady[slot] = _mm_setzero_ps();
+}
+
+
+
+/**
+ * Compute and store the plane coefficients for one four-channel
+ * attribute given its value at the three vertices.
+ *
+ *   dadx = (a0 - a1) * dy20_ooa - (a2 - a0) * dy01_ooa
+ *   dady = (a2 - a0) * dx01_ooa - (a0 - a1) * dx20_ooa
+ *   a0'  = a0 - (dadx * x0_center + dady * y0_center)
+ *
+ * \param slot  output slot to write
+ * \param a0    attribute value at vertex 0
+ * \param a1    attribute value at vertex 1
+ * \param a2    attribute value at vertex 2
+ */
+static void calc_coef4( struct setup_args *args,
+                        unsigned slot,
+                        __m128 a0,
+                        __m128 a1,
+                        __m128 a2)
+{
+   __m128 da01          = _mm_sub_ps(a0, a1);
+   __m128 da20          = _mm_sub_ps(a2, a0);
+
+   __m128 da01_dy20_ooa = _mm_mul_ps(da01, _mm_set1_ps(args->dy20_ooa));
+   __m128 da20_dy01_ooa = _mm_mul_ps(da20, _mm_set1_ps(args->dy01_ooa));
+   __m128 dadx          = _mm_sub_ps(da01_dy20_ooa, da20_dy01_ooa);
+
+   __m128 da01_dx20_ooa = _mm_mul_ps(da01, _mm_set1_ps(args->dx20_ooa));
+   __m128 da20_dx01_ooa = _mm_mul_ps(da20, _mm_set1_ps(args->dx01_ooa));
+   __m128 dady          = _mm_sub_ps(da20_dx01_ooa, da01_dx20_ooa);
+
+   /* Move the plane's origin from vertex 0 back to (0, 0). */
+   __m128 dadx_x0       = _mm_mul_ps(dadx, _mm_set1_ps(args->x0_center));
+   __m128 dady_y0       = _mm_mul_ps(dady, _mm_set1_ps(args->y0_center));
+   __m128 attr_v0       = _mm_add_ps(dadx_x0, dady_y0);
+   __m128 attr_0        = _mm_sub_ps(a0, attr_v0);
+
+   *(__m128 *)args->a0[slot]   = attr_0;
+   *(__m128 *)args->dadx[slot] = dadx;
+   *(__m128 *)args->dady[slot] = dady;
+}
+
+
+/**
+ * Linear interpolant: feed vertex attribute \p vert_attr of the three
+ * vertices unmodified into calc_coef4().
+ */
+static void linear_coef( struct setup_args *args,
+                         unsigned slot,
+                         unsigned vert_attr)
+{
+   __m128 a0 = *(const __m128 *)args->v0[vert_attr];
+   __m128 a1 = *(const __m128 *)args->v1[vert_attr];
+   __m128 a2 = *(const __m128 *)args->v2[vert_attr];
+
+   calc_coef4(args, slot, a0, a1, a2);
+}
+
+
+
+/**
+ * Compute a0, dadx and dady for a perspective-corrected interpolant,
+ * for a triangle.
+ * We basically multiply the vertex value by 1/w before computing
+ * the plane coefficients (a0, dadx, dady).
+ * Later, when we compute the value at a particular fragment position we'll
+ * divide the interpolated value by the interpolated W at that fragment.
+ */
+static void perspective_coef( struct setup_args *args,
+                              unsigned slot,
+                              unsigned vert_attr)
+{
+   /* v[0][3] of each vertex is always its 1/w, broadcast to all lanes: */
+   const __m128 oow0 = _mm_set1_ps(args->v0[0][3]);
+   const __m128 oow1 = _mm_set1_ps(args->v1[0][3]);
+   const __m128 oow2 = _mm_set1_ps(args->v2[0][3]);
+
+   /* premultiply each vertex's attribute by its 1/w: */
+   const __m128 attr0_oow =
+      _mm_mul_ps(*(const __m128 *)args->v0[vert_attr], oow0);
+   const __m128 attr1_oow =
+      _mm_mul_ps(*(const __m128 *)args->v1[vert_attr], oow1);
+   const __m128 attr2_oow =
+      _mm_mul_ps(*(const __m128 *)args->v2[vert_attr], oow2);
+
+   calc_coef4(args, slot, attr0_oow, attr1_oow, attr2_oow);
+}
+
+
+
+
+
+/**
+ * Compute the a0, dadx, dady values for the position and for every
+ * shader input described by the key.
+ *
+ * Note that this was effectively a little interpreted program, where
+ * the opcodes were LP_INTERP_*.  This is the program which is now
+ * being code-generated in lp_state_setup.c.
+ *
+ * Output slot 0 receives the position; shader input n is written to
+ * output slot n + 1.
+ */
+void lp_setup_tri_fallback( const float (*v0)[4],
+                            const float (*v1)[4],
+                            const float (*v2)[4],
+                            boolean front_facing,
+                            float (*a0)[4],
+                            float (*dadx)[4],
+                            float (*dady)[4],
+                            const struct lp_setup_variant_key *key )
+{
+   struct setup_args args;
+   const float x0 = v0[0][0];
+   const float y0 = v0[0][1];
+   const float dx01 = x0 - v1[0][0];
+   const float dy01 = y0 - v1[0][1];
+   const float dx20 = v2[0][0] - x0;
+   const float dy20 = v2[0][1] - y0;
+   const float inv_area = 1.0f / (dx01 * dy20 - dx20 * dy01);
+   float pixel_offset = 0.0f;
+   unsigned slot;
+
+   if (key->pixel_center_half)
+      pixel_offset = 0.5f;
+
+   /* Inputs and outputs, passed straight through: */
+   args.a0 = a0;
+   args.dadx = dadx;
+   args.dady = dady;
+   args.v0 = v0;
+   args.v1 = v1;
+   args.v2 = v2;
+   args.frontfacing = front_facing;
+
+   /* Edge deltas pre-scaled by 1/area, and the reference point: */
+   args.dx01_ooa = dx01 * inv_area;
+   args.dy01_ooa = dy01 * inv_area;
+   args.dx20_ooa = dx20 * inv_area;
+   args.dy20_ooa = dy20 * inv_area;
+   args.x0_center = x0 - pixel_offset;
+   args.y0_center = y0 - pixel_offset;
+
+   /* The internal position input is in slot zero:
+    */
+   linear_coef(&args, 0, 0);
+
+   /* setup interpolation for all the remaining attributes:
+    */
+   for (slot = 0; slot < key->num_inputs; slot++) {
+      const struct lp_shader_input *input = &key->inputs[slot];
+      const unsigned vert_attr = input->src_index;
+      const unsigned dst = slot + 1;
+
+      switch (input->interp) {
+      case LP_INTERP_CONSTANT:
+         /* Take the value from the provoking vertex. */
+         constant_coef4(&args, dst,
+                        key->flatshade_first ? args.v0[vert_attr]
+                                             : args.v2[vert_attr]);
+         break;
+
+      case LP_INTERP_LINEAR:
+         linear_coef(&args, dst, vert_attr);
+         break;
+
+      case LP_INTERP_PERSPECTIVE:
+         perspective_coef(&args, dst, vert_attr);
+         break;
+
+      case LP_INTERP_POSITION:
+         /*
+          * The generated pixel interpolators will pick up the coeffs from
+          * slot 0.
+          */
+         break;
+
+      case LP_INTERP_FACING:
+         setup_facing_coef(&args, dst);
+         break;
+
+      default:
+         assert(0);
+      }
+   }
+}
+
+#else
+
+/* Stub for builds without PIPE_ARCH_SSE.  It writes none of the
+ * output arrays.
+ */
+void lp_setup_tri_fallback( const float (*v0)[4],
+                            const float (*v1)[4],
+                            const float (*v2)[4],
+                            boolean front_facing,
+                            float (*a0)[4],
+                            float (*dadx)[4],
+                            float (*dady)[4],
+                            const struct lp_setup_variant_key *key )
+{
+   /* this path for debugging only, don't need a non-sse version. */
+   (void)v0;
+   (void)v1;
+   (void)v2;
+   (void)front_facing;
+   (void)a0;
+   (void)dadx;
+   (void)dady;
+   (void)key;
+}
+
+#endif