i965g: re-starting from the dri driver

Keith Whitwell 2009-10-23 16:55:02 +01:00
parent da253319f9
commit 2f5f7c0773
68 changed files with 29208 additions and 0 deletions


@@ -0,0 +1,104 @@
TOP = ../../../../..
include $(TOP)/configs/current
LIBNAME = i965_dri.so
DRIVER_SOURCES = \
intel_batchbuffer.c \
intel_blit.c \
intel_buffer_objects.c \
intel_buffers.c \
intel_clear.c \
intel_context.c \
intel_decode.c \
intel_extensions.c \
intel_fbo.c \
intel_mipmap_tree.c \
intel_regions.c \
intel_screen.c \
intel_span.c \
intel_pixel.c \
intel_pixel_bitmap.c \
intel_pixel_copy.c \
intel_pixel_draw.c \
intel_pixel_read.c \
intel_state.c \
intel_swapbuffers.c \
intel_syncobj.c \
intel_tex.c \
intel_tex_copy.c \
intel_tex_format.c \
intel_tex_image.c \
intel_tex_layout.c \
intel_tex_subimage.c \
intel_tex_validate.c \
brw_cc.c \
brw_clip.c \
brw_clip_line.c \
brw_clip_point.c \
brw_clip_state.c \
brw_clip_tri.c \
brw_clip_unfilled.c \
brw_clip_util.c \
brw_context.c \
brw_curbe.c \
brw_disasm.c \
brw_draw.c \
brw_draw_upload.c \
brw_eu.c \
brw_eu_debug.c \
brw_eu_emit.c \
brw_eu_util.c \
brw_fallback.c \
brw_gs.c \
brw_gs_emit.c \
brw_gs_state.c \
brw_misc_state.c \
brw_program.c \
brw_queryobj.c \
brw_sf.c \
brw_sf_emit.c \
brw_sf_state.c \
brw_state_batch.c \
brw_state_cache.c \
brw_state_dump.c \
brw_state_upload.c \
brw_tex.c \
brw_tex_layout.c \
brw_urb.c \
brw_util.c \
brw_vs.c \
brw_vs_constval.c \
brw_vs_emit.c \
brw_vs_state.c \
brw_vs_surface_state.c \
brw_vtbl.c \
brw_wm.c \
brw_wm_debug.c \
brw_wm_emit.c \
brw_wm_fp.c \
brw_wm_iz.c \
brw_wm_glsl.c \
brw_wm_pass0.c \
brw_wm_pass1.c \
brw_wm_pass2.c \
brw_wm_sampler_state.c \
brw_wm_state.c \
brw_wm_surface_state.c
C_SOURCES = \
$(COMMON_SOURCES) \
$(MINIGLX_SOURCES) \
$(DRIVER_SOURCES)
ASM_SOURCES =
DRIVER_DEFINES = -I../intel -I../intel/server
DRI_LIB_DEPS += -ldrm_intel
include ../Makefile.template
intel_decode.o: ../intel/intel_decode.c
intel_tex_layout.o: ../intel/intel_tex_layout.c


@@ -0,0 +1,297 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "brw_context.h"
#include "brw_state.h"
#include "brw_defines.h"
#include "brw_util.h"
#include "main/macros.h"
#include "main/enums.h"
static void prepare_cc_vp( struct brw_context *brw )
{
GLcontext *ctx = &brw->intel.ctx;
struct brw_cc_viewport ccv;
memset(&ccv, 0, sizeof(ccv));
/* _NEW_VIEWPORT */
ccv.min_depth = ctx->Viewport.Near;
ccv.max_depth = ctx->Viewport.Far;
dri_bo_unreference(brw->cc.vp_bo);
brw->cc.vp_bo = brw_cache_data( &brw->cache, BRW_CC_VP, &ccv, NULL, 0 );
}
const struct brw_tracked_state brw_cc_vp = {
.dirty = {
.mesa = _NEW_VIEWPORT,
.brw = BRW_NEW_CONTEXT,
.cache = 0
},
.prepare = prepare_cc_vp
};
struct brw_cc_unit_key {
GLboolean stencil, stencil_two_side, color_blend, alpha_enabled;
GLenum stencil_func[2], stencil_fail_op[2];
GLenum stencil_pass_depth_fail_op[2], stencil_pass_depth_pass_op[2];
GLubyte stencil_ref[2], stencil_write_mask[2], stencil_test_mask[2];
GLenum logic_op;
GLenum blend_eq_rgb, blend_eq_a;
GLenum blend_src_rgb, blend_src_a;
GLenum blend_dst_rgb, blend_dst_a;
GLenum alpha_func;
GLclampf alpha_ref;
GLboolean dither;
GLboolean depth_test, depth_write;
GLenum depth_func;
};
static void
cc_unit_populate_key(struct brw_context *brw, struct brw_cc_unit_key *key)
{
GLcontext *ctx = &brw->intel.ctx;
const unsigned back = ctx->Stencil._BackFace;
memset(key, 0, sizeof(*key));
key->stencil = ctx->Stencil._Enabled;
key->stencil_two_side = ctx->Stencil._TestTwoSide;
if (key->stencil) {
key->stencil_func[0] = ctx->Stencil.Function[0];
key->stencil_fail_op[0] = ctx->Stencil.FailFunc[0];
key->stencil_pass_depth_fail_op[0] = ctx->Stencil.ZFailFunc[0];
key->stencil_pass_depth_pass_op[0] = ctx->Stencil.ZPassFunc[0];
key->stencil_ref[0] = ctx->Stencil.Ref[0];
key->stencil_write_mask[0] = ctx->Stencil.WriteMask[0];
key->stencil_test_mask[0] = ctx->Stencil.ValueMask[0];
}
if (key->stencil_two_side) {
key->stencil_func[1] = ctx->Stencil.Function[back];
key->stencil_fail_op[1] = ctx->Stencil.FailFunc[back];
key->stencil_pass_depth_fail_op[1] = ctx->Stencil.ZFailFunc[back];
key->stencil_pass_depth_pass_op[1] = ctx->Stencil.ZPassFunc[back];
key->stencil_ref[1] = ctx->Stencil.Ref[back];
key->stencil_write_mask[1] = ctx->Stencil.WriteMask[back];
key->stencil_test_mask[1] = ctx->Stencil.ValueMask[back];
}
if (ctx->Color._LogicOpEnabled)
key->logic_op = ctx->Color.LogicOp;
else
key->logic_op = GL_COPY;
key->color_blend = ctx->Color.BlendEnabled;
if (key->color_blend) {
key->blend_eq_rgb = ctx->Color.BlendEquationRGB;
key->blend_eq_a = ctx->Color.BlendEquationA;
key->blend_src_rgb = ctx->Color.BlendSrcRGB;
key->blend_dst_rgb = ctx->Color.BlendDstRGB;
key->blend_src_a = ctx->Color.BlendSrcA;
key->blend_dst_a = ctx->Color.BlendDstA;
}
key->alpha_enabled = ctx->Color.AlphaEnabled;
if (key->alpha_enabled) {
key->alpha_func = ctx->Color.AlphaFunc;
key->alpha_ref = ctx->Color.AlphaRef;
}
key->dither = ctx->Color.DitherFlag;
key->depth_test = ctx->Depth.Test;
if (key->depth_test) {
key->depth_func = ctx->Depth.Func;
key->depth_write = ctx->Depth.Mask;
}
}
/**
* Creates the state cache entry for the given CC unit key.
*/
static dri_bo *
cc_unit_create_from_key(struct brw_context *brw, struct brw_cc_unit_key *key)
{
struct brw_cc_unit_state cc;
dri_bo *bo;
memset(&cc, 0, sizeof(cc));
/* _NEW_STENCIL */
if (key->stencil) {
cc.cc0.stencil_enable = 1;
cc.cc0.stencil_func =
intel_translate_compare_func(key->stencil_func[0]);
cc.cc0.stencil_fail_op =
intel_translate_stencil_op(key->stencil_fail_op[0]);
cc.cc0.stencil_pass_depth_fail_op =
intel_translate_stencil_op(key->stencil_pass_depth_fail_op[0]);
cc.cc0.stencil_pass_depth_pass_op =
intel_translate_stencil_op(key->stencil_pass_depth_pass_op[0]);
cc.cc1.stencil_ref = key->stencil_ref[0];
cc.cc1.stencil_write_mask = key->stencil_write_mask[0];
cc.cc1.stencil_test_mask = key->stencil_test_mask[0];
if (key->stencil_two_side) {
cc.cc0.bf_stencil_enable = 1;
cc.cc0.bf_stencil_func =
intel_translate_compare_func(key->stencil_func[1]);
cc.cc0.bf_stencil_fail_op =
intel_translate_stencil_op(key->stencil_fail_op[1]);
cc.cc0.bf_stencil_pass_depth_fail_op =
intel_translate_stencil_op(key->stencil_pass_depth_fail_op[1]);
cc.cc0.bf_stencil_pass_depth_pass_op =
intel_translate_stencil_op(key->stencil_pass_depth_pass_op[1]);
cc.cc1.bf_stencil_ref = key->stencil_ref[1];
cc.cc2.bf_stencil_write_mask = key->stencil_write_mask[1];
cc.cc2.bf_stencil_test_mask = key->stencil_test_mask[1];
}
/* Not really sure about this:
*/
if (key->stencil_write_mask[0] ||
(key->stencil_two_side && key->stencil_write_mask[1]))
cc.cc0.stencil_write_enable = 1;
}
/* _NEW_COLOR */
if (key->logic_op != GL_COPY) {
cc.cc2.logicop_enable = 1;
cc.cc5.logicop_func = intel_translate_logic_op(key->logic_op);
} else if (key->color_blend) {
GLenum eqRGB = key->blend_eq_rgb;
GLenum eqA = key->blend_eq_a;
GLenum srcRGB = key->blend_src_rgb;
GLenum dstRGB = key->blend_dst_rgb;
GLenum srcA = key->blend_src_a;
GLenum dstA = key->blend_dst_a;
if (eqRGB == GL_MIN || eqRGB == GL_MAX) {
srcRGB = dstRGB = GL_ONE;
}
if (eqA == GL_MIN || eqA == GL_MAX) {
srcA = dstA = GL_ONE;
}
cc.cc6.dest_blend_factor = brw_translate_blend_factor(dstRGB);
cc.cc6.src_blend_factor = brw_translate_blend_factor(srcRGB);
cc.cc6.blend_function = brw_translate_blend_equation(eqRGB);
cc.cc5.ia_dest_blend_factor = brw_translate_blend_factor(dstA);
cc.cc5.ia_src_blend_factor = brw_translate_blend_factor(srcA);
cc.cc5.ia_blend_function = brw_translate_blend_equation(eqA);
cc.cc3.blend_enable = 1;
cc.cc3.ia_blend_enable = (srcA != srcRGB ||
dstA != dstRGB ||
eqA != eqRGB);
}
if (key->alpha_enabled) {
cc.cc3.alpha_test = 1;
cc.cc3.alpha_test_func = intel_translate_compare_func(key->alpha_func);
cc.cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8;
UNCLAMPED_FLOAT_TO_UBYTE(cc.cc7.alpha_ref.ub[0], key->alpha_ref);
}
if (key->dither) {
cc.cc5.dither_enable = 1;
cc.cc6.y_dither_offset = 0;
cc.cc6.x_dither_offset = 0;
}
/* _NEW_DEPTH */
if (key->depth_test) {
cc.cc2.depth_test = 1;
cc.cc2.depth_test_function = intel_translate_compare_func(key->depth_func);
cc.cc2.depth_write_enable = key->depth_write;
}
/* CACHE_NEW_CC_VP */
cc.cc4.cc_viewport_state_offset = brw->cc.vp_bo->offset >> 5; /* reloc */
if (INTEL_DEBUG & DEBUG_STATS)
cc.cc5.statistics_enable = 1;
bo = brw_upload_cache(&brw->cache, BRW_CC_UNIT,
key, sizeof(*key),
&brw->cc.vp_bo, 1,
&cc, sizeof(cc),
NULL, NULL);
/* Emit CC viewport relocation */
dri_bo_emit_reloc(bo,
I915_GEM_DOMAIN_INSTRUCTION,
0,
0,
offsetof(struct brw_cc_unit_state, cc4),
brw->cc.vp_bo);
return bo;
}
static void prepare_cc_unit( struct brw_context *brw )
{
struct brw_cc_unit_key key;
cc_unit_populate_key(brw, &key);
dri_bo_unreference(brw->cc.state_bo);
brw->cc.state_bo = brw_search_cache(&brw->cache, BRW_CC_UNIT,
&key, sizeof(key),
&brw->cc.vp_bo, 1,
NULL);
if (brw->cc.state_bo == NULL)
brw->cc.state_bo = cc_unit_create_from_key(brw, &key);
}
const struct brw_tracked_state brw_cc_unit = {
.dirty = {
.mesa = _NEW_STENCIL | _NEW_COLOR | _NEW_DEPTH,
.brw = 0,
.cache = CACHE_NEW_CC_VP
},
.prepare = prepare_cc_unit,
};
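
Both CC atoms above follow the keyed state-cache pattern used throughout this commit: fill a plain-old-data key from the current GL state (memset first, so padding bytes compare equal), look the key up in the cache, and only build and upload a fresh state buffer on a miss. A minimal self-contained sketch of that idea, using hypothetical names and a byte-wise memcmp cache in place of the real brw_cache machinery:

#include <stdio.h>
#include <string.h>

/* Hypothetical key: mirrors the idea of brw_cc_unit_key, not its layout. */
struct demo_key {
    unsigned depth_test;
    unsigned depth_func;
};

/* A tiny fixed-size cache keyed by memcmp over the key bytes. */
struct demo_cache_entry {
    struct demo_key key;
    int valid;
    int payload;                        /* stands in for the uploaded state BO */
};

static struct demo_cache_entry cache[8];

static int *demo_search(const struct demo_key *key)
{
    for (int i = 0; i < 8; i++)
        if (cache[i].valid && memcmp(&cache[i].key, key, sizeof(*key)) == 0)
            return &cache[i].payload;
    return NULL;
}

static int *demo_upload(const struct demo_key *key, int payload)
{
    for (int i = 0; i < 8; i++)
        if (!cache[i].valid) {
            cache[i].valid = 1;
            cache[i].key = *key;
            cache[i].payload = payload;
            return &cache[i].payload;
        }
    return NULL;                        /* a real cache would evict; omitted */
}

int main(void)
{
    struct demo_key key;
    memset(&key, 0, sizeof(key));       /* zero first, like *_populate_key() */
    key.depth_test = 1;
    key.depth_func = 0x0201;            /* e.g. GL_LESS */

    int *bo = demo_search(&key);
    if (!bo)
        bo = demo_upload(&key, 42);     /* "create_from_key" on a cache miss */
    printf("state payload: %d\n", *bo);
    return 0;
}

The memset before filling in the fields is what makes the raw byte comparison safe, which is why cc_unit_populate_key() and clip_unit_populate_key() both start the same way.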


@@ -0,0 +1,273 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "main/glheader.h"
#include "main/macros.h"
#include "main/enums.h"
#include "intel_batchbuffer.h"
#include "brw_defines.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_util.h"
#include "brw_state.h"
#include "brw_clip.h"
#define FRONT_UNFILLED_BIT 0x1
#define BACK_UNFILLED_BIT 0x2
static void compile_clip_prog( struct brw_context *brw,
struct brw_clip_prog_key *key )
{
struct brw_clip_compile c;
const GLuint *program;
GLuint program_size;
GLuint delta;
GLuint i;
memset(&c, 0, sizeof(c));
/* Begin the compilation:
*/
brw_init_compile(brw, &c.func);
c.func.single_program_flow = 1;
c.key = *key;
c.need_ff_sync = BRW_IS_IGDNG(brw);
/* Need to locate the two positions present in vertex + header.
* These are currently hardcoded:
*/
c.header_position_offset = ATTR_SIZE;
if (BRW_IS_IGDNG(brw))
delta = 3 * REG_SIZE;
else
delta = REG_SIZE;
for (i = 0; i < VERT_RESULT_MAX; i++)
if (c.key.attrs & (1<<i)) {
c.offset[i] = delta;
delta += ATTR_SIZE;
}
c.nr_attrs = brw_count_bits(c.key.attrs);
if (BRW_IS_IGDNG(brw))
c.nr_regs = (c.nr_attrs + 1) / 2 + 3; /* are vertices packed, or reg-aligned? */
else
c.nr_regs = (c.nr_attrs + 1) / 2 + 1; /* are vertices packed, or reg-aligned? */
c.nr_bytes = c.nr_regs * REG_SIZE;
c.prog_data.clip_mode = c.key.clip_mode; /* XXX */
/* For some reason the thread is spawned with only 4 channels
* unmasked.
*/
brw_set_mask_control(&c.func, BRW_MASK_DISABLE);
/* Would ideally have the option of producing a program which could
* do all three:
*/
switch (key->primitive) {
case GL_TRIANGLES:
if (key->do_unfilled)
brw_emit_unfilled_clip( &c );
else
brw_emit_tri_clip( &c );
break;
case GL_LINES:
brw_emit_line_clip( &c );
break;
case GL_POINTS:
brw_emit_point_clip( &c );
break;
default:
assert(0);
return;
}
/* get the program
*/
program = brw_get_program(&c.func, &program_size);
/* Upload
*/
dri_bo_unreference(brw->clip.prog_bo);
brw->clip.prog_bo = brw_upload_cache( &brw->cache,
BRW_CLIP_PROG,
&c.key, sizeof(c.key),
NULL, 0,
program, program_size,
&c.prog_data,
&brw->clip.prog_data );
}
/* Calculate interpolants for triangle and line rasterization.
*/
static void upload_clip_prog(struct brw_context *brw)
{
GLcontext *ctx = &brw->intel.ctx;
struct brw_clip_prog_key key;
memset(&key, 0, sizeof(key));
/* Populate the key:
*/
/* BRW_NEW_REDUCED_PRIMITIVE */
key.primitive = brw->intel.reduced_primitive;
/* CACHE_NEW_VS_PROG */
key.attrs = brw->vs.prog_data->outputs_written;
/* _NEW_LIGHT */
key.do_flat_shading = (ctx->Light.ShadeModel == GL_FLAT);
/* _NEW_TRANSFORM */
key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled);
if (BRW_IS_IGDNG(brw))
key.clip_mode = BRW_CLIPMODE_KERNEL_CLIP;
else
key.clip_mode = BRW_CLIPMODE_NORMAL;
/* _NEW_POLYGON */
if (key.primitive == GL_TRIANGLES) {
if (ctx->Polygon.CullFlag &&
ctx->Polygon.CullFaceMode == GL_FRONT_AND_BACK)
key.clip_mode = BRW_CLIPMODE_REJECT_ALL;
else {
GLuint fill_front = CLIP_CULL;
GLuint fill_back = CLIP_CULL;
GLuint offset_front = 0;
GLuint offset_back = 0;
if (!ctx->Polygon.CullFlag ||
ctx->Polygon.CullFaceMode != GL_FRONT) {
switch (ctx->Polygon.FrontMode) {
case GL_FILL:
fill_front = CLIP_FILL;
offset_front = 0;
break;
case GL_LINE:
fill_front = CLIP_LINE;
offset_front = ctx->Polygon.OffsetLine;
break;
case GL_POINT:
fill_front = CLIP_POINT;
offset_front = ctx->Polygon.OffsetPoint;
break;
}
}
if (!ctx->Polygon.CullFlag ||
ctx->Polygon.CullFaceMode != GL_BACK) {
switch (ctx->Polygon.BackMode) {
case GL_FILL:
fill_back = CLIP_FILL;
offset_back = 0;
break;
case GL_LINE:
fill_back = CLIP_LINE;
offset_back = ctx->Polygon.OffsetLine;
break;
case GL_POINT:
fill_back = CLIP_POINT;
offset_back = ctx->Polygon.OffsetPoint;
break;
}
}
if (ctx->Polygon.BackMode != GL_FILL ||
ctx->Polygon.FrontMode != GL_FILL) {
key.do_unfilled = 1;
/* Most cases the fixed function units will handle. Cases where
* one or more polygon faces are unfilled will require help:
*/
key.clip_mode = BRW_CLIPMODE_CLIP_NON_REJECTED;
if (offset_back || offset_front) {
/* _NEW_POLYGON, _NEW_BUFFERS */
key.offset_units = ctx->Polygon.OffsetUnits * brw->intel.polygon_offset_scale;
key.offset_factor = ctx->Polygon.OffsetFactor * ctx->DrawBuffer->_MRD;
}
switch (ctx->Polygon.FrontFace) {
case GL_CCW:
key.fill_ccw = fill_front;
key.fill_cw = fill_back;
key.offset_ccw = offset_front;
key.offset_cw = offset_back;
if (ctx->Light.Model.TwoSide &&
key.fill_cw != CLIP_CULL)
key.copy_bfc_cw = 1;
break;
case GL_CW:
key.fill_cw = fill_front;
key.fill_ccw = fill_back;
key.offset_cw = offset_front;
key.offset_ccw = offset_back;
if (ctx->Light.Model.TwoSide &&
key.fill_ccw != CLIP_CULL)
key.copy_bfc_ccw = 1;
break;
}
}
}
}
dri_bo_unreference(brw->clip.prog_bo);
brw->clip.prog_bo = brw_search_cache(&brw->cache, BRW_CLIP_PROG,
&key, sizeof(key),
NULL, 0,
&brw->clip.prog_data);
if (brw->clip.prog_bo == NULL)
compile_clip_prog( brw, &key );
}
const struct brw_tracked_state brw_clip_prog = {
.dirty = {
.mesa = (_NEW_LIGHT |
_NEW_TRANSFORM |
_NEW_POLYGON |
_NEW_BUFFERS),
.brw = (BRW_NEW_REDUCED_PRIMITIVE),
.cache = CACHE_NEW_VS_PROG
},
.prepare = upload_clip_prog
};
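
The _NEW_POLYGON block in upload_clip_prog() folds face culling, glPolygonMode and glFrontFace into the per-winding fill_ccw/fill_cw fields of the key. A standalone sketch of just that mapping, with plain enums standing in for the GL and CLIP_* constants and the offset, reject-all and two-side handling left out:

#include <stdio.h>

enum mode { FILL, LINE, POINT, CULL };            /* stand-ins for CLIP_* */
enum face { CULL_NONE, CULL_FRONT, CULL_BACK, CULL_FRONT_AND_BACK };
enum wind { WIND_CCW, WIND_CW };

struct winding_fill { enum mode ccw, cw; };

/* Map GL-style polygon state onto per-winding fill modes, mirroring
 * upload_clip_prog(): a culled face stays CULL, any other face takes its
 * glPolygonMode, and glFrontFace decides which winding counts as "front".
 */
static struct winding_fill
map_fill(enum face cull, enum mode front_mode, enum mode back_mode,
         enum wind front_face)
{
    enum mode fill_front = (cull == CULL_FRONT || cull == CULL_FRONT_AND_BACK)
                         ? CULL : front_mode;
    enum mode fill_back  = (cull == CULL_BACK || cull == CULL_FRONT_AND_BACK)
                         ? CULL : back_mode;
    struct winding_fill out;

    if (front_face == WIND_CCW) {
        out.ccw = fill_front;
        out.cw  = fill_back;
    } else {
        out.cw  = fill_front;
        out.ccw = fill_back;
    }
    return out;
}

int main(void)
{
    /* glCullFace(GL_BACK) + glPolygonMode(GL_FRONT, GL_LINE), CCW front face */
    struct winding_fill w = map_fill(CULL_BACK, LINE, FILL, WIND_CCW);
    printf("fill_ccw=%d fill_cw=%d\n", w.ccw, w.cw);   /* LINE (1), CULL (3) */
    return 0;
}

With that state the key ends up as fill_ccw = LINE and fill_cw = CULL, the sort of combination the unfilled clip kernel is compiled for once do_unfilled is set.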


@@ -0,0 +1,179 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#ifndef BRW_CLIP_H
#define BRW_CLIP_H
#include "brw_context.h"
#include "brw_eu.h"
#define MAX_VERTS (3+6+6)
/* Note that if unfilled primitives are being emitted, we have to fix
* up polygon offset and flatshading at this point:
*/
struct brw_clip_prog_key {
GLuint attrs:32;
GLuint primitive:4;
GLuint nr_userclip:3;
GLuint do_flat_shading:1;
GLuint do_unfilled:1;
GLuint fill_cw:2; /* includes cull information */
GLuint fill_ccw:2; /* includes cull information */
GLuint offset_cw:1;
GLuint offset_ccw:1;
GLuint pad0:17;
GLuint copy_bfc_cw:1;
GLuint copy_bfc_ccw:1;
GLuint clip_mode:3;
GLuint pad1:27;
GLfloat offset_factor;
GLfloat offset_units;
};
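
This key is handed to brw_search_cache()/brw_upload_cache() by sizeof() and compared as raw bytes, so the explicit pad0/pad1 members are what keep each bit-field group at exactly one dword. A quick standalone check of that packing, assuming 32-bit unsigned and float (as GLuint/GLfloat are on the targeted platforms) and conventional GCC bit-field layout:

#include <assert.h>
#include <stdio.h>

/* Plain-C copy of brw_clip_prog_key, with unsigned/float standing in for
 * GLuint/GLfloat.
 */
struct clip_key_copy {
    unsigned attrs:32;
    unsigned primitive:4;
    unsigned nr_userclip:3;
    unsigned do_flat_shading:1;
    unsigned do_unfilled:1;
    unsigned fill_cw:2;
    unsigned fill_ccw:2;
    unsigned offset_cw:1;
    unsigned offset_ccw:1;
    unsigned pad0:17;
    unsigned copy_bfc_cw:1;
    unsigned copy_bfc_ccw:1;
    unsigned clip_mode:3;
    unsigned pad1:27;
    float offset_factor;
    float offset_units;
};

int main(void)
{
    /* 4+3+1+1+2+2+1+1+17 = 32 bits and 1+1+3+27 = 32 bits, so the pads round
     * each bit-field group up to a full dword: 3 dwords + 2 floats = 20 bytes.
     */
    printf("sizeof(key) = %zu\n", sizeof(struct clip_key_copy));
    assert(sizeof(struct clip_key_copy) == 20);
    return 0;
}

The memset(&key, 0, sizeof(key)) in upload_clip_prog() guarantees the pad bits themselves are zero, so two logically equal keys also compare equal byte-for-byte.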
#define CLIP_LINE 0
#define CLIP_POINT 1
#define CLIP_FILL 2
#define CLIP_CULL 3
#define PRIM_MASK (0x1f)
struct brw_clip_compile {
struct brw_compile func;
struct brw_clip_prog_key key;
struct brw_clip_prog_data prog_data;
struct {
struct brw_reg R0;
struct brw_reg vertex[MAX_VERTS];
struct brw_reg t;
struct brw_reg t0, t1;
struct brw_reg dp0, dp1;
struct brw_reg dpPrev;
struct brw_reg dp;
struct brw_reg loopcount;
struct brw_reg nr_verts;
struct brw_reg planemask;
struct brw_reg inlist;
struct brw_reg outlist;
struct brw_reg freelist;
struct brw_reg dir;
struct brw_reg tmp0, tmp1;
struct brw_reg offset;
struct brw_reg fixed_planes;
struct brw_reg plane_equation;
struct brw_reg ff_sync;
} reg;
/* 3 different ways of expressing vertex size:
*/
GLuint nr_attrs;
GLuint nr_regs;
GLuint nr_bytes;
GLuint first_tmp;
GLuint last_tmp;
GLboolean need_direction;
GLuint last_mrf;
GLuint header_position_offset;
GLuint offset[VERT_ATTRIB_MAX];
GLboolean need_ff_sync;
};
#define ATTR_SIZE (4*4)
/* Points are only culled, so no need for a clip routine, however it
* works out easier to have a dummy one.
*/
void brw_emit_unfilled_clip( struct brw_clip_compile *c );
void brw_emit_tri_clip( struct brw_clip_compile *c );
void brw_emit_line_clip( struct brw_clip_compile *c );
void brw_emit_point_clip( struct brw_clip_compile *c );
/* brw_clip_tri.c, for use by the unfilled clip routine:
*/
void brw_clip_tri_init_vertices( struct brw_clip_compile *c );
void brw_clip_tri_flat_shade( struct brw_clip_compile *c );
void brw_clip_tri( struct brw_clip_compile *c );
void brw_clip_tri_emit_polygon( struct brw_clip_compile *c );
void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
GLuint nr_verts );
/* Utils:
*/
void brw_clip_interp_vertex( struct brw_clip_compile *c,
struct brw_indirect dest_ptr,
struct brw_indirect v0_ptr, /* from */
struct brw_indirect v1_ptr, /* to */
struct brw_reg t0,
GLboolean force_edgeflag );
void brw_clip_init_planes( struct brw_clip_compile *c );
void brw_clip_emit_vue(struct brw_clip_compile *c,
struct brw_indirect vert,
GLboolean allocate,
GLboolean eot,
GLuint header);
void brw_clip_kill_thread(struct brw_clip_compile *c);
struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c );
struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c );
void brw_clip_copy_colors( struct brw_clip_compile *c,
GLuint to, GLuint from );
void brw_clip_init_clipmask( struct brw_clip_compile *c );
struct brw_reg get_tmp( struct brw_clip_compile *c );
void brw_clip_project_position(struct brw_clip_compile *c,
struct brw_reg pos );
void brw_clip_ff_sync(struct brw_clip_compile *c);
void brw_clip_init_ff_sync(struct brw_clip_compile *c);
#endif


@@ -0,0 +1,276 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "main/glheader.h"
#include "main/macros.h"
#include "main/enums.h"
#include "shader/program.h"
#include "intel_batchbuffer.h"
#include "brw_defines.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_util.h"
#include "brw_clip.h"
static void brw_clip_line_alloc_regs( struct brw_clip_compile *c )
{
GLuint i = 0,j;
/* Register usage is static, precompute here:
*/
c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
if (c->key.nr_userclip) {
c->reg.fixed_planes = brw_vec4_grf(i, 0);
i += (6 + c->key.nr_userclip + 1) / 2;
c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2;
}
else
c->prog_data.curb_read_length = 0;
/* Payload vertices plus space for more generated vertices:
*/
for (j = 0; j < 4; j++) {
c->reg.vertex[j] = brw_vec4_grf(i, 0);
i += c->nr_regs;
}
c->reg.t = brw_vec1_grf(i, 0);
c->reg.t0 = brw_vec1_grf(i, 1);
c->reg.t1 = brw_vec1_grf(i, 2);
c->reg.planemask = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD);
c->reg.plane_equation = brw_vec4_grf(i, 4);
i++;
c->reg.dp0 = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */
c->reg.dp1 = brw_vec1_grf(i, 4);
i++;
if (!c->key.nr_userclip) {
c->reg.fixed_planes = brw_vec8_grf(i, 0);
i++;
}
if (c->need_ff_sync) {
c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
i++;
}
c->first_tmp = i;
c->last_tmp = i;
c->prog_data.urb_read_length = c->nr_regs; /* ? */
c->prog_data.total_grf = i;
}
/* Line clipping, more or less following the following algorithm:
*
* for (p=0;p<MAX_PLANES;p++) {
* if (clipmask & (1 << p)) {
* GLfloat dp0 = DOTPROD( vtx0, plane[p] );
* GLfloat dp1 = DOTPROD( vtx1, plane[p] );
*
* if (IS_NEGATIVE(dp1)) {
* GLfloat t = dp1 / (dp1 - dp0);
* if (t > t1) t1 = t;
* } else {
* GLfloat t = dp0 / (dp0 - dp1);
* if (t > t0) t0 = t;
* }
*
* if (t0 + t1 >= 1.0)
* return;
* }
* }
*
* interp( ctx, newvtx0, vtx0, vtx1, t0 );
* interp( ctx, newvtx1, vtx1, vtx0, t1 );
*
*/
static void clip_and_emit_line( struct brw_clip_compile *c )
{
struct brw_compile *p = &c->func;
struct brw_indirect vtx0 = brw_indirect(0, 0);
struct brw_indirect vtx1 = brw_indirect(1, 0);
struct brw_indirect newvtx0 = brw_indirect(2, 0);
struct brw_indirect newvtx1 = brw_indirect(3, 0);
struct brw_indirect plane_ptr = brw_indirect(4, 0);
struct brw_instruction *plane_loop;
struct brw_instruction *plane_active;
struct brw_instruction *is_negative;
struct brw_instruction *is_neg2 = NULL;
struct brw_instruction *not_culled;
struct brw_reg v1_null_ud = retype(vec1(brw_null_reg()), BRW_REGISTER_TYPE_UD);
brw_MOV(p, get_addr_reg(vtx0), brw_address(c->reg.vertex[0]));
brw_MOV(p, get_addr_reg(vtx1), brw_address(c->reg.vertex[1]));
brw_MOV(p, get_addr_reg(newvtx0), brw_address(c->reg.vertex[2]));
brw_MOV(p, get_addr_reg(newvtx1), brw_address(c->reg.vertex[3]));
brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c));
/* Note: init t0, t1 together:
*/
brw_MOV(p, vec2(c->reg.t0), brw_imm_f(0));
brw_clip_init_planes(c);
brw_clip_init_clipmask(c);
/* -ve rhw workaround */
if (BRW_IS_965(p->brw)) {
brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
brw_imm_ud(1<<20));
brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(0x3f));
}
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
plane_loop = brw_DO(p, BRW_EXECUTE_1);
{
/* if (planemask & 1)
*/
brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
brw_AND(p, v1_null_ud, c->reg.planemask, brw_imm_ud(1));
plane_active = brw_IF(p, BRW_EXECUTE_1);
{
if (c->key.nr_userclip)
brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0));
else
brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0));
/* dp = DP4(vtx->position, plane)
*/
brw_DP4(p, vec4(c->reg.dp0), deref_4f(vtx0, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation);
/* if (IS_NEGATIVE(dp1))
*/
brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
brw_DP4(p, vec4(c->reg.dp1), deref_4f(vtx1, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation);
is_negative = brw_IF(p, BRW_EXECUTE_1);
{
/*
* Both can be negative on GM965/G965 due to RHW workaround
* if so, this object should be rejected.
*/
if (BRW_IS_965(p->brw)) {
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE, c->reg.dp0, brw_imm_f(0.0));
is_neg2 = brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_kill_thread(c);
}
brw_ENDIF(p, is_neg2);
}
brw_ADD(p, c->reg.t, c->reg.dp1, negate(c->reg.dp0));
brw_math_invert(p, c->reg.t, c->reg.t);
brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp1);
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t1 );
brw_MOV(p, c->reg.t1, c->reg.t);
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
is_negative = brw_ELSE(p, is_negative);
{
/* Coming back in. We know that both cannot be negative
* because the line would have been culled in that case.
*/
/* If both are positive, do nothing */
/* Only on GM965/G965 */
if (BRW_IS_965(p->brw)) {
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0));
is_neg2 = brw_IF(p, BRW_EXECUTE_1);
}
{
brw_ADD(p, c->reg.t, c->reg.dp0, negate(c->reg.dp1));
brw_math_invert(p, c->reg.t, c->reg.t);
brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp0);
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t0 );
brw_MOV(p, c->reg.t0, c->reg.t);
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
if (BRW_IS_965(p->brw)) {
brw_ENDIF(p, is_neg2);
}
}
brw_ENDIF(p, is_negative);
}
brw_ENDIF(p, plane_active);
/* plane_ptr++;
*/
brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c));
/* while (planemask>>=1) != 0
*/
brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1));
}
brw_WHILE(p, plane_loop);
brw_ADD(p, c->reg.t, c->reg.t0, c->reg.t1);
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.t, brw_imm_f(1.0));
not_culled = brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_interp_vertex(c, newvtx0, vtx0, vtx1, c->reg.t0, GL_FALSE);
brw_clip_interp_vertex(c, newvtx1, vtx1, vtx0, c->reg.t1, GL_FALSE);
brw_clip_emit_vue(c, newvtx0, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_START);
brw_clip_emit_vue(c, newvtx1, 0, 1, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_END);
}
brw_ENDIF(p, not_culled);
brw_clip_kill_thread(c);
}
void brw_emit_line_clip( struct brw_clip_compile *c )
{
brw_clip_line_alloc_regs(c);
brw_clip_init_ff_sync(c);
if (c->key.do_flat_shading)
brw_clip_copy_colors(c, 0, 1);
clip_and_emit_line(c);
}
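
clip_and_emit_line() above is the GEN4 EU rendition of the parametric clip spelled out in the block comment earlier in this file. A plain scalar C sketch of the same t0/t1 accumulation (hypothetical helper names, only the six fixed frustum planes, no RHW workaround) is easier to step through:

#include <stdio.h>

struct vec4 { float x, y, z, w; };

static float dot4(struct vec4 a, struct vec4 b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}

static struct vec4 lerp4(struct vec4 a, struct vec4 b, float t)
{
    struct vec4 r = {
        a.x + t * (b.x - a.x), a.y + t * (b.y - a.y),
        a.z + t * (b.z - a.z), a.w + t * (b.w - a.w)
    };
    return r;
}

/* Clip the segment v0-v1 against the six fixed frustum planes, writing the
 * surviving endpoints to out0/out1.  Returns 0 if the line is culled.
 * Mirrors the t0/t1 bookkeeping in clip_and_emit_line().
 */
static int clip_line(struct vec4 v0, struct vec4 v1,
                     struct vec4 *out0, struct vec4 *out1)
{
    static const struct vec4 planes[6] = {
        {  1, 0, 0, 1 }, { -1, 0, 0, 1 },   /* x >= -w,  x <= w */
        {  0, 1, 0, 1 }, {  0, -1, 0, 1 },  /* y >= -w,  y <= w */
        {  0, 0, 1, 1 }, {  0, 0, -1, 1 },  /* z >= -w,  z <= w */
    };
    float t0 = 0.0f, t1 = 0.0f;

    for (int p = 0; p < 6; p++) {
        float dp0 = dot4(v0, planes[p]);
        float dp1 = dot4(v1, planes[p]);

        if (dp1 < 0.0f) {                   /* v1 outside: advance t1 */
            float t = dp1 / (dp1 - dp0);
            if (t > t1) t1 = t;
        } else if (dp0 < 0.0f) {            /* v0 outside: advance t0 */
            float t = dp0 / (dp0 - dp1);
            if (t > t0) t0 = t;
        }
        if (t0 + t1 >= 1.0f)
            return 0;                       /* nothing left of the segment */
    }
    *out0 = lerp4(v0, v1, t0);              /* interp towards the other end */
    *out1 = lerp4(v1, v0, t1);
    return 1;
}

int main(void)
{
    struct vec4 a = { -2.0f, 0.0f, 0.0f, 1.0f };
    struct vec4 b = {  0.5f, 0.0f, 0.0f, 1.0f };
    struct vec4 c0, c1;

    if (clip_line(a, b, &c0, &c1))
        printf("clipped x range: %.2f .. %.2f\n", c0.x, c1.x);  /* -1.00 .. 0.50 */
    return 0;
}

As in the pseudocode, t0 moves the start point, t1 moves the end point, and the segment is dropped as soon as t0 + t1 reaches 1.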


@@ -0,0 +1,56 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "main/glheader.h"
#include "main/macros.h"
#include "main/enums.h"
#include "shader/program.h"
#include "intel_batchbuffer.h"
#include "brw_defines.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_util.h"
#include "brw_clip.h"
/* Point clipping, nothing to do?
*/
void brw_emit_point_clip( struct brw_clip_compile *c )
{
/* Send an empty message to kill the thread:
*/
brw_clip_tri_alloc_regs(c, 0);
brw_clip_init_ff_sync(c);
brw_clip_kill_thread(c);
}


@@ -0,0 +1,184 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "brw_context.h"
#include "brw_state.h"
#include "brw_defines.h"
#include "main/macros.h"
struct brw_clip_unit_key {
unsigned int total_grf;
unsigned int urb_entry_read_length;
unsigned int curb_entry_read_length;
unsigned int clip_mode;
unsigned int curbe_offset;
unsigned int nr_urb_entries, urb_size;
GLboolean depth_clamp;
};
static void
clip_unit_populate_key(struct brw_context *brw, struct brw_clip_unit_key *key)
{
GLcontext *ctx = &brw->intel.ctx;
memset(key, 0, sizeof(*key));
/* CACHE_NEW_CLIP_PROG */
key->total_grf = brw->clip.prog_data->total_grf;
key->urb_entry_read_length = brw->clip.prog_data->urb_read_length;
key->curb_entry_read_length = brw->clip.prog_data->curb_read_length;
key->clip_mode = brw->clip.prog_data->clip_mode;
/* BRW_NEW_CURBE_OFFSETS */
key->curbe_offset = brw->curbe.clip_start;
/* BRW_NEW_URB_FENCE */
key->nr_urb_entries = brw->urb.nr_clip_entries;
key->urb_size = brw->urb.vsize;
/* _NEW_TRANSFORM */
key->depth_clamp = ctx->Transform.DepthClamp;
}
static dri_bo *
clip_unit_create_from_key(struct brw_context *brw,
struct brw_clip_unit_key *key)
{
struct brw_clip_unit_state clip;
dri_bo *bo;
memset(&clip, 0, sizeof(clip));
clip.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1;
/* reloc */
clip.thread0.kernel_start_pointer = brw->clip.prog_bo->offset >> 6;
clip.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
clip.thread1.single_program_flow = 1;
clip.thread3.urb_entry_read_length = key->urb_entry_read_length;
clip.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
clip.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
clip.thread3.dispatch_grf_start_reg = 1;
clip.thread3.urb_entry_read_offset = 0;
clip.thread4.nr_urb_entries = key->nr_urb_entries;
clip.thread4.urb_entry_allocation_size = key->urb_size - 1;
/* If we have enough clip URB entries to run two threads, do so.
*/
if (key->nr_urb_entries >= 10) {
/* Half of the URB entries go to each thread, and it has to be an
* even number.
*/
assert(key->nr_urb_entries % 2 == 0);
/* Although up to 16 concurrent Clip threads are allowed on IGDNG,
* only 2 threads can output VUEs at a time.
*/
if (BRW_IS_IGDNG(brw))
clip.thread4.max_threads = 16 - 1;
else
clip.thread4.max_threads = 2 - 1;
} else {
assert(key->nr_urb_entries >= 5);
clip.thread4.max_threads = 1 - 1;
}
if (INTEL_DEBUG & DEBUG_SINGLE_THREAD)
clip.thread4.max_threads = 0;
if (INTEL_DEBUG & DEBUG_STATS)
clip.thread4.stats_enable = 1;
clip.clip5.userclip_enable_flags = 0x7f;
clip.clip5.userclip_must_clip = 1;
clip.clip5.guard_band_enable = 0;
if (!key->depth_clamp)
clip.clip5.viewport_z_clip_enable = 1;
clip.clip5.viewport_xy_clip_enable = 1;
clip.clip5.vertex_position_space = BRW_CLIP_NDCSPACE;
clip.clip5.api_mode = BRW_CLIP_API_OGL;
clip.clip5.clip_mode = key->clip_mode;
if (BRW_IS_G4X(brw))
clip.clip5.negative_w_clip_test = 1;
clip.clip6.clipper_viewport_state_ptr = 0;
clip.viewport_xmin = -1;
clip.viewport_xmax = 1;
clip.viewport_ymin = -1;
clip.viewport_ymax = 1;
bo = brw_upload_cache(&brw->cache, BRW_CLIP_UNIT,
key, sizeof(*key),
&brw->clip.prog_bo, 1,
&clip, sizeof(clip),
NULL, NULL);
/* Emit clip program relocation */
assert(brw->clip.prog_bo);
dri_bo_emit_reloc(bo,
I915_GEM_DOMAIN_INSTRUCTION,
0,
clip.thread0.grf_reg_count << 1,
offsetof(struct brw_clip_unit_state, thread0),
brw->clip.prog_bo);
return bo;
}
static void upload_clip_unit( struct brw_context *brw )
{
struct brw_clip_unit_key key;
clip_unit_populate_key(brw, &key);
dri_bo_unreference(brw->clip.state_bo);
brw->clip.state_bo = brw_search_cache(&brw->cache, BRW_CLIP_UNIT,
&key, sizeof(key),
&brw->clip.prog_bo, 1,
NULL);
if (brw->clip.state_bo == NULL) {
brw->clip.state_bo = clip_unit_create_from_key(brw, &key);
}
}
const struct brw_tracked_state brw_clip_unit = {
.dirty = {
.mesa = _NEW_TRANSFORM,
.brw = (BRW_NEW_CURBE_OFFSETS |
BRW_NEW_URB_FENCE),
.cache = CACHE_NEW_CLIP_PROG
},
.prepare = upload_clip_unit,
};
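
Two encodings above are easy to gloss over: grf_reg_count stores the kernel's register footprint rounded up to 16 GRFs and kept as that count minus one, and kernel_start_pointer stores a 64-byte-aligned buffer offset shifted right by 6 (the CC viewport pointer earlier in this commit is shifted by 5 for its 32-byte alignment). A toy illustration with made-up numbers:

#include <stdio.h>

#define ALIGN(value, alignment)  (((value) + (alignment) - 1) & ~((alignment) - 1))

int main(void)
{
    unsigned total_grf = 23;                /* made-up kernel register count */
    unsigned long kernel_offset = 0x12340;  /* made-up, 64-byte aligned */

    /* grf_reg_count: rounded up to a multiple of 16 registers and stored as
     * (count / 16 - 1), exactly as clip_unit_create_from_key() does.
     */
    unsigned grf_reg_count = ALIGN(total_grf, 16) / 16 - 1;

    /* kernel_start_pointer: the hardware field only holds the address bits
     * above the 64-byte alignment, hence the >> 6.
     */
    unsigned long kernel_start_pointer = kernel_offset >> 6;

    printf("grf_reg_count = %u (encodes %u GRFs)\n",
           grf_reg_count, (grf_reg_count + 1) * 16);
    printf("kernel_start_pointer = 0x%lx\n", kernel_start_pointer);
    return 0;
}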


@@ -0,0 +1,603 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "main/glheader.h"
#include "main/macros.h"
#include "main/enums.h"
#include "shader/program.h"
#include "intel_batchbuffer.h"
#include "brw_defines.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_util.h"
#include "brw_clip.h"
static void release_tmps( struct brw_clip_compile *c )
{
c->last_tmp = c->first_tmp;
}
void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
GLuint nr_verts )
{
GLuint i = 0,j;
/* Register usage is static, precompute here:
*/
c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
if (c->key.nr_userclip) {
c->reg.fixed_planes = brw_vec4_grf(i, 0);
i += (6 + c->key.nr_userclip + 1) / 2;
c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2;
}
else
c->prog_data.curb_read_length = 0;
/* Payload vertices plus space for more generated vertices:
*/
for (j = 0; j < nr_verts; j++) {
c->reg.vertex[j] = brw_vec4_grf(i, 0);
i += c->nr_regs;
}
if (c->nr_attrs & 1) {
for (j = 0; j < 3; j++) {
GLuint delta = c->nr_attrs*16 + 32;
if (BRW_IS_IGDNG(c->func.brw))
delta = c->nr_attrs * 16 + 32 * 3;
brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0));
}
}
c->reg.t = brw_vec1_grf(i, 0);
c->reg.loopcount = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_D);
c->reg.nr_verts = retype(brw_vec1_grf(i, 2), BRW_REGISTER_TYPE_UD);
c->reg.planemask = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD);
c->reg.plane_equation = brw_vec4_grf(i, 4);
i++;
c->reg.dpPrev = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */
c->reg.dp = brw_vec1_grf(i, 4);
i++;
c->reg.inlist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
i++;
c->reg.outlist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
i++;
c->reg.freelist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
i++;
if (!c->key.nr_userclip) {
c->reg.fixed_planes = brw_vec8_grf(i, 0);
i++;
}
if (c->key.do_unfilled) {
c->reg.dir = brw_vec4_grf(i, 0);
c->reg.offset = brw_vec4_grf(i, 4);
i++;
c->reg.tmp0 = brw_vec4_grf(i, 0);
c->reg.tmp1 = brw_vec4_grf(i, 4);
i++;
}
if (c->need_ff_sync) {
c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
i++;
}
c->first_tmp = i;
c->last_tmp = i;
c->prog_data.urb_read_length = c->nr_regs; /* ? */
c->prog_data.total_grf = i;
}
void brw_clip_tri_init_vertices( struct brw_clip_compile *c )
{
struct brw_compile *p = &c->func;
struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */
struct brw_instruction *is_rev;
/* Initial list of indices for incoming vertexes:
*/
brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK));
brw_CMP(p,
vec1(brw_null_reg()),
BRW_CONDITIONAL_EQ,
tmp0,
brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE));
/* XXX: Is there an easier way to do this? Need to reverse every
* second tristrip element: Can ignore sometimes?
*/
is_rev = brw_IF(p, BRW_EXECUTE_1);
{
brw_MOV(p, get_element(c->reg.inlist, 0), brw_address(c->reg.vertex[1]) );
brw_MOV(p, get_element(c->reg.inlist, 1), brw_address(c->reg.vertex[0]) );
if (c->need_direction)
brw_MOV(p, c->reg.dir, brw_imm_f(-1));
}
is_rev = brw_ELSE(p, is_rev);
{
brw_MOV(p, get_element(c->reg.inlist, 0), brw_address(c->reg.vertex[0]) );
brw_MOV(p, get_element(c->reg.inlist, 1), brw_address(c->reg.vertex[1]) );
if (c->need_direction)
brw_MOV(p, c->reg.dir, brw_imm_f(1));
}
brw_ENDIF(p, is_rev);
brw_MOV(p, get_element(c->reg.inlist, 2), brw_address(c->reg.vertex[2]) );
brw_MOV(p, brw_vec8_grf(c->reg.outlist.nr, 0), brw_imm_f(0));
brw_MOV(p, c->reg.nr_verts, brw_imm_ud(3));
}
void brw_clip_tri_flat_shade( struct brw_clip_compile *c )
{
struct brw_compile *p = &c->func;
struct brw_instruction *is_poly;
struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */
brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK));
brw_CMP(p,
vec1(brw_null_reg()),
BRW_CONDITIONAL_EQ,
tmp0,
brw_imm_ud(_3DPRIM_POLYGON));
is_poly = brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_copy_colors(c, 1, 0);
brw_clip_copy_colors(c, 2, 0);
}
is_poly = brw_ELSE(p, is_poly);
{
brw_clip_copy_colors(c, 0, 2);
brw_clip_copy_colors(c, 1, 2);
}
brw_ENDIF(p, is_poly);
}
/* Use mesa's clipping algorithms, translated to GEN4 assembly.
*/
void brw_clip_tri( struct brw_clip_compile *c )
{
struct brw_compile *p = &c->func;
struct brw_indirect vtx = brw_indirect(0, 0);
struct brw_indirect vtxPrev = brw_indirect(1, 0);
struct brw_indirect vtxOut = brw_indirect(2, 0);
struct brw_indirect plane_ptr = brw_indirect(3, 0);
struct brw_indirect inlist_ptr = brw_indirect(4, 0);
struct brw_indirect outlist_ptr = brw_indirect(5, 0);
struct brw_indirect freelist_ptr = brw_indirect(6, 0);
struct brw_instruction *plane_loop;
struct brw_instruction *plane_active;
struct brw_instruction *vertex_loop;
struct brw_instruction *next_test;
struct brw_instruction *prev_test;
brw_MOV(p, get_addr_reg(vtxPrev), brw_address(c->reg.vertex[2]) );
brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c));
brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist));
brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist));
brw_MOV(p, get_addr_reg(freelist_ptr), brw_address(c->reg.vertex[3]) );
plane_loop = brw_DO(p, BRW_EXECUTE_1);
{
/* if (planemask & 1)
*/
brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
brw_AND(p, vec1(brw_null_reg()), c->reg.planemask, brw_imm_ud(1));
plane_active = brw_IF(p, BRW_EXECUTE_1);
{
/* vtxOut = freelist_ptr++
*/
brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(freelist_ptr) );
brw_ADD(p, get_addr_reg(freelist_ptr), get_addr_reg(freelist_ptr), brw_imm_uw(c->nr_regs * REG_SIZE));
if (c->key.nr_userclip)
brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0));
else
brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0));
brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
brw_MOV(p, c->reg.nr_verts, brw_imm_ud(0));
vertex_loop = brw_DO(p, BRW_EXECUTE_1);
{
/* vtx = *input_ptr;
*/
brw_MOV(p, get_addr_reg(vtx), deref_1uw(inlist_ptr, 0));
/* IS_NEGATIVE(prev) */
brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
brw_DP4(p, vec4(c->reg.dpPrev), deref_4f(vtxPrev, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation);
prev_test = brw_IF(p, BRW_EXECUTE_1);
{
/* IS_POSITIVE(next)
*/
brw_set_conditionalmod(p, BRW_CONDITIONAL_GE);
brw_DP4(p, vec4(c->reg.dp), deref_4f(vtx, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation);
next_test = brw_IF(p, BRW_EXECUTE_1);
{
/* Coming back in.
*/
brw_ADD(p, c->reg.t, c->reg.dpPrev, negate(c->reg.dp));
brw_math_invert(p, c->reg.t, c->reg.t);
brw_MUL(p, c->reg.t, c->reg.t, c->reg.dpPrev);
/* If (vtxOut == 0) vtxOut = vtxPrev
*/
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) );
brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtxPrev) );
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
brw_clip_interp_vertex(c, vtxOut, vtxPrev, vtx, c->reg.t, GL_FALSE);
/* *outlist_ptr++ = vtxOut;
* nr_verts++;
* vtxOut = 0;
*/
brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut));
brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) );
}
brw_ENDIF(p, next_test);
}
prev_test = brw_ELSE(p, prev_test);
{
/* *outlist_ptr++ = vtxPrev;
* nr_verts++;
*/
brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxPrev));
brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
/* IS_NEGATIVE(next)
*/
brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
brw_DP4(p, vec4(c->reg.dp), deref_4f(vtx, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation);
next_test = brw_IF(p, BRW_EXECUTE_1);
{
/* Going out of bounds. Avoid division by zero as we
* know dp != dpPrev from DIFFERENT_SIGNS, above.
*/
brw_ADD(p, c->reg.t, c->reg.dp, negate(c->reg.dpPrev));
brw_math_invert(p, c->reg.t, c->reg.t);
brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp);
/* If (vtxOut == 0) vtxOut = vtx
*/
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) );
brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtx) );
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
brw_clip_interp_vertex(c, vtxOut, vtx, vtxPrev, c->reg.t, GL_TRUE);
/* *outlist_ptr++ = vtxOut;
* nr_verts++;
* vtxOut = 0;
*/
brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut));
brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) );
}
brw_ENDIF(p, next_test);
}
brw_ENDIF(p, prev_test);
/* vtxPrev = vtx;
* inlist_ptr++;
*/
brw_MOV(p, get_addr_reg(vtxPrev), get_addr_reg(vtx));
brw_ADD(p, get_addr_reg(inlist_ptr), get_addr_reg(inlist_ptr), brw_imm_uw(sizeof(short)));
/* while (--loopcount != 0)
*/
brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
}
brw_WHILE(p, vertex_loop);
/* vtxPrev = *(outlist_ptr-1) OR: outlist[nr_verts-1]
* inlist = outlist
* inlist_ptr = &inlist[0]
* outlist_ptr = &outlist[0]
*/
brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_w(-2));
brw_MOV(p, get_addr_reg(vtxPrev), deref_1uw(outlist_ptr, 0));
brw_MOV(p, brw_vec8_grf(c->reg.inlist.nr, 0), brw_vec8_grf(c->reg.outlist.nr, 0));
brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist));
brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist));
}
brw_ENDIF(p, plane_active);
/* plane_ptr++;
*/
brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c));
/* nr_verts >= 3
*/
brw_CMP(p,
vec1(brw_null_reg()),
BRW_CONDITIONAL_GE,
c->reg.nr_verts,
brw_imm_ud(3));
/* && (planemask>>=1) != 0
*/
brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1));
}
brw_WHILE(p, plane_loop);
}
void brw_clip_tri_emit_polygon(struct brw_clip_compile *c)
{
struct brw_compile *p = &c->func;
struct brw_instruction *loop, *if_insn;
/* for (loopcount = nr_verts-2; loopcount > 0; loopcount--)
*/
brw_set_conditionalmod(p, BRW_CONDITIONAL_G);
brw_ADD(p,
c->reg.loopcount,
c->reg.nr_verts,
brw_imm_d(-2));
if_insn = brw_IF(p, BRW_EXECUTE_1);
{
struct brw_indirect v0 = brw_indirect(0, 0);
struct brw_indirect vptr = brw_indirect(1, 0);
brw_MOV(p, get_addr_reg(vptr), brw_address(c->reg.inlist));
brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0));
brw_clip_emit_vue(c, v0, 1, 0, ((_3DPRIM_TRIFAN << 2) | R02_PRIM_START));
brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2));
brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0));
loop = brw_DO(p, BRW_EXECUTE_1);
{
brw_clip_emit_vue(c, v0, 1, 0, (_3DPRIM_TRIFAN << 2));
brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2));
brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0));
brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
}
brw_WHILE(p, loop);
brw_clip_emit_vue(c, v0, 0, 1, ((_3DPRIM_TRIFAN << 2) | R02_PRIM_END));
}
brw_ENDIF(p, if_insn);
}
static void do_clip_tri( struct brw_clip_compile *c )
{
brw_clip_init_planes(c);
brw_clip_tri(c);
}
static void maybe_do_clip_tri( struct brw_clip_compile *c )
{
struct brw_compile *p = &c->func;
struct brw_instruction *do_clip;
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0));
do_clip = brw_IF(p, BRW_EXECUTE_1);
{
do_clip_tri(c);
}
brw_ENDIF(p, do_clip);
}
static void brw_clip_test( struct brw_clip_compile *c )
{
struct brw_reg t = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
struct brw_reg t1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
struct brw_reg t2 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
struct brw_reg t3 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
struct brw_reg v0 = get_tmp(c);
struct brw_reg v1 = get_tmp(c);
struct brw_reg v2 = get_tmp(c);
struct brw_indirect vt0 = brw_indirect(0, 0);
struct brw_indirect vt1 = brw_indirect(1, 0);
struct brw_indirect vt2 = brw_indirect(2, 0);
struct brw_compile *p = &c->func;
struct brw_instruction *is_outside;
struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */
brw_MOV(p, get_addr_reg(vt0), brw_address(c->reg.vertex[0]));
brw_MOV(p, get_addr_reg(vt1), brw_address(c->reg.vertex[1]));
brw_MOV(p, get_addr_reg(vt2), brw_address(c->reg.vertex[2]));
brw_MOV(p, v0, deref_4f(vt0, c->offset[VERT_RESULT_HPOS]));
brw_MOV(p, v1, deref_4f(vt1, c->offset[VERT_RESULT_HPOS]));
brw_MOV(p, v2, deref_4f(vt2, c->offset[VERT_RESULT_HPOS]));
brw_AND(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(~0x3f));
/* test nearz, xmin, ymin plane */
/* clip.xyz < -clip.w */
brw_CMP(p, t1, BRW_CONDITIONAL_L, v0, negate(get_element(v0, 3)));
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
brw_CMP(p, t2, BRW_CONDITIONAL_L, v1, negate(get_element(v1, 3)));
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
brw_CMP(p, t3, BRW_CONDITIONAL_L, v2, negate(get_element(v2, 3)));
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
/* All vertices are outside of a plane, rejected */
brw_AND(p, t, t1, t2);
brw_AND(p, t, t, t3);
brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1));
brw_OR(p, tmp0, tmp0, get_element(t, 2));
brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1));
is_outside = brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_kill_thread(c);
}
brw_ENDIF(p, is_outside);
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
/* some vertices are inside a plane, some are outside,need to clip */
brw_XOR(p, t, t1, t2);
brw_XOR(p, t1, t2, t3);
brw_OR(p, t, t, t1);
brw_AND(p, t, t, brw_imm_ud(0x1));
brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
get_element(t, 0), brw_imm_ud(0));
brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<5)));
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
get_element(t, 1), brw_imm_ud(0));
brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<3)));
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
get_element(t, 2), brw_imm_ud(0));
brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<1)));
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
/* test farz, xmax, ymax plane */
/* clip.xyz > clip.w */
brw_CMP(p, t1, BRW_CONDITIONAL_G, v0, get_element(v0, 3));
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
brw_CMP(p, t2, BRW_CONDITIONAL_G, v1, get_element(v1, 3));
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
brw_CMP(p, t3, BRW_CONDITIONAL_G, v2, get_element(v2, 3));
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
/* All vertices are outside of a plane, rejected */
brw_AND(p, t, t1, t2);
brw_AND(p, t, t, t3);
brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1));
brw_OR(p, tmp0, tmp0, get_element(t, 2));
brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1));
is_outside = brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_kill_thread(c);
}
brw_ENDIF(p, is_outside);
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
/* some vertices are inside a plane, some are outside,need to clip */
brw_XOR(p, t, t1, t2);
brw_XOR(p, t1, t2, t3);
brw_OR(p, t, t, t1);
brw_AND(p, t, t, brw_imm_ud(0x1));
brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
get_element(t, 0), brw_imm_ud(0));
brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<4)));
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
get_element(t, 1), brw_imm_ud(0));
brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<2)));
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
get_element(t, 2), brw_imm_ud(0));
brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<0)));
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
release_tmps(c);
}
void brw_emit_tri_clip( struct brw_clip_compile *c )
{
struct brw_instruction *neg_rhw;
struct brw_compile *p = &c->func;
brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
brw_clip_tri_init_vertices(c);
brw_clip_init_clipmask(c);
brw_clip_init_ff_sync(c);
/* if -ve rhw workaround bit is set,
do cliptest */
if (BRW_IS_965(p->brw)) {
brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
brw_imm_ud(1<<20));
neg_rhw = brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_test(c);
}
brw_ENDIF(p, neg_rhw);
}
/* Can't push into do_clip_tri because with polygon (or quad)
* flatshading, need to apply the flatshade here because we don't
* respect the PV when converting to trifan for emit:
*/
if (c->key.do_flat_shading)
brw_clip_tri_flat_shade(c);
if ((c->key.clip_mode == BRW_CLIPMODE_NORMAL) ||
(c->key.clip_mode == BRW_CLIPMODE_KERNEL_CLIP))
do_clip_tri(c);
else
maybe_do_clip_tri(c);
brw_clip_tri_emit_polygon(c);
/* Send an empty message to kill the thread:
*/
brw_clip_kill_thread(c);
}
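
brw_clip_tri() above is a GEN4-assembly rendition of the classic Sutherland-Hodgman sweep: walk the polygon edges, keep vertices on the inside of the current plane, insert an interpolated vertex whenever an edge crosses the plane, then swap the in/out lists and move on to the next plane. A compact scalar sketch of a single plane pass, with plain float4 positions and none of the attribute interpolation or freelist management:

#include <stdio.h>

#define MAX_POLY_VERTS 16

struct vec4f { float x, y, z, w; };

static float dot4f(struct vec4f a, struct vec4f b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}

static struct vec4f mix4f(struct vec4f a, struct vec4f b, float t)
{
    struct vec4f r = {
        a.x + t * (b.x - a.x), a.y + t * (b.y - a.y),
        a.z + t * (b.z - a.z), a.w + t * (b.w - a.w)
    };
    return r;
}

/* Clip a convex polygon against one plane: walk each edge (prev -> cur),
 * keep the previous vertex when it is inside, and emit an intersection
 * point whenever the edge crosses the plane.  Returns the new vertex count.
 */
static int clip_poly_plane(const struct vec4f *in, int n,
                           struct vec4f plane, struct vec4f *out)
{
    int out_n = 0;
    struct vec4f prev = in[n - 1];
    float dp_prev = dot4f(prev, plane);

    for (int i = 0; i < n; i++) {
        float dp = dot4f(in[i], plane);

        if (dp_prev >= 0.0f)
            out[out_n++] = prev;                  /* prev inside: keep it */

        if ((dp_prev < 0.0f) != (dp < 0.0f)) {    /* edge crosses the plane */
            float t = dp_prev / (dp_prev - dp);
            out[out_n++] = mix4f(prev, in[i], t);
        }
        prev = in[i];
        dp_prev = dp;
    }
    return out_n;
}

int main(void)
{
    /* Triangle poking out of the x <= w plane (plane vector: -x + w >= 0). */
    struct vec4f tri[MAX_POLY_VERTS] = {
        { -0.5f, -0.5f, 0.0f, 1.0f },
        {  2.0f,  0.0f, 0.0f, 1.0f },
        { -0.5f,  0.5f, 0.0f, 1.0f },
    };
    struct vec4f clipped[MAX_POLY_VERTS];
    struct vec4f plane = { -1.0f, 0.0f, 0.0f, 1.0f };
    int n = clip_poly_plane(tri, 3, plane, clipped);

    printf("vertices after clipping: %d\n", n);   /* 3 in, 4 out */
    for (int i = 0; i < n; i++)
        printf("  (%.2f, %.2f)\n", clipped[i].x, clipped[i].y);
    return 0;
}

Each plane can add at most one vertex to a convex polygon, which is why MAX_VERTS in brw_clip.h budgets 3 + 6 + 6 slots: the three inputs plus one per fixed and per user clip plane.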


@@ -0,0 +1,505 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "main/glheader.h"
#include "main/macros.h"
#include "main/enums.h"
#include "shader/program.h"
#include "intel_batchbuffer.h"
#include "brw_defines.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_util.h"
#include "brw_clip.h"
/* This is performed against the original triangles, so no indirection
* required:
BZZZT!
*/
static void compute_tri_direction( struct brw_clip_compile *c )
{
struct brw_compile *p = &c->func;
struct brw_reg e = c->reg.tmp0;
struct brw_reg f = c->reg.tmp1;
struct brw_reg v0 = byte_offset(c->reg.vertex[0], c->offset[VERT_RESULT_HPOS]);
struct brw_reg v1 = byte_offset(c->reg.vertex[1], c->offset[VERT_RESULT_HPOS]);
struct brw_reg v2 = byte_offset(c->reg.vertex[2], c->offset[VERT_RESULT_HPOS]);
struct brw_reg v0n = get_tmp(c);
struct brw_reg v1n = get_tmp(c);
struct brw_reg v2n = get_tmp(c);
/* Convert to NDC.
* NOTE: We can't modify the original vertex coordinates,
* as modifying them could affect later operations,
* so we keep the normalized coordinates in temporary registers.
*
* TBD-KC
* Try to optimize unnecessary MOV's.
*/
brw_MOV(p, v0n, v0);
brw_MOV(p, v1n, v1);
brw_MOV(p, v2n, v2);
brw_clip_project_position(c, v0n);
brw_clip_project_position(c, v1n);
brw_clip_project_position(c, v2n);
/* Calculate the vectors of two edges of the triangle:
*/
brw_ADD(p, e, v0n, negate(v2n));
brw_ADD(p, f, v1n, negate(v2n));
/* Take their crossproduct:
*/
brw_set_access_mode(p, BRW_ALIGN_16);
brw_MUL(p, vec4(brw_null_reg()), brw_swizzle(e, 1,2,0,3), brw_swizzle(f,2,0,1,3));
brw_MAC(p, vec4(e), negate(brw_swizzle(e, 2,0,1,3)), brw_swizzle(f,1,2,0,3));
brw_set_access_mode(p, BRW_ALIGN_1);
brw_MUL(p, c->reg.dir, c->reg.dir, vec4(e));
}
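/* Editor's sketch (not part of the driver): the scalar math that the
* swizzled MUL/MAC sequence above implements, assuming v0n/v1n/v2n hold the
* projected (NDC) positions; the variable names here are illustrative only:
*
*    GLfloat e[3], f[3], cross[3];
*    GLuint i;
*    for (i = 0; i < 3; i++) {
*       e[i] = v0n[i] - v2n[i];
*       f[i] = v1n[i] - v2n[i];
*    }
*    cross[0] = e[1] * f[2] - e[2] * f[1];
*    cross[1] = e[2] * f[0] - e[0] * f[2];
*    cross[2] = e[0] * f[1] - e[1] * f[0];
*
* Only the sign of the z component matters to the users below:
* cull_direction() and copy_bfc() compare dir.z against zero to
* distinguish CW from CCW triangles in screen space.
*/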
static void cull_direction( struct brw_clip_compile *c )
{
struct brw_compile *p = &c->func;
struct brw_instruction *ccw;
GLuint conditional;
assert (!(c->key.fill_ccw == CLIP_CULL &&
c->key.fill_cw == CLIP_CULL));
if (c->key.fill_ccw == CLIP_CULL)
conditional = BRW_CONDITIONAL_GE;
else
conditional = BRW_CONDITIONAL_L;
brw_CMP(p,
vec1(brw_null_reg()),
conditional,
get_element(c->reg.dir, 2),
brw_imm_f(0));
ccw = brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_kill_thread(c);
}
brw_ENDIF(p, ccw);
}
static void copy_bfc( struct brw_clip_compile *c )
{
struct brw_compile *p = &c->func;
struct brw_instruction *ccw;
GLuint conditional;
/* Do we have any colors to copy?
*/
if (!(c->offset[VERT_RESULT_COL0] && c->offset[VERT_RESULT_BFC0]) &&
!(c->offset[VERT_RESULT_COL1] && c->offset[VERT_RESULT_BFC1]))
return;
/* In some weird degenerate cases we can end up testing the
* direction twice, once for culling and once for bfc copying. Oh
* well, that's what you get for setting weird GL state.
*/
if (c->key.copy_bfc_ccw)
conditional = BRW_CONDITIONAL_GE;
else
conditional = BRW_CONDITIONAL_L;
brw_CMP(p,
vec1(brw_null_reg()),
conditional,
get_element(c->reg.dir, 2),
brw_imm_f(0));
ccw = brw_IF(p, BRW_EXECUTE_1);
{
GLuint i;
for (i = 0; i < 3; i++) {
if (c->offset[VERT_RESULT_COL0] && c->offset[VERT_RESULT_BFC0])
brw_MOV(p,
byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_COL0]),
byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_BFC0]));
if (c->offset[VERT_RESULT_COL1] && c->offset[VERT_RESULT_BFC1])
brw_MOV(p,
byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_COL1]),
byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_BFC1]));
}
}
brw_ENDIF(p, ccw);
}
/*
GLfloat iz = 1.0 / dir.z;
GLfloat ac = dir.x * iz;
GLfloat bc = dir.y * iz;
offset = ctx->Polygon.OffsetUnits * DEPTH_SCALE;
offset += MAX2( abs(ac), abs(bc) ) * ctx->Polygon.OffsetFactor;
offset *= MRD;
*/
static void compute_offset( struct brw_clip_compile *c )
{
struct brw_compile *p = &c->func;
struct brw_reg off = c->reg.offset;
struct brw_reg dir = c->reg.dir;
brw_math_invert(p, get_element(off, 2), get_element(dir, 2));
brw_MUL(p, vec2(off), dir, get_element(off, 2));
brw_CMP(p,
vec1(brw_null_reg()),
BRW_CONDITIONAL_GE,
brw_abs(get_element(off, 0)),
brw_abs(get_element(off, 1)));
brw_SEL(p, vec1(off), brw_abs(get_element(off, 0)), brw_abs(get_element(off, 1)));
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
brw_MUL(p, vec1(off), off, brw_imm_f(c->key.offset_factor));
brw_ADD(p, vec1(off), off, brw_imm_f(c->key.offset_units));
}
static void merge_edgeflags( struct brw_clip_compile *c )
{
struct brw_compile *p = &c->func;
struct brw_instruction *is_poly;
struct brw_reg tmp0 = get_element_ud(c->reg.tmp0, 0);
brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK));
brw_CMP(p,
vec1(brw_null_reg()),
BRW_CONDITIONAL_EQ,
tmp0,
brw_imm_ud(_3DPRIM_POLYGON));
/* Get away with using reg.vertex because we know that this is not
* a _3DPRIM_TRISTRIP_REVERSE:
*/
is_poly = brw_IF(p, BRW_EXECUTE_1);
{
brw_set_conditionalmod(p, BRW_CONDITIONAL_EQ);
brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<8));
brw_MOV(p, byte_offset(c->reg.vertex[0], c->offset[VERT_RESULT_EDGE]), brw_imm_f(0));
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
brw_set_conditionalmod(p, BRW_CONDITIONAL_EQ);
brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<9));
brw_MOV(p, byte_offset(c->reg.vertex[2], c->offset[VERT_RESULT_EDGE]), brw_imm_f(0));
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
brw_ENDIF(p, is_poly);
}
static void apply_one_offset( struct brw_clip_compile *c,
struct brw_indirect vert )
{
struct brw_compile *p = &c->func;
struct brw_reg z = deref_1f(vert, c->header_position_offset +
2 * type_sz(BRW_REGISTER_TYPE_F));
brw_ADD(p, z, z, vec1(c->reg.offset));
}
/***********************************************************************
* Output clipped polygon as an unfilled primitive:
*/
static void emit_lines(struct brw_clip_compile *c,
GLboolean do_offset)
{
struct brw_compile *p = &c->func;
struct brw_instruction *loop;
struct brw_instruction *draw_edge;
struct brw_indirect v0 = brw_indirect(0, 0);
struct brw_indirect v1 = brw_indirect(1, 0);
struct brw_indirect v0ptr = brw_indirect(2, 0);
struct brw_indirect v1ptr = brw_indirect(3, 0);
/* Need a separate loop for offset:
*/
if (do_offset) {
brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist));
loop = brw_DO(p, BRW_EXECUTE_1);
{
brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0));
brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2));
apply_one_offset(c, v0);
brw_set_conditionalmod(p, BRW_CONDITIONAL_G);
brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
}
brw_WHILE(p, loop);
}
/* v1ptr = &inlist[nr_verts]
* *v1ptr = v0
*/
brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist));
brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v0ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW));
brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v1ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW));
brw_MOV(p, deref_1uw(v1ptr, 0), deref_1uw(v0ptr, 0));
loop = brw_DO(p, BRW_EXECUTE_1);
{
brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0));
brw_MOV(p, get_addr_reg(v1), deref_1uw(v0ptr, 2));
brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2));
/* draw edge if edgeflag != 0 */
brw_CMP(p,
vec1(brw_null_reg()), BRW_CONDITIONAL_NZ,
deref_1f(v0, c->offset[VERT_RESULT_EDGE]),
brw_imm_f(0));
draw_edge = brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_emit_vue(c, v0, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_START);
brw_clip_emit_vue(c, v1, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_END);
}
brw_ENDIF(p, draw_edge);
brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
}
brw_WHILE(p, loop);
}
static void emit_points(struct brw_clip_compile *c,
GLboolean do_offset )
{
struct brw_compile *p = &c->func;
struct brw_instruction *loop;
struct brw_instruction *draw_point;
struct brw_indirect v0 = brw_indirect(0, 0);
struct brw_indirect v0ptr = brw_indirect(2, 0);
brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist));
loop = brw_DO(p, BRW_EXECUTE_1);
{
brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0));
brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2));
/* draw if edgeflag != 0
*/
brw_CMP(p,
vec1(brw_null_reg()), BRW_CONDITIONAL_NZ,
deref_1f(v0, c->offset[VERT_RESULT_EDGE]),
brw_imm_f(0));
draw_point = brw_IF(p, BRW_EXECUTE_1);
{
if (do_offset)
apply_one_offset(c, v0);
brw_clip_emit_vue(c, v0, 1, 0, (_3DPRIM_POINTLIST << 2) | R02_PRIM_START | R02_PRIM_END);
}
brw_ENDIF(p, draw_point);
brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
}
brw_WHILE(p, loop);
}
static void emit_primitives( struct brw_clip_compile *c,
GLuint mode,
GLboolean do_offset )
{
switch (mode) {
case CLIP_FILL:
brw_clip_tri_emit_polygon(c);
break;
case CLIP_LINE:
emit_lines(c, do_offset);
break;
case CLIP_POINT:
emit_points(c, do_offset);
break;
case CLIP_CULL:
assert(0);
break;
}
}
static void emit_unfilled_primitives( struct brw_clip_compile *c )
{
struct brw_compile *p = &c->func;
struct brw_instruction *ccw;
/* Direction culling has already been done.
*/
if (c->key.fill_ccw != c->key.fill_cw &&
c->key.fill_ccw != CLIP_CULL &&
c->key.fill_cw != CLIP_CULL)
{
brw_CMP(p,
vec1(brw_null_reg()),
BRW_CONDITIONAL_GE,
get_element(c->reg.dir, 2),
brw_imm_f(0));
ccw = brw_IF(p, BRW_EXECUTE_1);
{
emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw);
}
ccw = brw_ELSE(p, ccw);
{
emit_primitives(c, c->key.fill_cw, c->key.offset_cw);
}
brw_ENDIF(p, ccw);
}
else if (c->key.fill_cw != CLIP_CULL) {
emit_primitives(c, c->key.fill_cw, c->key.offset_cw);
}
else if (c->key.fill_ccw != CLIP_CULL) {
emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw);
}
}
static void check_nr_verts( struct brw_clip_compile *c )
{
struct brw_compile *p = &c->func;
struct brw_instruction *if_insn;
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.nr_verts, brw_imm_d(3));
if_insn = brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_kill_thread(c);
}
brw_ENDIF(p, if_insn);
}
void brw_emit_unfilled_clip( struct brw_clip_compile *c )
{
struct brw_compile *p = &c->func;
struct brw_instruction *do_clip;
c->need_direction = ((c->key.offset_ccw || c->key.offset_cw) ||
(c->key.fill_ccw != c->key.fill_cw) ||
c->key.fill_ccw == CLIP_CULL ||
c->key.fill_cw == CLIP_CULL ||
c->key.copy_bfc_cw ||
c->key.copy_bfc_ccw);
brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
brw_clip_tri_init_vertices(c);
brw_clip_init_ff_sync(c);
assert(c->offset[VERT_RESULT_EDGE]);
if (c->key.fill_ccw == CLIP_CULL &&
c->key.fill_cw == CLIP_CULL) {
brw_clip_kill_thread(c);
return;
}
merge_edgeflags(c);
/* Need to use the inlist indirection here:
*/
if (c->need_direction)
compute_tri_direction(c);
if (c->key.fill_ccw == CLIP_CULL ||
c->key.fill_cw == CLIP_CULL)
cull_direction(c);
if (c->key.offset_ccw ||
c->key.offset_cw)
compute_offset(c);
if (c->key.copy_bfc_ccw ||
c->key.copy_bfc_cw)
copy_bfc(c);
/* Need to do this whether we clip or not:
*/
if (c->key.do_flat_shading)
brw_clip_tri_flat_shade(c);
brw_clip_init_clipmask(c);
brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0));
do_clip = brw_IF(p, BRW_EXECUTE_1);
{
brw_clip_init_planes(c);
brw_clip_tri(c);
check_nr_verts(c);
}
brw_ENDIF(p, do_clip);
emit_unfilled_primitives(c);
brw_clip_kill_thread(c);
}

View File

@ -0,0 +1,396 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "main/glheader.h"
#include "main/macros.h"
#include "main/enums.h"
#include "shader/program.h"
#include "intel_batchbuffer.h"
#include "brw_defines.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_util.h"
#include "brw_clip.h"
struct brw_reg get_tmp( struct brw_clip_compile *c )
{
struct brw_reg tmp = brw_vec4_grf(c->last_tmp, 0);
if (++c->last_tmp > c->prog_data.total_grf)
c->prog_data.total_grf = c->last_tmp;
return tmp;
}
static void release_tmp( struct brw_clip_compile *c, struct brw_reg tmp )
{
if (tmp.nr == c->last_tmp-1)
c->last_tmp--;
}
static struct brw_reg make_plane_ud(GLuint x, GLuint y, GLuint z, GLuint w)
{
return brw_imm_ud((w<<24) | (z<<16) | (y<<8) | x);
}
void brw_clip_init_planes( struct brw_clip_compile *c )
{
struct brw_compile *p = &c->func;
if (!c->key.nr_userclip) {
brw_MOV(p, get_element_ud(c->reg.fixed_planes, 0), make_plane_ud( 0, 0, 0xff, 1));
brw_MOV(p, get_element_ud(c->reg.fixed_planes, 1), make_plane_ud( 0, 0, 1, 1));
brw_MOV(p, get_element_ud(c->reg.fixed_planes, 2), make_plane_ud( 0, 0xff, 0, 1));
brw_MOV(p, get_element_ud(c->reg.fixed_planes, 3), make_plane_ud( 0, 1, 0, 1));
brw_MOV(p, get_element_ud(c->reg.fixed_planes, 4), make_plane_ud(0xff, 0, 0, 1));
brw_MOV(p, get_element_ud(c->reg.fixed_planes, 5), make_plane_ud( 1, 0, 0, 1));
}
}
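/* Editor's note (illustrative, not part of the driver): the packed dwords
* above encode the six fixed clip planes one byte per component, with 0xff
* standing in for -1 when the bytes are later read back as signed values.
* Decoded as (x, y, z, w) they match the fixed_plane[] float array set up
* for the CURBE path in brw_curbe.c:
*
*    { 0, 0, -1, 1 }, { 0, 0, 1, 1 },
*    { 0, -1, 0, 1 }, { 0, 1, 0, 1 },
*    {-1, 0, 0, 1 }, { 1, 0, 0, 1 }
*/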
#define W 3
/* Project 'pos' to screen space (or back again), overwrite with results:
*/
void brw_clip_project_position(struct brw_clip_compile *c, struct brw_reg pos )
{
struct brw_compile *p = &c->func;
/* calc rhw
*/
brw_math_invert(p, get_element(pos, W), get_element(pos, W));
/* value.xyz *= value.rhw
*/
brw_set_access_mode(p, BRW_ALIGN_16);
brw_MUL(p, brw_writemask(pos, WRITEMASK_XYZ), pos, brw_swizzle1(pos, W));
brw_set_access_mode(p, BRW_ALIGN_1);
}
static void brw_clip_project_vertex( struct brw_clip_compile *c,
struct brw_indirect vert_addr )
{
struct brw_compile *p = &c->func;
struct brw_reg tmp = get_tmp(c);
/* Fixup position. Extract from the original vertex and re-project
* to screen space:
*/
brw_MOV(p, tmp, deref_4f(vert_addr, c->offset[VERT_RESULT_HPOS]));
brw_clip_project_position(c, tmp);
brw_MOV(p, deref_4f(vert_addr, c->header_position_offset), tmp);
release_tmp(c, tmp);
}
/* Interpolate between two vertices and put the result into a0.0.
* Increment a0.0 accordingly.
*/
void brw_clip_interp_vertex( struct brw_clip_compile *c,
struct brw_indirect dest_ptr,
struct brw_indirect v0_ptr, /* from */
struct brw_indirect v1_ptr, /* to */
struct brw_reg t0,
GLboolean force_edgeflag)
{
struct brw_compile *p = &c->func;
struct brw_reg tmp = get_tmp(c);
GLuint i;
/* Just copy the vertex header:
*/
/*
* After the CLIP stage, only the first 256 bits of the VUE are read
* back on IGDNG, so there is no need to change them.
*/
brw_copy_indirect_to_indirect(p, dest_ptr, v0_ptr, 1);
/* Iterate over each attribute (could be done in pairs?)
*/
for (i = 0; i < c->nr_attrs; i++) {
GLuint delta = i*16 + 32;
if (BRW_IS_IGDNG(p->brw))
delta = i * 16 + 32 * 3;
if (delta == c->offset[VERT_RESULT_EDGE]) {
if (force_edgeflag)
brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(1));
else
brw_MOV(p, deref_4f(dest_ptr, delta), deref_4f(v0_ptr, delta));
}
else {
/* Interpolate:
*
* New = attr0 + t*attr1 - t*attr0
*/
brw_MUL(p,
vec4(brw_null_reg()),
deref_4f(v1_ptr, delta),
t0);
brw_MAC(p,
tmp,
negate(deref_4f(v0_ptr, delta)),
t0);
brw_ADD(p,
deref_4f(dest_ptr, delta),
deref_4f(v0_ptr, delta),
tmp);
}
}
if (i & 1) {
GLuint delta = i*16 + 32;
if (BRW_IS_IGDNG(p->brw))
delta = i * 16 + 32 * 3;
brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0));
}
release_tmp(c, tmp);
/* Recreate the projected (NDC) coordinate in the new vertex
* header:
*/
brw_clip_project_vertex(c, dest_ptr );
}
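/* Editor's sketch (illustrative only): per component, the MUL/MAC/ADD
* sequence in the loop above is a plain linear interpolation. Assuming a0
* and a1 are one component of the same attribute in the two source
* vertices and t is the intersection parameter:
*
*    static GLfloat lerp_attr(GLfloat a0, GLfloat a1, GLfloat t)
*    {
*       GLfloat acc = a1 * t - a0 * t;   // MUL into the accumulator, then MAC
*       return a0 + acc;                 // == a0 + t * (a1 - a0)
*    }
*/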
#define MAX_MRF 16
void brw_clip_emit_vue(struct brw_clip_compile *c,
struct brw_indirect vert,
GLboolean allocate,
GLboolean eot,
GLuint header)
{
struct brw_compile *p = &c->func;
GLuint start = c->last_mrf;
brw_clip_ff_sync(c);
assert(!(allocate && eot));
/* Cycle through mrf regs - probably futile as we have to wait for
* the allocation response anyway. Also, the order this function
* is invoked doesn't correspond to the order the instructions will
* be executed, so it won't have any effect in many cases.
*/
#if 0
if (start + c->nr_regs + 1 >= MAX_MRF)
start = 0;
c->last_mrf = start + c->nr_regs + 1;
#endif
/* Copy the vertex from vertn into m1..mN+1:
*/
brw_copy_from_indirect(p, brw_message_reg(start+1), vert, c->nr_regs);
/* Overwrite PrimType and PrimStart in the message header, for
* each vertex in turn:
*/
brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header));
/* Send each vertex as a separate write to the urb. This
* is different from the approach in brw_sf_emit.c, where
* subsequent writes are used to build up a single urb
* entry. Each of these writes instantiates a separate
* urb entry - (I think... what about 'allocate'?)
*/
brw_urb_WRITE(p,
allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
start,
c->reg.R0,
allocate,
1, /* used */
c->nr_regs + 1, /* msg length */
allocate ? 1 : 0, /* response_length */
eot, /* eot */
1, /* writes_complete */
0, /* urb offset */
BRW_URB_SWIZZLE_NONE);
}
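/* Editor's note (descriptive sketch, not authoritative): the message built
* above ends up laid out roughly as
*
*    m(start)                 header - a copy of R0 with PrimType/PrimStart
*                             patched into dword 2
*    m(start+1)..m(start+N)   the N = c->nr_regs registers of the vertex
*
* giving a message length of N + 1; the response length is 1 only when a
* new URB handle is being allocated, matching the arguments passed to
* brw_urb_WRITE().
*/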
void brw_clip_kill_thread(struct brw_clip_compile *c)
{
struct brw_compile *p = &c->func;
brw_clip_ff_sync(c);
/* Send an empty message to kill the thread and release any
* allocated urb entry:
*/
brw_urb_WRITE(p,
retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
0,
c->reg.R0,
0, /* allocate */
0, /* used */
1, /* msg len */
0, /* response len */
1, /* eot */
1, /* writes complete */
0,
BRW_URB_SWIZZLE_NONE);
}
struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c )
{
return brw_address(c->reg.fixed_planes);
}
struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c )
{
if (c->key.nr_userclip) {
return brw_imm_uw(16);
}
else {
return brw_imm_uw(4);
}
}
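/* Editor's note (assumption, for illustration): the two strides correspond
* to the two plane layouts used by this file - user clip planes arrive as
* four floats each (16 bytes), while the fallback fixed planes are packed
* one per dword (4 bytes) by make_plane_ud() above:
*
*    GLuint stride = c->key.nr_userclip ? 4 * sizeof(GLfloat)   // 16
*                                       : sizeof(GLuint);       //  4
*/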
/* If flatshading, distribute color from provoking vertex prior to
* clipping.
*/
void brw_clip_copy_colors( struct brw_clip_compile *c,
GLuint to, GLuint from )
{
struct brw_compile *p = &c->func;
if (c->offset[VERT_RESULT_COL0])
brw_MOV(p,
byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_COL0]),
byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_COL0]));
if (c->offset[VERT_RESULT_COL1])
brw_MOV(p,
byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_COL1]),
byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_COL1]));
if (c->offset[VERT_RESULT_BFC0])
brw_MOV(p,
byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_BFC0]),
byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_BFC0]));
if (c->offset[VERT_RESULT_BFC1])
brw_MOV(p,
byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_BFC1]),
byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_BFC1]));
}
void brw_clip_init_clipmask( struct brw_clip_compile *c )
{
struct brw_compile *p = &c->func;
struct brw_reg incoming = get_element_ud(c->reg.R0, 2);
/* Shift so that lowest outcode bit is rightmost:
*/
brw_SHR(p, c->reg.planemask, incoming, brw_imm_ud(26));
if (c->key.nr_userclip) {
struct brw_reg tmp = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UD);
/* Rearrange userclip outcodes so that they come directly after
* the fixed plane bits.
*/
brw_AND(p, tmp, incoming, brw_imm_ud(0x3f<<14));
brw_SHR(p, tmp, tmp, brw_imm_ud(8));
brw_OR(p, c->reg.planemask, c->reg.planemask, tmp);
release_tmp(c, tmp);
}
}
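/* Editor's sketch of the bit shuffling above, assuming the outcode layout
* implied by the shifts (fixed-plane outcodes in bits 26..31 of R0.2, user
* clip outcodes in bits 14..19); illustrative only:
*
*    GLuint incoming  = r0_dword2;
*    GLuint planemask = incoming >> 26;                  // fixed -> bits 0..5
*    GLuint user      = (incoming & (0x3f << 14)) >> 8;  // user  -> bits 6..11
*    planemask |= user;
*/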
void brw_clip_ff_sync(struct brw_clip_compile *c)
{
if (c->need_ff_sync) {
struct brw_compile *p = &c->func;
struct brw_instruction *need_ff_sync;
brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
brw_AND(p, brw_null_reg(), c->reg.ff_sync, brw_imm_ud(0x1));
need_ff_sync = brw_IF(p, BRW_EXECUTE_1);
{
brw_OR(p, c->reg.ff_sync, c->reg.ff_sync, brw_imm_ud(0x1));
brw_ff_sync(p,
c->reg.R0,
0,
c->reg.R0,
1,
1, /* used */
1, /* msg length */
1, /* response length */
0, /* eot */
1, /* write complete */
0, /* urb offset */
BRW_URB_SWIZZLE_NONE);
}
brw_ENDIF(p, need_ff_sync);
brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
}
void brw_clip_init_ff_sync(struct brw_clip_compile *c)
{
if (c->need_ff_sync) {
struct brw_compile *p = &c->func;
brw_MOV(p, c->reg.ff_sync, brw_imm_ud(0));
}
}

View File

@ -0,0 +1,173 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "main/imports.h"
#include "main/api_noop.h"
#include "main/macros.h"
#include "main/vtxfmt.h"
#include "main/simple_list.h"
#include "shader/shader_api.h"
#include "brw_context.h"
#include "brw_defines.h"
#include "brw_draw.h"
#include "brw_state.h"
#include "brw_vs.h"
#include "intel_tex.h"
#include "intel_blit.h"
#include "intel_batchbuffer.h"
#include "intel_pixel.h"
#include "intel_span.h"
#include "tnl/t_pipeline.h"
#include "utils.h"
/***************************************
* Mesa's Driver Functions
***************************************/
static void brwUseProgram(GLcontext *ctx, GLuint program)
{
_mesa_use_program(ctx, program);
}
static void brwInitProgFuncs( struct dd_function_table *functions )
{
functions->UseProgram = brwUseProgram;
}
static void brwInitDriverFunctions( struct dd_function_table *functions )
{
intelInitDriverFunctions( functions );
brwInitFragProgFuncs( functions );
brwInitProgFuncs( functions );
brw_init_queryobj_functions(functions);
functions->Viewport = intel_viewport;
}
GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
__DRIcontextPrivate *driContextPriv,
void *sharedContextPrivate)
{
struct dd_function_table functions;
struct brw_context *brw = (struct brw_context *) CALLOC_STRUCT(brw_context);
struct intel_context *intel = &brw->intel;
GLcontext *ctx = &intel->ctx;
if (!brw) {
_mesa_printf("%s: failed to alloc context\n", __FUNCTION__);
return GL_FALSE;
}
brwInitVtbl( brw );
brwInitDriverFunctions( &functions );
if (!intelInitContext( intel, mesaVis, driContextPriv,
sharedContextPrivate, &functions )) {
_mesa_printf("%s: failed to init intel context\n", __FUNCTION__);
FREE(brw);
return GL_FALSE;
}
/* Initialize swrast, tnl driver tables: */
intelInitSpanFuncs(ctx);
TNL_CONTEXT(ctx)->Driver.RunPipeline = _tnl_run_pipeline;
ctx->Const.MaxTextureImageUnits = BRW_MAX_TEX_UNIT;
ctx->Const.MaxTextureCoordUnits = 8; /* Mesa limit */
ctx->Const.MaxTextureUnits = MIN2(ctx->Const.MaxTextureCoordUnits,
ctx->Const.MaxTextureImageUnits);
ctx->Const.MaxVertexTextureImageUnits = 0; /* no vertex shader textures */
/* Mesa limits textures to 4kx4k; it would be nice to fix that someday
*/
ctx->Const.MaxTextureLevels = 13;
ctx->Const.Max3DTextureLevels = 9;
ctx->Const.MaxCubeTextureLevels = 12;
ctx->Const.MaxTextureRectSize = (1<<12);
ctx->Const.MaxTextureMaxAnisotropy = 16.0;
/* if conformance mode is set, swrast can handle any size AA point */
ctx->Const.MaxPointSizeAA = 255.0;
/* We want the GLSL compiler to emit code that uses condition codes */
ctx->Shader.EmitCondCodes = GL_TRUE;
ctx->Shader.EmitNVTempInitialization = GL_TRUE;
ctx->Const.VertexProgram.MaxNativeInstructions = (16 * 1024);
ctx->Const.VertexProgram.MaxAluInstructions = 0;
ctx->Const.VertexProgram.MaxTexInstructions = 0;
ctx->Const.VertexProgram.MaxTexIndirections = 0;
ctx->Const.VertexProgram.MaxNativeAluInstructions = 0;
ctx->Const.VertexProgram.MaxNativeTexInstructions = 0;
ctx->Const.VertexProgram.MaxNativeTexIndirections = 0;
ctx->Const.VertexProgram.MaxNativeAttribs = 16;
ctx->Const.VertexProgram.MaxNativeTemps = 256;
ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
ctx->Const.VertexProgram.MaxNativeParameters = 1024;
ctx->Const.VertexProgram.MaxEnvParams =
MIN2(ctx->Const.VertexProgram.MaxNativeParameters,
ctx->Const.VertexProgram.MaxEnvParams);
ctx->Const.FragmentProgram.MaxNativeInstructions = (16 * 1024);
ctx->Const.FragmentProgram.MaxNativeAluInstructions = (16 * 1024);
ctx->Const.FragmentProgram.MaxNativeTexInstructions = (16 * 1024);
ctx->Const.FragmentProgram.MaxNativeTexIndirections = (16 * 1024);
ctx->Const.FragmentProgram.MaxNativeAttribs = 12;
ctx->Const.FragmentProgram.MaxNativeTemps = 256;
ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;
ctx->Const.FragmentProgram.MaxNativeParameters = 1024;
ctx->Const.FragmentProgram.MaxEnvParams =
MIN2(ctx->Const.FragmentProgram.MaxNativeParameters,
ctx->Const.FragmentProgram.MaxEnvParams);
brw_init_state( brw );
brw->state.dirty.mesa = ~0;
brw->state.dirty.brw = ~0;
brw->emit_state_always = 0;
ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
make_empty_list(&brw->query.active_head);
brw_draw_init( brw );
return GL_TRUE;
}

View File

@ -0,0 +1,767 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#ifndef BRWCONTEXT_INC
#define BRWCONTEXT_INC
#include "intel_context.h"
#include "brw_structs.h"
#include "main/imports.h"
/* Glossary:
*
* URB - uniform resource buffer. A mid-sized buffer which is
* partitioned between the fixed function units and used for passing
* values (vertices, primitives, constants) between them.
*
* CURBE - constant URB entry. An urb region (entry) used to hold
* constant values which the fixed function units can be instructed to
* preload into the GRF when spawning a thread.
*
* VUE - vertex URB entry. An urb entry holding a vertex and usually
* a vertex header. The header contains control information and
* things like primitive type, Begin/end flags and clip codes.
*
* PUE - primitive URB entry. An urb entry produced by the setup (SF)
* unit holding rasterization and interpolation parameters.
*
* GRF - general register file. One of several register files
* addressable by programmed threads. The inputs (r0, payload, curbe,
* urb) of the thread are preloaded to this area before the thread is
* spawned. The registers are individually 8 dwords wide and suitable
* for general usage. Registers holding thread input values are not
* special and may be overwritten.
*
* MRF - message register file. Threads communicate (and terminate)
* by sending messages. Message parameters are placed in contiguous
* MRF registers. All program output is via these messages. URB
* entries are populated by sending a message to the shared URB
* function containing the new data, together with a control word,
* often an unmodified copy of R0.
*
* R0 - GRF register 0. Typically holds control information used when
* sending messages to other threads.
*
* EU or GEN4 EU: The name of the programmable subsystem of the
* i965 hardware. Threads are executed by the EU, the registers
* described above are part of the EU architecture.
*
* Fixed function units:
*
* CS - Command streamer. Notional first unit, little software
* interaction. Holds the URB entries used for constant data, ie the
* CURBEs.
*
* VF/VS - Vertex Fetch / Vertex Shader. The fixed function part of
* this unit is responsible for pulling vertices out of vertex buffers
* in vram and injecting them into the processing pipe as VUEs. If
* enabled, it first passes them to a VS thread which is a good place
* for the driver to implement any active vertex shader.
*
* GS - Geometry Shader. This corresponds to a new DX10 concept. If
* enabled, incoming strips etc are passed to GS threads in individual
* line/triangle/point units. The GS thread may perform arbitrary
* computation and emit whatever primitives with whatever vertices it
* chooses. This makes GS an excellent place to implement GL's
* unfilled polygon modes, though of course it is capable of much
* more. Additionally, GS is used to translate away primitives not
* handled by later units, including Quads and Lineloops.
*
* CLIP - Clipper. Mesa's clipping algorithms are imported to run on
* this unit. The fixed function part performs cliptesting against
* the 6 fixed clipplanes and makes decisions on whether or not the
* incoming primitive needs to be passed to a thread for clipping.
* User clip planes are handled via cooperation with the VS thread.
*
* SF - Strips/Fans or Setup: Triangles are prepared for
* rasterization. Interpolation coefficients are calculated.
* Flatshading and two-sided lighting are usually performed here.
*
* WM - Windower. Interpolation of vertex attributes performed here.
* Fragment shader implemented here. SIMD aspects of EU taken full
* advantage of, as pixels are processed in blocks of 16.
*
* CC - Color Calculator. No EU threads associated with this unit.
* Handles blending and (presumably) depth and stencil testing.
*/
#define BRW_FALLBACK_TEXTURE 0x1
#define BRW_MAX_CURBE (32*16)
struct brw_context;
#define BRW_NEW_URB_FENCE 0x1
#define BRW_NEW_FRAGMENT_PROGRAM 0x2
#define BRW_NEW_VERTEX_PROGRAM 0x4
#define BRW_NEW_INPUT_DIMENSIONS 0x8
#define BRW_NEW_CURBE_OFFSETS 0x10
#define BRW_NEW_REDUCED_PRIMITIVE 0x20
#define BRW_NEW_PRIMITIVE 0x40
#define BRW_NEW_CONTEXT 0x80
#define BRW_NEW_WM_INPUT_DIMENSIONS 0x100
#define BRW_NEW_PSP 0x800
#define BRW_NEW_WM_SURFACES 0x1000
#define BRW_NEW_FENCE 0x2000
#define BRW_NEW_INDICES 0x4000
#define BRW_NEW_VERTICES 0x8000
/**
* Used for any batch entry with a relocated pointer that will be used
* by any 3D rendering.
*/
#define BRW_NEW_BATCH 0x10000
/** brw->depth_region updated */
#define BRW_NEW_DEPTH_BUFFER 0x20000
#define BRW_NEW_NR_WM_SURFACES 0x40000
#define BRW_NEW_NR_VS_SURFACES 0x80000
#define BRW_NEW_INDEX_BUFFER 0x100000
struct brw_state_flags {
/** State update flags signalled by mesa internals */
GLuint mesa;
/**
* State update flags signalled as the result of brw_tracked_state updates
*/
GLuint brw;
/** State update flags signalled by brw_state_cache.c searches */
GLuint cache;
};
/** Subclass of Mesa vertex program */
struct brw_vertex_program {
struct gl_vertex_program program;
GLuint id;
dri_bo *const_buffer; /** Program constant buffer/surface */
GLboolean use_const_buffer;
};
/** Subclass of Mesa fragment program */
struct brw_fragment_program {
struct gl_fragment_program program;
GLuint id; /**< serial no. to identify frag progs, never re-used */
GLboolean isGLSL; /**< really, any IF/LOOP/CONT/BREAK instructions */
dri_bo *const_buffer; /** Program constant buffer/surface */
GLboolean use_const_buffer;
/** for debugging, which texture units are referenced */
GLbitfield tex_units_used;
};
/* Data about a particular attempt to compile a program. Note that
* there can be many of these, each in a different GL state
* corresponding to a different brw_wm_prog_key struct, with different
* compiled programs:
*/
struct brw_wm_prog_data {
GLuint curb_read_length;
GLuint urb_read_length;
GLuint first_curbe_grf;
GLuint total_grf;
GLuint total_scratch;
GLuint nr_params; /**< number of float params/constants */
GLboolean error;
/* Pointer to tracked values (only valid once
* _mesa_load_state_parameters has been called at runtime).
*/
const GLfloat *param[BRW_MAX_CURBE];
};
struct brw_sf_prog_data {
GLuint urb_read_length;
GLuint total_grf;
/* Each vertex may have up to 12 attributes, 4 components each,
* except WPOS which requires only 2. (11*4 + 2) == 46 ==> 12
* rows when rounded up to whole rows.
*
* Actually we use 4 components for each, so call it 12 rows.
*/
GLuint urb_entry_size;
};
struct brw_clip_prog_data {
GLuint curb_read_length; /* user planes? */
GLuint clip_mode;
GLuint urb_read_length;
GLuint total_grf;
};
struct brw_gs_prog_data {
GLuint urb_read_length;
GLuint total_grf;
};
struct brw_vs_prog_data {
GLuint curb_read_length;
GLuint urb_read_length;
GLuint total_grf;
GLuint outputs_written;
GLuint nr_params; /**< number of float params/constants */
GLuint inputs_read;
/* Used for calculating urb partitions:
*/
GLuint urb_entry_size;
};
/* Size == 0 if the output is either not written, or is always [0,0,0,1]
*/
struct brw_vs_ouput_sizes {
GLubyte output_size[VERT_RESULT_MAX];
};
/** Number of texture sampler units */
#define BRW_MAX_TEX_UNIT 16
/**
* Size of our surface binding table for the WM.
* This contains pointers to the drawing surfaces and current texture
* objects and shader constant buffers (+2).
*/
#define BRW_WM_MAX_SURF (MAX_DRAW_BUFFERS + BRW_MAX_TEX_UNIT + 1)
/**
* Helpers to convert drawing buffers, textures and constant buffers
* to surface binding table indexes, for WM.
*/
#define SURF_INDEX_DRAW(d) (d)
#define SURF_INDEX_FRAG_CONST_BUFFER (MAX_DRAW_BUFFERS)
#define SURF_INDEX_TEXTURE(t) (MAX_DRAW_BUFFERS + 1 + (t))
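/* Editor's note: a worked example of the WM binding table layout implied by
* the macros above, assuming MAX_DRAW_BUFFERS == 4 (a core Mesa limit, not
* defined in this file):
*
*    0..3    SURF_INDEX_DRAW(0..3)          draw/render buffers
*    4       SURF_INDEX_FRAG_CONST_BUFFER   fragment constant buffer
*    5..20   SURF_INDEX_TEXTURE(0..15)      texture units
*
* which is why BRW_WM_MAX_SURF is MAX_DRAW_BUFFERS + BRW_MAX_TEX_UNIT + 1.
*/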
/**
* Size of surface binding table for the VS.
* Only one constant buffer for now.
*/
#define BRW_VS_MAX_SURF 1
/**
* Only a VS constant buffer
*/
#define SURF_INDEX_VERT_CONST_BUFFER 0
enum brw_cache_id {
BRW_CC_VP,
BRW_CC_UNIT,
BRW_WM_PROG,
BRW_SAMPLER_DEFAULT_COLOR,
BRW_SAMPLER,
BRW_WM_UNIT,
BRW_SF_PROG,
BRW_SF_VP,
BRW_SF_UNIT,
BRW_VS_UNIT,
BRW_VS_PROG,
BRW_GS_UNIT,
BRW_GS_PROG,
BRW_CLIP_VP,
BRW_CLIP_UNIT,
BRW_CLIP_PROG,
BRW_SS_SURFACE,
BRW_SS_SURF_BIND,
BRW_MAX_CACHE
};
struct brw_cache_item {
/**
* Effectively part of the key, cache_id identifies what kind of state
* buffer is involved, and also which brw->state.dirty.cache flag should
* be set when this cache item is chosen.
*/
enum brw_cache_id cache_id;
/** 32-bit hash of the key data */
GLuint hash;
GLuint key_size; /* for variable-sized keys */
const void *key;
dri_bo **reloc_bufs;
GLuint nr_reloc_bufs;
dri_bo *bo;
GLuint data_size;
struct brw_cache_item *next;
};
struct brw_cache {
struct brw_context *brw;
struct brw_cache_item **items;
GLuint size, n_items;
GLuint key_size[BRW_MAX_CACHE]; /* for fixed-size keys */
GLuint aux_size[BRW_MAX_CACHE];
char *name[BRW_MAX_CACHE];
/* Record of the last BOs chosen for each cache_id. Used to set
* brw->state.dirty.cache when a new cache item is chosen.
*/
dri_bo *last_bo[BRW_MAX_CACHE];
};
/* Considered adding a member to this struct to document which flags
* an update might raise so that ordering of the state atoms can be
* checked or derived at runtime. Dropped the idea in favor of having
* a debug mode where the state is monitored for flags which are
* raised that have already been tested against.
*/
struct brw_tracked_state {
struct brw_state_flags dirty;
void (*prepare)( struct brw_context *brw );
void (*emit)( struct brw_context *brw );
};
/* Flags for brw->state.cache.
*/
#define CACHE_NEW_CC_VP (1<<BRW_CC_VP)
#define CACHE_NEW_CC_UNIT (1<<BRW_CC_UNIT)
#define CACHE_NEW_WM_PROG (1<<BRW_WM_PROG)
#define CACHE_NEW_SAMPLER_DEFAULT_COLOR (1<<BRW_SAMPLER_DEFAULT_COLOR)
#define CACHE_NEW_SAMPLER (1<<BRW_SAMPLER)
#define CACHE_NEW_WM_UNIT (1<<BRW_WM_UNIT)
#define CACHE_NEW_SF_PROG (1<<BRW_SF_PROG)
#define CACHE_NEW_SF_VP (1<<BRW_SF_VP)
#define CACHE_NEW_SF_UNIT (1<<BRW_SF_UNIT)
#define CACHE_NEW_VS_UNIT (1<<BRW_VS_UNIT)
#define CACHE_NEW_VS_PROG (1<<BRW_VS_PROG)
#define CACHE_NEW_GS_UNIT (1<<BRW_GS_UNIT)
#define CACHE_NEW_GS_PROG (1<<BRW_GS_PROG)
#define CACHE_NEW_CLIP_VP (1<<BRW_CLIP_VP)
#define CACHE_NEW_CLIP_UNIT (1<<BRW_CLIP_UNIT)
#define CACHE_NEW_CLIP_PROG (1<<BRW_CLIP_PROG)
#define CACHE_NEW_SURFACE (1<<BRW_SS_SURFACE)
#define CACHE_NEW_SURF_BIND (1<<BRW_SS_SURF_BIND)
struct brw_cached_batch_item {
struct header *header;
GLuint sz;
struct brw_cached_batch_item *next;
};
/* Protect against a future where VERT_ATTRIB_MAX > 32. Wouldn't life
* be easier if C allowed arrays of packed elements?
*/
#define ATTRIB_BIT_DWORDS ((VERT_ATTRIB_MAX+31)/32)
struct brw_vertex_element {
const struct gl_client_array *glarray;
/** The corresponding Mesa vertex attribute */
gl_vert_attrib attrib;
/** Size of a complete element */
GLuint element_size;
/** Number of uploaded elements for this input. */
GLuint count;
/** Byte stride between elements in the uploaded array */
GLuint stride;
/** Offset of the first element within the buffer object */
unsigned int offset;
/** Buffer object containing the uploaded vertex data */
dri_bo *bo;
};
struct brw_vertex_info {
GLuint sizes[ATTRIB_BIT_DWORDS * 2]; /* sizes:2[VERT_ATTRIB_MAX] */
};
/* Cache for TNL programs.
*/
struct brw_tnl_cache_item {
GLuint hash;
void *key;
void *data;
struct brw_tnl_cache_item *next;
};
struct brw_tnl_cache {
struct brw_tnl_cache_item **items;
GLuint size, n_items;
};
struct brw_query_object {
struct gl_query_object Base;
/** Doubly linked list of active query objects in the context. */
struct brw_query_object *prev, *next;
/** Last query BO associated with this query. */
dri_bo *bo;
/** First index in bo with query data for this object. */
int first_index;
/** Last index in bo with query data for this object. */
int last_index;
/* Total count of pixels from previous BOs */
unsigned int count;
};
/**
* brw_context is derived from intel_context.
*/
struct brw_context
{
struct intel_context intel; /**< base class, must be first field */
GLuint primitive;
GLboolean emit_state_always;
GLboolean tmp_fallback;
GLboolean no_batch_wrap;
struct {
struct brw_state_flags dirty;
GLuint nr_color_regions;
struct intel_region *color_regions[MAX_DRAW_BUFFERS];
struct intel_region *depth_region;
/**
* List of buffers accumulated in brw_validate_state to receive
* dri_bo_check_aperture treatment before exec, so we can know if we
* should flush the batch and try again before emitting primitives.
*
* This can be a fixed number as we only have a limited number of
* objects referenced from the batchbuffer in a primitive emit,
* consisting of the vertex buffers, pipelined state pointers,
* the CURBE, the depth buffer, and a query BO.
*/
dri_bo *validated_bos[VERT_ATTRIB_MAX + 16];
int validated_bo_count;
} state;
struct brw_cache cache; /** non-surface items */
struct brw_cache surface_cache; /* surface items */
struct brw_cached_batch_item *cached_batch_items;
struct {
struct brw_vertex_element inputs[VERT_ATTRIB_MAX];
struct brw_vertex_element *enabled[VERT_ATTRIB_MAX];
GLuint nr_enabled;
#define BRW_NR_UPLOAD_BUFS 17
#define BRW_UPLOAD_INIT_SIZE (128*1024)
struct {
dri_bo *bo;
GLuint offset;
} upload;
/* Summary of size and varying of active arrays, so we can check
* for changes to this state:
*/
struct brw_vertex_info info;
unsigned int min_index, max_index;
} vb;
struct {
/**
* Index buffer for this draw_prims call.
*
* Updates are signaled by BRW_NEW_INDICES.
*/
const struct _mesa_index_buffer *ib;
/* Updates to these fields are signaled by BRW_NEW_INDEX_BUFFER. */
dri_bo *bo;
unsigned int offset;
unsigned int size;
/* Offset to index buffer index to use in CMD_3D_PRIM so that we can
* avoid re-uploading the IB packet over and over if we're actually
* referencing the same index buffer.
*/
unsigned int start_vertex_offset;
} ib;
/* Active vertex program:
*/
const struct gl_vertex_program *vertex_program;
const struct gl_fragment_program *fragment_program;
/* For populating the gtt:
*/
GLuint next_free_page;
/* BRW_NEW_URB_ALLOCATIONS:
*/
struct {
GLuint vsize; /* vertex size plus header in urb registers */
GLuint csize; /* constant buffer size in urb registers */
GLuint sfsize; /* setup data size in urb registers */
GLboolean constrained;
GLuint nr_vs_entries;
GLuint nr_gs_entries;
GLuint nr_clip_entries;
GLuint nr_sf_entries;
GLuint nr_cs_entries;
/* GLuint vs_size; */
/* GLuint gs_size; */
/* GLuint clip_size; */
/* GLuint sf_size; */
/* GLuint cs_size; */
GLuint vs_start;
GLuint gs_start;
GLuint clip_start;
GLuint sf_start;
GLuint cs_start;
} urb;
/* BRW_NEW_CURBE_OFFSETS:
*/
struct {
GLuint wm_start; /**< pos of first wm const in CURBE buffer */
GLuint wm_size; /**< number of float[4] consts, multiple of 16 */
GLuint clip_start;
GLuint clip_size;
GLuint vs_start;
GLuint vs_size;
GLuint total_size;
dri_bo *curbe_bo;
/** Offset within curbe_bo of space for current curbe entry */
GLuint curbe_offset;
/** Offset within curbe_bo of space for next curbe entry */
GLuint curbe_next_offset;
GLfloat *last_buf;
GLuint last_bufsz;
/**
* Whether we should create a new bo instead of reusing the old one
* (if we just dispatched the batch pointing at the old one).
*/
GLboolean need_new_bo;
} curbe;
struct {
struct brw_vs_prog_data *prog_data;
dri_bo *prog_bo;
dri_bo *state_bo;
/** Binding table of pointers to surf_bo entries */
dri_bo *bind_bo;
dri_bo *surf_bo[BRW_VS_MAX_SURF];
GLuint nr_surfaces;
} vs;
struct {
struct brw_gs_prog_data *prog_data;
GLboolean prog_active;
dri_bo *prog_bo;
dri_bo *state_bo;
} gs;
struct {
struct brw_clip_prog_data *prog_data;
dri_bo *prog_bo;
dri_bo *state_bo;
dri_bo *vp_bo;
} clip;
struct {
struct brw_sf_prog_data *prog_data;
dri_bo *prog_bo;
dri_bo *state_bo;
dri_bo *vp_bo;
} sf;
struct {
struct brw_wm_prog_data *prog_data;
struct brw_wm_compile *compile_data;
/** Input sizes, calculated from active vertex program.
* One bit per fragment program input attribute.
*/
GLbitfield input_size_masks[4];
/** Array of surface default colors (texture border color) */
dri_bo *sdc_bo[BRW_MAX_TEX_UNIT];
GLuint render_surf;
GLuint nr_surfaces;
GLuint max_threads;
dri_bo *scratch_bo;
GLuint sampler_count;
dri_bo *sampler_bo;
/** Binding table of pointers to surf_bo entries */
dri_bo *bind_bo;
dri_bo *surf_bo[BRW_WM_MAX_SURF];
dri_bo *prog_bo;
dri_bo *state_bo;
} wm;
struct {
dri_bo *prog_bo;
dri_bo *state_bo;
dri_bo *vp_bo;
} cc;
struct {
struct brw_query_object active_head;
dri_bo *bo;
int index;
GLboolean active;
} query;
/* Used to give every program string a unique id
*/
GLuint program_id;
};
#define BRW_PACKCOLOR8888(r,g,b,a) ((r<<24) | (g<<16) | (b<<8) | a)
/*======================================================================
* brw_vtbl.c
*/
void brwInitVtbl( struct brw_context *brw );
/*======================================================================
* brw_context.c
*/
GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
__DRIcontextPrivate *driContextPriv,
void *sharedContextPrivate);
/*======================================================================
* brw_queryobj.c
*/
void brw_init_queryobj_functions(struct dd_function_table *functions);
void brw_prepare_query_begin(struct brw_context *brw);
void brw_emit_query_begin(struct brw_context *brw);
void brw_emit_query_end(struct brw_context *brw);
/*======================================================================
* brw_state_dump.c
*/
void brw_debug_batch(struct intel_context *intel);
/*======================================================================
* brw_tex.c
*/
void brw_validate_textures( struct brw_context *brw );
/*======================================================================
* brw_program.c
*/
void brwInitFragProgFuncs( struct dd_function_table *functions );
/* brw_urb.c
*/
void brw_upload_urb_fence(struct brw_context *brw);
/* brw_curbe.c
*/
void brw_upload_cs_urb_state(struct brw_context *brw);
/* brw_disasm.c */
int brw_disasm (FILE *file, struct brw_instruction *inst);
/*======================================================================
* Inline conversion functions. These are better-typed than the
* macros used previously:
*/
static INLINE struct brw_context *
brw_context( GLcontext *ctx )
{
return (struct brw_context *)ctx;
}
static INLINE struct brw_vertex_program *
brw_vertex_program(struct gl_vertex_program *p)
{
return (struct brw_vertex_program *) p;
}
static INLINE const struct brw_vertex_program *
brw_vertex_program_const(const struct gl_vertex_program *p)
{
return (const struct brw_vertex_program *) p;
}
static INLINE struct brw_fragment_program *
brw_fragment_program(struct gl_fragment_program *p)
{
return (struct brw_fragment_program *) p;
}
static INLINE const struct brw_fragment_program *
brw_fragment_program_const(const struct gl_fragment_program *p)
{
return (const struct brw_fragment_program *) p;
}
#define DO_SETUP_BITS ((1<<(FRAG_ATTRIB_MAX)) - 1)
#endif

View File

@ -0,0 +1,376 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "main/glheader.h"
#include "main/context.h"
#include "main/macros.h"
#include "main/enums.h"
#include "shader/prog_parameter.h"
#include "shader/prog_print.h"
#include "shader/prog_statevars.h"
#include "intel_batchbuffer.h"
#include "intel_regions.h"
#include "brw_context.h"
#include "brw_defines.h"
#include "brw_state.h"
#include "brw_util.h"
/**
* Partition the CURBE between the various users of constant values:
* Note that vertex and fragment shaders can now fetch constants out
* of constant buffers. We no longer allocate a block of the GRF for
* constants. That greatly reduces the demand for space in the CURBE.
* Some of the comments within are dated...
*/
static void calculate_curbe_offsets( struct brw_context *brw )
{
GLcontext *ctx = &brw->intel.ctx;
/* CACHE_NEW_WM_PROG */
const GLuint nr_fp_regs = (brw->wm.prog_data->nr_params + 15) / 16;
/* BRW_NEW_VERTEX_PROGRAM */
const GLuint nr_vp_regs = (brw->vs.prog_data->nr_params + 15) / 16;
GLuint nr_clip_regs = 0;
GLuint total_regs;
/* _NEW_TRANSFORM */
if (ctx->Transform.ClipPlanesEnabled) {
GLuint nr_planes = 6 + brw_count_bits(ctx->Transform.ClipPlanesEnabled);
nr_clip_regs = (nr_planes * 4 + 15) / 16;
}
total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs;
/* This can happen - what to do? Probably rather than falling
* back, the best thing to do is emit programs which code the
* constants as immediate values. Could do this either as a static
* cap on WM and VS, or adaptively.
*
* Unfortunately, this is currently dependent on the results of the
* program generation process (in the case of wm), so this would
* introduce the need to re-generate programs in the event of a
* curbe allocation failure.
*/
/* Max size is 32 - just large enough to
* hold the 128 parameters allowed by
* the fragment and vertex program
* APIs. It's not clear what happens
* when both VP and FP want to use 128
* parameters, though.
*/
assert(total_regs <= 32);
/* Lazy resize:
*/
if (nr_fp_regs > brw->curbe.wm_size ||
nr_vp_regs > brw->curbe.vs_size ||
nr_clip_regs != brw->curbe.clip_size ||
(total_regs < brw->curbe.total_size / 4 &&
brw->curbe.total_size > 16)) {
GLuint reg = 0;
/* Calculate a new layout:
*/
reg = 0;
brw->curbe.wm_start = reg;
brw->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs;
brw->curbe.clip_start = reg;
brw->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs;
brw->curbe.vs_start = reg;
brw->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs;
brw->curbe.total_size = reg;
if (0)
_mesa_printf("curbe wm %d+%d clip %d+%d vs %d+%d\n",
brw->curbe.wm_start,
brw->curbe.wm_size,
brw->curbe.clip_start,
brw->curbe.clip_size,
brw->curbe.vs_start,
brw->curbe.vs_size );
brw->state.dirty.brw |= BRW_NEW_CURBE_OFFSETS;
}
}
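/* Editor's note: a worked example of the layout computed above; the program
* sizes are made up for illustration. With a fragment program using 20
* float params, a vertex program using 40, and two user clip planes
* enabled:
*
*    nr_fp_regs   = (20 + 15) / 16          = 2
*    nr_vp_regs   = (40 + 15) / 16          = 3
*    nr_clip_regs = ((6 + 2) * 4 + 15) / 16 = 2
*
*    wm_start = 0, wm_size = 2; clip_start = 2, clip_size = 2;
*    vs_start = 4, vs_size = 3; total_size = 7 (well under the limit of 32).
*/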
const struct brw_tracked_state brw_curbe_offsets = {
.dirty = {
.mesa = _NEW_TRANSFORM,
.brw = BRW_NEW_VERTEX_PROGRAM,
.cache = CACHE_NEW_WM_PROG
},
.prepare = calculate_curbe_offsets
};
/* Define the number of curbes within CS's urb allocation. Multiple
* urb entries -> multiple curbes. These will be used by
* fixed-function hardware in a double-buffering scheme to avoid a
* pipeline stall each time the contents of the curbe is changed.
*/
void brw_upload_cs_urb_state(struct brw_context *brw)
{
struct brw_cs_urb_state cs_urb;
memset(&cs_urb, 0, sizeof(cs_urb));
/* It appears that this is the state packet for the CS unit, ie. the
* urb entries detailed here are housed in the CS range from the
* URB_FENCE command.
*/
cs_urb.header.opcode = CMD_CS_URB_STATE;
cs_urb.header.length = sizeof(cs_urb)/4 - 2;
/* BRW_NEW_URB_FENCE */
cs_urb.bits0.nr_urb_entries = brw->urb.nr_cs_entries;
cs_urb.bits0.urb_entry_size = brw->urb.csize - 1;
assert(brw->urb.nr_cs_entries);
BRW_CACHED_BATCH_STRUCT(brw, &cs_urb);
}
static GLfloat fixed_plane[6][4] = {
{ 0, 0, -1, 1 },
{ 0, 0, 1, 1 },
{ 0, -1, 0, 1 },
{ 0, 1, 0, 1 },
{-1, 0, 0, 1 },
{ 1, 0, 0, 1 }
};
/* Upload a new set of constants. Too much variability to go into the
* cache mechanism, but maybe would benefit from a comparison against
* the current uploaded set of constants.
*/
static void prepare_constant_buffer(struct brw_context *brw)
{
GLcontext *ctx = &brw->intel.ctx;
const struct brw_vertex_program *vp =
brw_vertex_program_const(brw->vertex_program);
const struct brw_fragment_program *fp =
brw_fragment_program_const(brw->fragment_program);
const GLuint sz = brw->curbe.total_size;
const GLuint bufsz = sz * 16 * sizeof(GLfloat);
GLfloat *buf;
GLuint i;
if (sz == 0) {
if (brw->curbe.last_buf) {
free(brw->curbe.last_buf);
brw->curbe.last_buf = NULL;
brw->curbe.last_bufsz = 0;
}
return;
}
buf = (GLfloat *) _mesa_calloc(bufsz);
/* fragment shader constants */
if (brw->curbe.wm_size) {
GLuint offset = brw->curbe.wm_start * 16;
_mesa_load_state_parameters(ctx, fp->program.Base.Parameters);
/* copy float constants */
for (i = 0; i < brw->wm.prog_data->nr_params; i++)
buf[offset + i] = *brw->wm.prog_data->param[i];
}
/* The clipplanes are actually delivered to both CLIP and VS units.
* VS uses them to calculate the outcode bitmasks.
*/
if (brw->curbe.clip_size) {
GLuint offset = brw->curbe.clip_start * 16;
GLuint j;
/* If any planes are going this way, send them all this way:
*/
for (i = 0; i < 6; i++) {
buf[offset + i * 4 + 0] = fixed_plane[i][0];
buf[offset + i * 4 + 1] = fixed_plane[i][1];
buf[offset + i * 4 + 2] = fixed_plane[i][2];
buf[offset + i * 4 + 3] = fixed_plane[i][3];
}
/* Clip planes: _NEW_TRANSFORM plus _NEW_PROJECTION to get to
* clip-space:
*/
assert(MAX_CLIP_PLANES == 6);
for (j = 0; j < MAX_CLIP_PLANES; j++) {
if (ctx->Transform.ClipPlanesEnabled & (1<<j)) {
buf[offset + i * 4 + 0] = ctx->Transform._ClipUserPlane[j][0];
buf[offset + i * 4 + 1] = ctx->Transform._ClipUserPlane[j][1];
buf[offset + i * 4 + 2] = ctx->Transform._ClipUserPlane[j][2];
buf[offset + i * 4 + 3] = ctx->Transform._ClipUserPlane[j][3];
i++;
}
}
}
/* vertex shader constants */
if (brw->curbe.vs_size) {
GLuint offset = brw->curbe.vs_start * 16;
GLuint nr = brw->vs.prog_data->nr_params / 4;
if (brw->vertex_program->IsNVProgram)
_mesa_load_tracked_matrices(ctx);
/* Updates the ParameterValues[i] pointers for all parameters of the
* basic type of PROGRAM_STATE_VAR.
*/
_mesa_load_state_parameters(ctx, vp->program.Base.Parameters);
/* XXX just use a memcpy here */
for (i = 0; i < nr; i++) {
const GLfloat *value = vp->program.Base.Parameters->ParameterValues[i];
buf[offset + i * 4 + 0] = value[0];
buf[offset + i * 4 + 1] = value[1];
buf[offset + i * 4 + 2] = value[2];
buf[offset + i * 4 + 3] = value[3];
}
}
if (0) {
for (i = 0; i < sz*16; i+=4)
_mesa_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4,
buf[i+0], buf[i+1], buf[i+2], buf[i+3]);
_mesa_printf("last_buf %p buf %p sz %d/%d cmp %d\n",
brw->curbe.last_buf, buf,
bufsz, brw->curbe.last_bufsz,
brw->curbe.last_buf ? memcmp(buf, brw->curbe.last_buf, bufsz) : -1);
}
if (brw->curbe.curbe_bo != NULL &&
brw->curbe.last_buf &&
bufsz == brw->curbe.last_bufsz &&
memcmp(buf, brw->curbe.last_buf, bufsz) == 0) {
/* constants have not changed */
_mesa_free(buf);
}
else {
/* constants have changed */
if (brw->curbe.last_buf)
_mesa_free(brw->curbe.last_buf);
brw->curbe.last_buf = buf;
brw->curbe.last_bufsz = bufsz;
if (brw->curbe.curbe_bo != NULL &&
(brw->curbe.need_new_bo ||
brw->curbe.curbe_next_offset + bufsz > brw->curbe.curbe_bo->size))
{
dri_bo_unreference(brw->curbe.curbe_bo);
brw->curbe.curbe_bo = NULL;
}
if (brw->curbe.curbe_bo == NULL) {
/* Allocate a single page for CURBE entries for this batchbuffer.
* They're generally around 64b.
*/
brw->curbe.curbe_bo = dri_bo_alloc(brw->intel.bufmgr, "CURBE",
4096, 1 << 6);
brw->curbe.curbe_next_offset = 0;
}
brw->curbe.curbe_offset = brw->curbe.curbe_next_offset;
brw->curbe.curbe_next_offset += bufsz;
brw->curbe.curbe_next_offset = ALIGN(brw->curbe.curbe_next_offset, 64);
/* Copy data to the buffer:
*/
dri_bo_subdata(brw->curbe.curbe_bo, brw->curbe.curbe_offset, bufsz, buf);
}
brw_add_validated_bo(brw, brw->curbe.curbe_bo);
/* Because this provokes an action (ie copy the constants into the
* URB), it shouldn't be shortcircuited if identical to the
* previous time - because e.g. the urb destination may have
* changed, or the urb contents may differ from last time.
*
* Note that the data referred to is actually copied internally,
* not just used in place according to passed pointer.
*
* It appears that the CS unit takes care of using each available
* URB entry (Const URB Entry == CURBE) in turn, and issuing
* flushes as necessary when doublebuffering of CURBEs isn't
* possible.
*/
}
static void emit_constant_buffer(struct brw_context *brw)
{
struct intel_context *intel = &brw->intel;
GLuint sz = brw->curbe.total_size;
BEGIN_BATCH(2, IGNORE_CLIPRECTS);
if (sz == 0) {
OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2));
OUT_BATCH(0);
} else {
OUT_BATCH((CMD_CONST_BUFFER << 16) | (1 << 8) | (2 - 2));
OUT_RELOC(brw->curbe.curbe_bo,
I915_GEM_DOMAIN_INSTRUCTION, 0,
(sz - 1) + brw->curbe.curbe_offset);
}
ADVANCE_BATCH();
}
/* This tracked state is unique in that the state it monitors varies
* dynamically depending on the parameters tracked by the fragment and
* vertex programs. This is the template used as a starting point,
* each context will maintain a copy of this internally and update as
* required.
*/
const struct brw_tracked_state brw_constant_buffer = {
.dirty = {
.mesa = _NEW_PROGRAM_CONSTANTS,
.brw = (BRW_NEW_FRAGMENT_PROGRAM |
BRW_NEW_VERTEX_PROGRAM |
BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */
BRW_NEW_PSP | /* Implicit - hardware requires this, not used above */
BRW_NEW_CURBE_OFFSETS |
BRW_NEW_BATCH),
.cache = (CACHE_NEW_WM_PROG)
},
.prepare = prepare_constant_buffer,
.emit = emit_constant_buffer,
};

View File

@ -0,0 +1,851 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#ifndef BRW_DEFINES_H
#define BRW_DEFINES_H
/* 3D state:
*/
#define _3DOP_3DSTATE_PIPELINED 0x0
#define _3DOP_3DSTATE_NONPIPELINED 0x1
#define _3DOP_3DCONTROL 0x2
#define _3DOP_3DPRIMITIVE 0x3
#define _3DSTATE_PIPELINED_POINTERS 0x00
#define _3DSTATE_BINDING_TABLE_POINTERS 0x01
#define _3DSTATE_VERTEX_BUFFERS 0x08
#define _3DSTATE_VERTEX_ELEMENTS 0x09
#define _3DSTATE_INDEX_BUFFER 0x0A
#define _3DSTATE_VF_STATISTICS 0x0B
#define _3DSTATE_DRAWING_RECTANGLE 0x00
#define _3DSTATE_CONSTANT_COLOR 0x01
#define _3DSTATE_SAMPLER_PALETTE_LOAD 0x02
#define _3DSTATE_CHROMA_KEY 0x04
#define _3DSTATE_DEPTH_BUFFER 0x05
#define _3DSTATE_POLY_STIPPLE_OFFSET 0x06
#define _3DSTATE_POLY_STIPPLE_PATTERN 0x07
#define _3DSTATE_LINE_STIPPLE 0x08
#define _3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP 0x09
#define _3DCONTROL 0x00
#define PIPE_CONTROL_NOWRITE 0x00
#define PIPE_CONTROL_WRITEIMMEDIATE 0x01
#define PIPE_CONTROL_WRITEDEPTH 0x02
#define PIPE_CONTROL_WRITETIMESTAMP 0x03
#define PIPE_CONTROL_GTTWRITE_PROCESS_LOCAL 0x00
#define PIPE_CONTROL_GTTWRITE_GLOBAL 0x01
#define _3DPRIM_POINTLIST 0x01
#define _3DPRIM_LINELIST 0x02
#define _3DPRIM_LINESTRIP 0x03
#define _3DPRIM_TRILIST 0x04
#define _3DPRIM_TRISTRIP 0x05
#define _3DPRIM_TRIFAN 0x06
#define _3DPRIM_QUADLIST 0x07
#define _3DPRIM_QUADSTRIP 0x08
#define _3DPRIM_LINELIST_ADJ 0x09
#define _3DPRIM_LINESTRIP_ADJ 0x0A
#define _3DPRIM_TRILIST_ADJ 0x0B
#define _3DPRIM_TRISTRIP_ADJ 0x0C
#define _3DPRIM_TRISTRIP_REVERSE 0x0D
#define _3DPRIM_POLYGON 0x0E
#define _3DPRIM_RECTLIST 0x0F
#define _3DPRIM_LINELOOP 0x10
#define _3DPRIM_POINTLIST_BF 0x11
#define _3DPRIM_LINESTRIP_CONT 0x12
#define _3DPRIM_LINESTRIP_BF 0x13
#define _3DPRIM_LINESTRIP_CONT_BF 0x14
#define _3DPRIM_TRIFAN_NOSTIPPLE 0x15
#define _3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL 0
#define _3DPRIM_VERTEXBUFFER_ACCESS_RANDOM 1
#define BRW_ANISORATIO_2 0
#define BRW_ANISORATIO_4 1
#define BRW_ANISORATIO_6 2
#define BRW_ANISORATIO_8 3
#define BRW_ANISORATIO_10 4
#define BRW_ANISORATIO_12 5
#define BRW_ANISORATIO_14 6
#define BRW_ANISORATIO_16 7
#define BRW_BLENDFACTOR_ONE 0x1
#define BRW_BLENDFACTOR_SRC_COLOR 0x2
#define BRW_BLENDFACTOR_SRC_ALPHA 0x3
#define BRW_BLENDFACTOR_DST_ALPHA 0x4
#define BRW_BLENDFACTOR_DST_COLOR 0x5
#define BRW_BLENDFACTOR_SRC_ALPHA_SATURATE 0x6
#define BRW_BLENDFACTOR_CONST_COLOR 0x7
#define BRW_BLENDFACTOR_CONST_ALPHA 0x8
#define BRW_BLENDFACTOR_SRC1_COLOR 0x9
#define BRW_BLENDFACTOR_SRC1_ALPHA 0x0A
#define BRW_BLENDFACTOR_ZERO 0x11
#define BRW_BLENDFACTOR_INV_SRC_COLOR 0x12
#define BRW_BLENDFACTOR_INV_SRC_ALPHA 0x13
#define BRW_BLENDFACTOR_INV_DST_ALPHA 0x14
#define BRW_BLENDFACTOR_INV_DST_COLOR 0x15
#define BRW_BLENDFACTOR_INV_CONST_COLOR 0x17
#define BRW_BLENDFACTOR_INV_CONST_ALPHA 0x18
#define BRW_BLENDFACTOR_INV_SRC1_COLOR 0x19
#define BRW_BLENDFACTOR_INV_SRC1_ALPHA 0x1A
#define BRW_BLENDFUNCTION_ADD 0
#define BRW_BLENDFUNCTION_SUBTRACT 1
#define BRW_BLENDFUNCTION_REVERSE_SUBTRACT 2
#define BRW_BLENDFUNCTION_MIN 3
#define BRW_BLENDFUNCTION_MAX 4
#define BRW_ALPHATEST_FORMAT_UNORM8 0
#define BRW_ALPHATEST_FORMAT_FLOAT32 1
#define BRW_CHROMAKEY_KILL_ON_ANY_MATCH 0
#define BRW_CHROMAKEY_REPLACE_BLACK 1
#define BRW_CLIP_API_OGL 0
#define BRW_CLIP_API_DX 1
#define BRW_CLIPMODE_NORMAL 0
#define BRW_CLIPMODE_CLIP_ALL 1
#define BRW_CLIPMODE_CLIP_NON_REJECTED 2
#define BRW_CLIPMODE_REJECT_ALL 3
#define BRW_CLIPMODE_ACCEPT_ALL 4
#define BRW_CLIPMODE_KERNEL_CLIP 5
#define BRW_CLIP_NDCSPACE 0
#define BRW_CLIP_SCREENSPACE 1
#define BRW_COMPAREFUNCTION_ALWAYS 0
#define BRW_COMPAREFUNCTION_NEVER 1
#define BRW_COMPAREFUNCTION_LESS 2
#define BRW_COMPAREFUNCTION_EQUAL 3
#define BRW_COMPAREFUNCTION_LEQUAL 4
#define BRW_COMPAREFUNCTION_GREATER 5
#define BRW_COMPAREFUNCTION_NOTEQUAL 6
#define BRW_COMPAREFUNCTION_GEQUAL 7
#define BRW_COVERAGE_PIXELS_HALF 0
#define BRW_COVERAGE_PIXELS_1 1
#define BRW_COVERAGE_PIXELS_2 2
#define BRW_COVERAGE_PIXELS_4 3
#define BRW_CULLMODE_BOTH 0
#define BRW_CULLMODE_NONE 1
#define BRW_CULLMODE_FRONT 2
#define BRW_CULLMODE_BACK 3
#define BRW_DEFAULTCOLOR_R8G8B8A8_UNORM 0
#define BRW_DEFAULTCOLOR_R32G32B32A32_FLOAT 1
#define BRW_DEPTHFORMAT_D32_FLOAT_S8X24_UINT 0
#define BRW_DEPTHFORMAT_D32_FLOAT 1
#define BRW_DEPTHFORMAT_D24_UNORM_S8_UINT 2
#define BRW_DEPTHFORMAT_D16_UNORM 5
#define BRW_FLOATING_POINT_IEEE_754 0
#define BRW_FLOATING_POINT_NON_IEEE_754 1
#define BRW_FRONTWINDING_CW 0
#define BRW_FRONTWINDING_CCW 1
#define BRW_SPRITE_POINT_ENABLE 16
#define BRW_INDEX_BYTE 0
#define BRW_INDEX_WORD 1
#define BRW_INDEX_DWORD 2
#define BRW_LOGICOPFUNCTION_CLEAR 0
#define BRW_LOGICOPFUNCTION_NOR 1
#define BRW_LOGICOPFUNCTION_AND_INVERTED 2
#define BRW_LOGICOPFUNCTION_COPY_INVERTED 3
#define BRW_LOGICOPFUNCTION_AND_REVERSE 4
#define BRW_LOGICOPFUNCTION_INVERT 5
#define BRW_LOGICOPFUNCTION_XOR 6
#define BRW_LOGICOPFUNCTION_NAND 7
#define BRW_LOGICOPFUNCTION_AND 8
#define BRW_LOGICOPFUNCTION_EQUIV 9
#define BRW_LOGICOPFUNCTION_NOOP 10
#define BRW_LOGICOPFUNCTION_OR_INVERTED 11
#define BRW_LOGICOPFUNCTION_COPY 12
#define BRW_LOGICOPFUNCTION_OR_REVERSE 13
#define BRW_LOGICOPFUNCTION_OR 14
#define BRW_LOGICOPFUNCTION_SET 15
#define BRW_MAPFILTER_NEAREST 0x0
#define BRW_MAPFILTER_LINEAR 0x1
#define BRW_MAPFILTER_ANISOTROPIC 0x2
#define BRW_MIPFILTER_NONE 0
#define BRW_MIPFILTER_NEAREST 1
#define BRW_MIPFILTER_LINEAR 3
#define BRW_POLYGON_FRONT_FACING 0
#define BRW_POLYGON_BACK_FACING 1
#define BRW_PREFILTER_ALWAYS 0x0
#define BRW_PREFILTER_NEVER 0x1
#define BRW_PREFILTER_LESS 0x2
#define BRW_PREFILTER_EQUAL 0x3
#define BRW_PREFILTER_LEQUAL 0x4
#define BRW_PREFILTER_GREATER 0x5
#define BRW_PREFILTER_NOTEQUAL 0x6
#define BRW_PREFILTER_GEQUAL 0x7
#define BRW_PROVOKING_VERTEX_0 0
#define BRW_PROVOKING_VERTEX_1 1
#define BRW_PROVOKING_VERTEX_2 2
#define BRW_RASTRULE_UPPER_LEFT 0
#define BRW_RASTRULE_UPPER_RIGHT 1
/* These are listed as "Reserved, but not seen as useful"
* in Intel documentation (page 212, "Point Rasterization Rule",
* section 7.4 "SF Pipeline State Summary", of document
* "Intel® 965 Express Chipset Family and Intel® G35 Express
* Chipset Graphics Controller Programmer's Reference Manual,
* Volume 2: 3D/Media", Revision 1.0b as of January 2008,
* available at
* http://intellinuxgraphics.org/documentation.html
* at the time of this writing).
*
* These appear to be supported on at least some
* i965-family devices, and the BRW_RASTRULE_LOWER_RIGHT
* is useful when using OpenGL to render to an FBO
* (which has the pixel coordinate Y orientation inverted
* with respect to the normal OpenGL pixel coordinate system).
*/
#define BRW_RASTRULE_LOWER_LEFT 2
#define BRW_RASTRULE_LOWER_RIGHT 3
#define BRW_RENDERTARGET_CLAMPRANGE_UNORM 0
#define BRW_RENDERTARGET_CLAMPRANGE_SNORM 1
#define BRW_RENDERTARGET_CLAMPRANGE_FORMAT 2
#define BRW_STENCILOP_KEEP 0
#define BRW_STENCILOP_ZERO 1
#define BRW_STENCILOP_REPLACE 2
#define BRW_STENCILOP_INCRSAT 3
#define BRW_STENCILOP_DECRSAT 4
#define BRW_STENCILOP_INCR 5
#define BRW_STENCILOP_DECR 6
#define BRW_STENCILOP_INVERT 7
#define BRW_SURFACE_MIPMAPLAYOUT_BELOW 0
#define BRW_SURFACE_MIPMAPLAYOUT_RIGHT 1
#define BRW_SURFACEFORMAT_R32G32B32A32_FLOAT 0x000
#define BRW_SURFACEFORMAT_R32G32B32A32_SINT 0x001
#define BRW_SURFACEFORMAT_R32G32B32A32_UINT 0x002
#define BRW_SURFACEFORMAT_R32G32B32A32_UNORM 0x003
#define BRW_SURFACEFORMAT_R32G32B32A32_SNORM 0x004
#define BRW_SURFACEFORMAT_R64G64_FLOAT 0x005
#define BRW_SURFACEFORMAT_R32G32B32X32_FLOAT 0x006
#define BRW_SURFACEFORMAT_R32G32B32A32_SSCALED 0x007
#define BRW_SURFACEFORMAT_R32G32B32A32_USCALED 0x008
#define BRW_SURFACEFORMAT_R32G32B32_FLOAT 0x040
#define BRW_SURFACEFORMAT_R32G32B32_SINT 0x041
#define BRW_SURFACEFORMAT_R32G32B32_UINT 0x042
#define BRW_SURFACEFORMAT_R32G32B32_UNORM 0x043
#define BRW_SURFACEFORMAT_R32G32B32_SNORM 0x044
#define BRW_SURFACEFORMAT_R32G32B32_SSCALED 0x045
#define BRW_SURFACEFORMAT_R32G32B32_USCALED 0x046
#define BRW_SURFACEFORMAT_R16G16B16A16_UNORM 0x080
#define BRW_SURFACEFORMAT_R16G16B16A16_SNORM 0x081
#define BRW_SURFACEFORMAT_R16G16B16A16_SINT 0x082
#define BRW_SURFACEFORMAT_R16G16B16A16_UINT 0x083
#define BRW_SURFACEFORMAT_R16G16B16A16_FLOAT 0x084
#define BRW_SURFACEFORMAT_R32G32_FLOAT 0x085
#define BRW_SURFACEFORMAT_R32G32_SINT 0x086
#define BRW_SURFACEFORMAT_R32G32_UINT 0x087
#define BRW_SURFACEFORMAT_R32_FLOAT_X8X24_TYPELESS 0x088
#define BRW_SURFACEFORMAT_X32_TYPELESS_G8X24_UINT 0x089
#define BRW_SURFACEFORMAT_L32A32_FLOAT 0x08A
#define BRW_SURFACEFORMAT_R32G32_UNORM 0x08B
#define BRW_SURFACEFORMAT_R32G32_SNORM 0x08C
#define BRW_SURFACEFORMAT_R64_FLOAT 0x08D
#define BRW_SURFACEFORMAT_R16G16B16X16_UNORM 0x08E
#define BRW_SURFACEFORMAT_R16G16B16X16_FLOAT 0x08F
#define BRW_SURFACEFORMAT_A32X32_FLOAT 0x090
#define BRW_SURFACEFORMAT_L32X32_FLOAT 0x091
#define BRW_SURFACEFORMAT_I32X32_FLOAT 0x092
#define BRW_SURFACEFORMAT_R16G16B16A16_SSCALED 0x093
#define BRW_SURFACEFORMAT_R16G16B16A16_USCALED 0x094
#define BRW_SURFACEFORMAT_R32G32_SSCALED 0x095
#define BRW_SURFACEFORMAT_R32G32_USCALED 0x096
#define BRW_SURFACEFORMAT_B8G8R8A8_UNORM 0x0C0
#define BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB 0x0C1
#define BRW_SURFACEFORMAT_R10G10B10A2_UNORM 0x0C2
#define BRW_SURFACEFORMAT_R10G10B10A2_UNORM_SRGB 0x0C3
#define BRW_SURFACEFORMAT_R10G10B10A2_UINT 0x0C4
#define BRW_SURFACEFORMAT_R10G10B10_SNORM_A2_UNORM 0x0C5
#define BRW_SURFACEFORMAT_R8G8B8A8_UNORM 0x0C7
#define BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB 0x0C8
#define BRW_SURFACEFORMAT_R8G8B8A8_SNORM 0x0C9
#define BRW_SURFACEFORMAT_R8G8B8A8_SINT 0x0CA
#define BRW_SURFACEFORMAT_R8G8B8A8_UINT 0x0CB
#define BRW_SURFACEFORMAT_R16G16_UNORM 0x0CC
#define BRW_SURFACEFORMAT_R16G16_SNORM 0x0CD
#define BRW_SURFACEFORMAT_R16G16_SINT 0x0CE
#define BRW_SURFACEFORMAT_R16G16_UINT 0x0CF
#define BRW_SURFACEFORMAT_R16G16_FLOAT 0x0D0
#define BRW_SURFACEFORMAT_B10G10R10A2_UNORM 0x0D1
#define BRW_SURFACEFORMAT_B10G10R10A2_UNORM_SRGB 0x0D2
#define BRW_SURFACEFORMAT_R11G11B10_FLOAT 0x0D3
#define BRW_SURFACEFORMAT_R32_SINT 0x0D6
#define BRW_SURFACEFORMAT_R32_UINT 0x0D7
#define BRW_SURFACEFORMAT_R32_FLOAT 0x0D8
#define BRW_SURFACEFORMAT_R24_UNORM_X8_TYPELESS 0x0D9
#define BRW_SURFACEFORMAT_X24_TYPELESS_G8_UINT 0x0DA
#define BRW_SURFACEFORMAT_L16A16_UNORM 0x0DF
#define BRW_SURFACEFORMAT_I24X8_UNORM 0x0E0
#define BRW_SURFACEFORMAT_L24X8_UNORM 0x0E1
#define BRW_SURFACEFORMAT_A24X8_UNORM 0x0E2
#define BRW_SURFACEFORMAT_I32_FLOAT 0x0E3
#define BRW_SURFACEFORMAT_L32_FLOAT 0x0E4
#define BRW_SURFACEFORMAT_A32_FLOAT 0x0E5
#define BRW_SURFACEFORMAT_B8G8R8X8_UNORM 0x0E9
#define BRW_SURFACEFORMAT_B8G8R8X8_UNORM_SRGB 0x0EA
#define BRW_SURFACEFORMAT_R8G8B8X8_UNORM 0x0EB
#define BRW_SURFACEFORMAT_R8G8B8X8_UNORM_SRGB 0x0EC
#define BRW_SURFACEFORMAT_R9G9B9E5_SHAREDEXP 0x0ED
#define BRW_SURFACEFORMAT_B10G10R10X2_UNORM 0x0EE
#define BRW_SURFACEFORMAT_L16A16_FLOAT 0x0F0
#define BRW_SURFACEFORMAT_R32_UNORM 0x0F1
#define BRW_SURFACEFORMAT_R32_SNORM 0x0F2
#define BRW_SURFACEFORMAT_R10G10B10X2_USCALED 0x0F3
#define BRW_SURFACEFORMAT_R8G8B8A8_SSCALED 0x0F4
#define BRW_SURFACEFORMAT_R8G8B8A8_USCALED 0x0F5
#define BRW_SURFACEFORMAT_R16G16_SSCALED 0x0F6
#define BRW_SURFACEFORMAT_R16G16_USCALED 0x0F7
#define BRW_SURFACEFORMAT_R32_SSCALED 0x0F8
#define BRW_SURFACEFORMAT_R32_USCALED 0x0F9
#define BRW_SURFACEFORMAT_B5G6R5_UNORM 0x100
#define BRW_SURFACEFORMAT_B5G6R5_UNORM_SRGB 0x101
#define BRW_SURFACEFORMAT_B5G5R5A1_UNORM 0x102
#define BRW_SURFACEFORMAT_B5G5R5A1_UNORM_SRGB 0x103
#define BRW_SURFACEFORMAT_B4G4R4A4_UNORM 0x104
#define BRW_SURFACEFORMAT_B4G4R4A4_UNORM_SRGB 0x105
#define BRW_SURFACEFORMAT_R8G8_UNORM 0x106
#define BRW_SURFACEFORMAT_R8G8_SNORM 0x107
#define BRW_SURFACEFORMAT_R8G8_SINT 0x108
#define BRW_SURFACEFORMAT_R8G8_UINT 0x109
#define BRW_SURFACEFORMAT_R16_UNORM 0x10A
#define BRW_SURFACEFORMAT_R16_SNORM 0x10B
#define BRW_SURFACEFORMAT_R16_SINT 0x10C
#define BRW_SURFACEFORMAT_R16_UINT 0x10D
#define BRW_SURFACEFORMAT_R16_FLOAT 0x10E
#define BRW_SURFACEFORMAT_I16_UNORM 0x111
#define BRW_SURFACEFORMAT_L16_UNORM 0x112
#define BRW_SURFACEFORMAT_A16_UNORM 0x113
#define BRW_SURFACEFORMAT_L8A8_UNORM 0x114
#define BRW_SURFACEFORMAT_I16_FLOAT 0x115
#define BRW_SURFACEFORMAT_L16_FLOAT 0x116
#define BRW_SURFACEFORMAT_A16_FLOAT 0x117
#define BRW_SURFACEFORMAT_L8A8_UNORM_SRGB 0x118
#define BRW_SURFACEFORMAT_R5G5_SNORM_B6_UNORM 0x119
#define BRW_SURFACEFORMAT_B5G5R5X1_UNORM 0x11A
#define BRW_SURFACEFORMAT_B5G5R5X1_UNORM_SRGB 0x11B
#define BRW_SURFACEFORMAT_R8G8_SSCALED 0x11C
#define BRW_SURFACEFORMAT_R8G8_USCALED 0x11D
#define BRW_SURFACEFORMAT_R16_SSCALED 0x11E
#define BRW_SURFACEFORMAT_R16_USCALED 0x11F
#define BRW_SURFACEFORMAT_R8_UNORM 0x140
#define BRW_SURFACEFORMAT_R8_SNORM 0x141
#define BRW_SURFACEFORMAT_R8_SINT 0x142
#define BRW_SURFACEFORMAT_R8_UINT 0x143
#define BRW_SURFACEFORMAT_A8_UNORM 0x144
#define BRW_SURFACEFORMAT_I8_UNORM 0x145
#define BRW_SURFACEFORMAT_L8_UNORM 0x146
#define BRW_SURFACEFORMAT_P4A4_UNORM 0x147
#define BRW_SURFACEFORMAT_A4P4_UNORM 0x148
#define BRW_SURFACEFORMAT_R8_SSCALED 0x149
#define BRW_SURFACEFORMAT_R8_USCALED 0x14A
#define BRW_SURFACEFORMAT_L8_UNORM_SRGB 0x14C
#define BRW_SURFACEFORMAT_R1_UINT 0x181
#define BRW_SURFACEFORMAT_YCRCB_NORMAL 0x182
#define BRW_SURFACEFORMAT_YCRCB_SWAPUVY 0x183
#define BRW_SURFACEFORMAT_BC1_UNORM 0x186
#define BRW_SURFACEFORMAT_BC2_UNORM 0x187
#define BRW_SURFACEFORMAT_BC3_UNORM 0x188
#define BRW_SURFACEFORMAT_BC4_UNORM 0x189
#define BRW_SURFACEFORMAT_BC5_UNORM 0x18A
#define BRW_SURFACEFORMAT_BC1_UNORM_SRGB 0x18B
#define BRW_SURFACEFORMAT_BC2_UNORM_SRGB 0x18C
#define BRW_SURFACEFORMAT_BC3_UNORM_SRGB 0x18D
#define BRW_SURFACEFORMAT_MONO8 0x18E
#define BRW_SURFACEFORMAT_YCRCB_SWAPUV 0x18F
#define BRW_SURFACEFORMAT_YCRCB_SWAPY 0x190
#define BRW_SURFACEFORMAT_DXT1_RGB 0x191
#define BRW_SURFACEFORMAT_FXT1 0x192
#define BRW_SURFACEFORMAT_R8G8B8_UNORM 0x193
#define BRW_SURFACEFORMAT_R8G8B8_SNORM 0x194
#define BRW_SURFACEFORMAT_R8G8B8_SSCALED 0x195
#define BRW_SURFACEFORMAT_R8G8B8_USCALED 0x196
#define BRW_SURFACEFORMAT_R64G64B64A64_FLOAT 0x197
#define BRW_SURFACEFORMAT_R64G64B64_FLOAT 0x198
#define BRW_SURFACEFORMAT_BC4_SNORM 0x199
#define BRW_SURFACEFORMAT_BC5_SNORM 0x19A
#define BRW_SURFACEFORMAT_R16G16B16_UNORM 0x19C
#define BRW_SURFACEFORMAT_R16G16B16_SNORM 0x19D
#define BRW_SURFACEFORMAT_R16G16B16_SSCALED 0x19E
#define BRW_SURFACEFORMAT_R16G16B16_USCALED 0x19F
#define BRW_SURFACERETURNFORMAT_FLOAT32 0
#define BRW_SURFACERETURNFORMAT_S1 1
#define BRW_SURFACE_1D 0
#define BRW_SURFACE_2D 1
#define BRW_SURFACE_3D 2
#define BRW_SURFACE_CUBE 3
#define BRW_SURFACE_BUFFER 4
#define BRW_SURFACE_NULL 7
#define BRW_TEXCOORDMODE_WRAP 0
#define BRW_TEXCOORDMODE_MIRROR 1
#define BRW_TEXCOORDMODE_CLAMP 2
#define BRW_TEXCOORDMODE_CUBE 3
#define BRW_TEXCOORDMODE_CLAMP_BORDER 4
#define BRW_TEXCOORDMODE_MIRROR_ONCE 5
#define BRW_THREAD_PRIORITY_NORMAL 0
#define BRW_THREAD_PRIORITY_HIGH 1
#define BRW_TILEWALK_XMAJOR 0
#define BRW_TILEWALK_YMAJOR 1
#define BRW_VERTEX_SUBPIXEL_PRECISION_8BITS 0
#define BRW_VERTEX_SUBPIXEL_PRECISION_4BITS 1
/* Execution Unit (EU) defines
*/
#define BRW_ALIGN_1 0
#define BRW_ALIGN_16 1
#define BRW_ADDRESS_DIRECT 0
#define BRW_ADDRESS_REGISTER_INDIRECT_REGISTER 1
#define BRW_CHANNEL_X 0
#define BRW_CHANNEL_Y 1
#define BRW_CHANNEL_Z 2
#define BRW_CHANNEL_W 3
#define BRW_COMPRESSION_NONE 0
#define BRW_COMPRESSION_2NDHALF 1
#define BRW_COMPRESSION_COMPRESSED 2
#define BRW_CONDITIONAL_NONE 0
#define BRW_CONDITIONAL_Z 1
#define BRW_CONDITIONAL_NZ 2
#define BRW_CONDITIONAL_EQ 1 /* Z */
#define BRW_CONDITIONAL_NEQ 2 /* NZ */
#define BRW_CONDITIONAL_G 3
#define BRW_CONDITIONAL_GE 4
#define BRW_CONDITIONAL_L 5
#define BRW_CONDITIONAL_LE 6
#define BRW_CONDITIONAL_R 7
#define BRW_CONDITIONAL_O 8
#define BRW_CONDITIONAL_U 9
#define BRW_DEBUG_NONE 0
#define BRW_DEBUG_BREAKPOINT 1
#define BRW_DEPENDENCY_NORMAL 0
#define BRW_DEPENDENCY_NOTCLEARED 1
#define BRW_DEPENDENCY_NOTCHECKED 2
#define BRW_DEPENDENCY_DISABLE 3
#define BRW_EXECUTE_1 0
#define BRW_EXECUTE_2 1
#define BRW_EXECUTE_4 2
#define BRW_EXECUTE_8 3
#define BRW_EXECUTE_16 4
#define BRW_EXECUTE_32 5
#define BRW_HORIZONTAL_STRIDE_0 0
#define BRW_HORIZONTAL_STRIDE_1 1
#define BRW_HORIZONTAL_STRIDE_2 2
#define BRW_HORIZONTAL_STRIDE_4 3
#define BRW_INSTRUCTION_NORMAL 0
#define BRW_INSTRUCTION_SATURATE 1
#define BRW_MASK_ENABLE 0
#define BRW_MASK_DISABLE 1
#define BRW_OPCODE_MOV 1
#define BRW_OPCODE_SEL 2
#define BRW_OPCODE_NOT 4
#define BRW_OPCODE_AND 5
#define BRW_OPCODE_OR 6
#define BRW_OPCODE_XOR 7
#define BRW_OPCODE_SHR 8
#define BRW_OPCODE_SHL 9
#define BRW_OPCODE_RSR 10
#define BRW_OPCODE_RSL 11
#define BRW_OPCODE_ASR 12
#define BRW_OPCODE_CMP 16
#define BRW_OPCODE_CMPN 17
#define BRW_OPCODE_JMPI 32
#define BRW_OPCODE_IF 34
#define BRW_OPCODE_IFF 35
#define BRW_OPCODE_ELSE 36
#define BRW_OPCODE_ENDIF 37
#define BRW_OPCODE_DO 38
#define BRW_OPCODE_WHILE 39
#define BRW_OPCODE_BREAK 40
#define BRW_OPCODE_CONTINUE 41
#define BRW_OPCODE_HALT 42
#define BRW_OPCODE_MSAVE 44
#define BRW_OPCODE_MRESTORE 45
#define BRW_OPCODE_PUSH 46
#define BRW_OPCODE_POP 47
#define BRW_OPCODE_WAIT 48
#define BRW_OPCODE_SEND 49
#define BRW_OPCODE_ADD 64
#define BRW_OPCODE_MUL 65
#define BRW_OPCODE_AVG 66
#define BRW_OPCODE_FRC 67
#define BRW_OPCODE_RNDU 68
#define BRW_OPCODE_RNDD 69
#define BRW_OPCODE_RNDE 70
#define BRW_OPCODE_RNDZ 71
#define BRW_OPCODE_MAC 72
#define BRW_OPCODE_MACH 73
#define BRW_OPCODE_LZD 74
#define BRW_OPCODE_SAD2 80
#define BRW_OPCODE_SADA2 81
#define BRW_OPCODE_DP4 84
#define BRW_OPCODE_DPH 85
#define BRW_OPCODE_DP3 86
#define BRW_OPCODE_DP2 87
#define BRW_OPCODE_DPA2 88
#define BRW_OPCODE_LINE 89
#define BRW_OPCODE_NOP 126
#define BRW_PREDICATE_NONE 0
#define BRW_PREDICATE_NORMAL 1
#define BRW_PREDICATE_ALIGN1_ANYV 2
#define BRW_PREDICATE_ALIGN1_ALLV 3
#define BRW_PREDICATE_ALIGN1_ANY2H 4
#define BRW_PREDICATE_ALIGN1_ALL2H 5
#define BRW_PREDICATE_ALIGN1_ANY4H 6
#define BRW_PREDICATE_ALIGN1_ALL4H 7
#define BRW_PREDICATE_ALIGN1_ANY8H 8
#define BRW_PREDICATE_ALIGN1_ALL8H 9
#define BRW_PREDICATE_ALIGN1_ANY16H 10
#define BRW_PREDICATE_ALIGN1_ALL16H 11
#define BRW_PREDICATE_ALIGN16_REPLICATE_X 2
#define BRW_PREDICATE_ALIGN16_REPLICATE_Y 3
#define BRW_PREDICATE_ALIGN16_REPLICATE_Z 4
#define BRW_PREDICATE_ALIGN16_REPLICATE_W 5
#define BRW_PREDICATE_ALIGN16_ANY4H 6
#define BRW_PREDICATE_ALIGN16_ALL4H 7
#define BRW_ARCHITECTURE_REGISTER_FILE 0
#define BRW_GENERAL_REGISTER_FILE 1
#define BRW_MESSAGE_REGISTER_FILE 2
#define BRW_IMMEDIATE_VALUE 3
#define BRW_REGISTER_TYPE_UD 0
#define BRW_REGISTER_TYPE_D 1
#define BRW_REGISTER_TYPE_UW 2
#define BRW_REGISTER_TYPE_W 3
#define BRW_REGISTER_TYPE_UB 4
#define BRW_REGISTER_TYPE_B 5
#define BRW_REGISTER_TYPE_VF 5 /* packed float vector, immediates only? */
#define BRW_REGISTER_TYPE_HF 6
#define BRW_REGISTER_TYPE_V 6 /* packed int vector, immediates only, uword dest only */
#define BRW_REGISTER_TYPE_F 7
#define BRW_ARF_NULL 0x00
#define BRW_ARF_ADDRESS 0x10
#define BRW_ARF_ACCUMULATOR 0x20
#define BRW_ARF_FLAG 0x30
#define BRW_ARF_MASK 0x40
#define BRW_ARF_MASK_STACK 0x50
#define BRW_ARF_MASK_STACK_DEPTH 0x60
#define BRW_ARF_STATE 0x70
#define BRW_ARF_CONTROL 0x80
#define BRW_ARF_NOTIFICATION_COUNT 0x90
#define BRW_ARF_IP 0xA0
#define BRW_AMASK 0
#define BRW_IMASK 1
#define BRW_LMASK 2
#define BRW_CMASK 3
#define BRW_THREAD_NORMAL 0
#define BRW_THREAD_ATOMIC 1
#define BRW_THREAD_SWITCH 2
#define BRW_VERTICAL_STRIDE_0 0
#define BRW_VERTICAL_STRIDE_1 1
#define BRW_VERTICAL_STRIDE_2 2
#define BRW_VERTICAL_STRIDE_4 3
#define BRW_VERTICAL_STRIDE_8 4
#define BRW_VERTICAL_STRIDE_16 5
#define BRW_VERTICAL_STRIDE_32 6
#define BRW_VERTICAL_STRIDE_64 7
#define BRW_VERTICAL_STRIDE_128 8
#define BRW_VERTICAL_STRIDE_256 9
#define BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL 0xF
#define BRW_WIDTH_1 0
#define BRW_WIDTH_2 1
#define BRW_WIDTH_4 2
#define BRW_WIDTH_8 3
#define BRW_WIDTH_16 4
#define BRW_STATELESS_BUFFER_BOUNDARY_1K 0
#define BRW_STATELESS_BUFFER_BOUNDARY_2K 1
#define BRW_STATELESS_BUFFER_BOUNDARY_4K 2
#define BRW_STATELESS_BUFFER_BOUNDARY_8K 3
#define BRW_STATELESS_BUFFER_BOUNDARY_16K 4
#define BRW_STATELESS_BUFFER_BOUNDARY_32K 5
#define BRW_STATELESS_BUFFER_BOUNDARY_64K 6
#define BRW_STATELESS_BUFFER_BOUNDARY_128K 7
#define BRW_STATELESS_BUFFER_BOUNDARY_256K 8
#define BRW_STATELESS_BUFFER_BOUNDARY_512K 9
#define BRW_STATELESS_BUFFER_BOUNDARY_1M 10
#define BRW_STATELESS_BUFFER_BOUNDARY_2M 11
#define BRW_POLYGON_FACING_FRONT 0
#define BRW_POLYGON_FACING_BACK 1
#define BRW_MESSAGE_TARGET_NULL 0
#define BRW_MESSAGE_TARGET_MATH 1
#define BRW_MESSAGE_TARGET_SAMPLER 2
#define BRW_MESSAGE_TARGET_GATEWAY 3
#define BRW_MESSAGE_TARGET_DATAPORT_READ 4
#define BRW_MESSAGE_TARGET_DATAPORT_WRITE 5
#define BRW_MESSAGE_TARGET_URB 6
#define BRW_MESSAGE_TARGET_THREAD_SPAWNER 7
#define BRW_SAMPLER_RETURN_FORMAT_FLOAT32 0
#define BRW_SAMPLER_RETURN_FORMAT_UINT32 2
#define BRW_SAMPLER_RETURN_FORMAT_SINT32 3
#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE 0
#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE 0
#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS 0
#define BRW_SAMPLER_MESSAGE_SIMD8_KILLPIX 1
#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD 1
#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD 1
#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS 2
#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS 2
#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE 0
#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE 2
#define BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO 2
#define BRW_SAMPLER_MESSAGE_SIMD8_RESINFO 2
#define BRW_SAMPLER_MESSAGE_SIMD16_RESINFO 2
#define BRW_SAMPLER_MESSAGE_SIMD4X2_LD 3
#define BRW_SAMPLER_MESSAGE_SIMD8_LD 3
#define BRW_SAMPLER_MESSAGE_SIMD16_LD 3
#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG 0
#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_IGDNG 0
#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_IGDNG 0
#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG 1
#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_BIAS_IGDNG 1
#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS_IGDNG 1
#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_IGDNG 2
#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_IGDNG 2
#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD_IGDNG 2
#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG 3
#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE_IGDNG 3
#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE_IGDNG 3
/* for IGDNG only */
#define BRW_SAMPLER_SIMD_MODE_SIMD4X2 0
#define BRW_SAMPLER_SIMD_MODE_SIMD8 1
#define BRW_SAMPLER_SIMD_MODE_SIMD16 2
#define BRW_SAMPLER_SIMD_MODE_SIMD32_64 3
#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW 0
#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH 1
#define BRW_DATAPORT_OWORD_BLOCK_2_OWORDS 2
#define BRW_DATAPORT_OWORD_BLOCK_4_OWORDS 3
#define BRW_DATAPORT_OWORD_BLOCK_8_OWORDS 4
#define BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD 0
#define BRW_DATAPORT_OWORD_DUAL_BLOCK_4OWORDS 2
#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS 2
#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS 3
#define BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ 0
#define BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 1
#define BRW_DATAPORT_READ_MESSAGE_DWORD_BLOCK_READ 2
#define BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 3
#define BRW_DATAPORT_READ_TARGET_DATA_CACHE 0
#define BRW_DATAPORT_READ_TARGET_RENDER_CACHE 1
#define BRW_DATAPORT_READ_TARGET_SAMPLER_CACHE 2
#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE 0
#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED 1
#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01 2
#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23 3
#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01 4
#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE 0
#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE 1
#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_BLOCK_WRITE 2
#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE 3
#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE 4
#define BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VERTEX_BUFFER_WRITE 5
#define BRW_DATAPORT_WRITE_MESSAGE_FLUSH_RENDER_CACHE 7
#define BRW_MATH_FUNCTION_INV 1
#define BRW_MATH_FUNCTION_LOG 2
#define BRW_MATH_FUNCTION_EXP 3
#define BRW_MATH_FUNCTION_SQRT 4
#define BRW_MATH_FUNCTION_RSQ 5
#define BRW_MATH_FUNCTION_SIN 6 /* was 7 */
#define BRW_MATH_FUNCTION_COS 7 /* was 8 */
#define BRW_MATH_FUNCTION_SINCOS 8 /* was 6 */
#define BRW_MATH_FUNCTION_TAN 9
#define BRW_MATH_FUNCTION_POW 10
#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER 11
#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT 12
#define BRW_MATH_FUNCTION_INT_DIV_REMAINDER 13
#define BRW_MATH_INTEGER_UNSIGNED 0
#define BRW_MATH_INTEGER_SIGNED 1
#define BRW_MATH_PRECISION_FULL 0
#define BRW_MATH_PRECISION_PARTIAL 1
#define BRW_MATH_SATURATE_NONE 0
#define BRW_MATH_SATURATE_SATURATE 1
#define BRW_MATH_DATA_VECTOR 0
#define BRW_MATH_DATA_SCALAR 1
#define BRW_URB_OPCODE_WRITE 0
#define BRW_URB_SWIZZLE_NONE 0
#define BRW_URB_SWIZZLE_INTERLEAVE 1
#define BRW_URB_SWIZZLE_TRANSPOSE 2
#define BRW_SCRATCH_SPACE_SIZE_1K 0
#define BRW_SCRATCH_SPACE_SIZE_2K 1
#define BRW_SCRATCH_SPACE_SIZE_4K 2
#define BRW_SCRATCH_SPACE_SIZE_8K 3
#define BRW_SCRATCH_SPACE_SIZE_16K 4
#define BRW_SCRATCH_SPACE_SIZE_32K 5
#define BRW_SCRATCH_SPACE_SIZE_64K 6
#define BRW_SCRATCH_SPACE_SIZE_128K 7
#define BRW_SCRATCH_SPACE_SIZE_256K 8
#define BRW_SCRATCH_SPACE_SIZE_512K 9
#define BRW_SCRATCH_SPACE_SIZE_1M 10
#define BRW_SCRATCH_SPACE_SIZE_2M 11
#define CMD_URB_FENCE 0x6000
#define CMD_CS_URB_STATE 0x6001
#define CMD_CONST_BUFFER 0x6002
#define CMD_STATE_BASE_ADDRESS 0x6101
#define CMD_STATE_INSN_POINTER 0x6102
#define CMD_PIPELINE_SELECT_965 0x6104
#define CMD_PIPELINE_SELECT_GM45 0x6904
#define CMD_PIPELINED_STATE_POINTERS 0x7800
#define CMD_BINDING_TABLE_PTRS 0x7801
#define CMD_VERTEX_BUFFER 0x7808
# define BRW_VB0_INDEX_SHIFT 27
# define BRW_VB0_ACCESS_VERTEXDATA (0 << 26)
# define BRW_VB0_ACCESS_INSTANCEDATA (1 << 26)
# define BRW_VB0_PITCH_SHIFT 0
#define CMD_VERTEX_ELEMENT 0x7809
# define BRW_VE0_INDEX_SHIFT 27
# define BRW_VE0_FORMAT_SHIFT 16
# define BRW_VE0_VALID (1 << 26)
# define BRW_VE0_SRC_OFFSET_SHIFT 0
# define BRW_VE1_COMPONENT_NOSTORE 0
# define BRW_VE1_COMPONENT_STORE_SRC 1
# define BRW_VE1_COMPONENT_STORE_0 2
# define BRW_VE1_COMPONENT_STORE_1_FLT 3
# define BRW_VE1_COMPONENT_STORE_1_INT 4
# define BRW_VE1_COMPONENT_STORE_VID 5
# define BRW_VE1_COMPONENT_STORE_IID 6
# define BRW_VE1_COMPONENT_STORE_PID 7
# define BRW_VE1_COMPONENT_0_SHIFT 28
# define BRW_VE1_COMPONENT_1_SHIFT 24
# define BRW_VE1_COMPONENT_2_SHIFT 20
# define BRW_VE1_COMPONENT_3_SHIFT 16
# define BRW_VE1_DST_OFFSET_SHIFT 0
#define CMD_INDEX_BUFFER 0x780a
#define CMD_VF_STATISTICS_965 0x780b
#define CMD_VF_STATISTICS_GM45 0x680b
#define CMD_DRAW_RECT 0x7900
#define CMD_BLEND_CONSTANT_COLOR 0x7901
#define CMD_CHROMA_KEY 0x7904
#define CMD_DEPTH_BUFFER 0x7905
#define CMD_POLY_STIPPLE_OFFSET 0x7906
#define CMD_POLY_STIPPLE_PATTERN 0x7907
#define CMD_LINE_STIPPLE_PATTERN 0x7908
#define CMD_GLOBAL_DEPTH_OFFSET_CLAMP 0x7909
#define CMD_AA_LINE_PARAMETERS 0x790a
#define CMD_PIPE_CONTROL 0x7a00
#define CMD_3D_PRIM 0x7b00
#define CMD_MI_FLUSH 0x0200
/* Various values from the R0 vertex header:
*/
#define R02_PRIM_END 0x1
#define R02_PRIM_START 0x2
#include "intel_chipset.h"
#define BRW_IS_G4X(brw) (IS_G4X((brw)->intel.intelScreen->deviceID))
#define BRW_IS_IGDNG(brw) (IS_IGDNG((brw)->intel.intelScreen->deviceID))
#define BRW_IS_965(brw) (!(BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)))
#define CMD_PIPELINE_SELECT(brw) ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965)
#define CMD_VF_STATISTICS(brw) ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965)
#define URB_SIZES(brw) (BRW_IS_IGDNG(brw) ? 1024 : \
(BRW_IS_G4X(brw) ? 384 : 256)) /* 512 bit units */
#endif


@ -0,0 +1,903 @@
/*
* Copyright © 2008 Keith Packard
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
* the above copyright notice appear in all copies and that both that copyright
* notice and this permission notice appear in supporting documentation, and
* that the name of the copyright holders not be used in advertising or
* publicity pertaining to distribution of the software without specific,
* written prior permission. The copyright holders make no representations
* about the suitability of this software for any purpose. It is provided "as
* is" without express or implied warranty.
*
* THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
* INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
* EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
* CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
* DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
* TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THIS SOFTWARE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <getopt.h>
#include <unistd.h>
#include <stdarg.h>
#include "main/mtypes.h"
#include "brw_context.h"
#include "brw_defines.h"
struct {
char *name;
int nsrc;
int ndst;
} opcode[128] = {
[BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1 },
[BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1 },
[BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1 },
[BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1 },
[BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1 },
[BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1 },
[BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1 },
[BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
[BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1 },
[BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1 },
[BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1 },
[BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1 },
[BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
[BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
[BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
[BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 },
[BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
[BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
[BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1 },
[BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1 },
[BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1 },
[BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1 },
[BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1 },
[BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1 },
[BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1 },
[BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1 },
[BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 },
[BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
[BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
[BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
[BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
[BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 },
[BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
[BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 1, .ndst = 1 },
[BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 1, .ndst = 0 },
[BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
[BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 1, .ndst = 0 },
[BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
[BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
[BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
[BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
[BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 },
[BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
[BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
[BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
[BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 },
};
char *conditional_modifier[16] = {
[BRW_CONDITIONAL_NONE] = "",
[BRW_CONDITIONAL_Z] = ".e",
[BRW_CONDITIONAL_NZ] = ".ne",
[BRW_CONDITIONAL_G] = ".g",
[BRW_CONDITIONAL_GE] = ".ge",
[BRW_CONDITIONAL_L] = ".l",
[BRW_CONDITIONAL_LE] = ".le",
[BRW_CONDITIONAL_R] = ".r",
[BRW_CONDITIONAL_O] = ".o",
[BRW_CONDITIONAL_U] = ".u",
};
char *negate[2] = {
[0] = "",
[1] = "-",
};
char *_abs[2] = {
[0] = "",
[1] = "(abs)",
};
char *vert_stride[16] = {
[0] = "0",
[1] = "1",
[2] = "2",
[3] = "4",
[4] = "8",
[5] = "16",
[6] = "32",
[15] = "VxH",
};
char *width[8] = {
[0] = "1",
[1] = "2",
[2] = "4",
[3] = "8",
[4] = "16",
};
char *horiz_stride[4] = {
[0] = "0",
[1] = "1",
[2] = "2",
[3] = "4"
};
char *chan_sel[4] = {
[0] = "x",
[1] = "y",
[2] = "z",
[3] = "w",
};
char *dest_condmod[16] = {
};
char *debug_ctrl[2] = {
[0] = "",
[1] = ".breakpoint"
};
char *saturate[2] = {
[0] = "",
[1] = ".sat"
};
char *exec_size[8] = {
[0] = "1",
[1] = "2",
[2] = "4",
[3] = "8",
[4] = "16",
[5] = "32"
};
char *pred_inv[2] = {
[0] = "+",
[1] = "-"
};
char *pred_ctrl_align16[16] = {
[1] = "",
[2] = ".x",
[3] = ".y",
[4] = ".z",
[5] = ".w",
[6] = ".any4h",
[7] = ".all4h",
};
char *pred_ctrl_align1[16] = {
[1] = "",
[2] = ".anyv",
[3] = ".allv",
[4] = ".any2h",
[5] = ".all2h",
[6] = ".any4h",
[7] = ".all4h",
[8] = ".any8h",
[9] = ".all8h",
[10] = ".any16h",
[11] = ".all16h",
};
char *thread_ctrl[4] = {
[0] = "",
[2] = "switch"
};
char *compr_ctrl[4] = {
[0] = "",
[1] = "sechalf",
[2] = "compr",
};
char *dep_ctrl[4] = {
[0] = "",
[1] = "NoDDClr",
[2] = "NoDDChk",
[3] = "NoDDClr,NoDDChk",
};
char *mask_ctrl[4] = {
[0] = "",
[1] = "nomask",
};
char *access_mode[2] = {
[0] = "align1",
[1] = "align16",
};
char *reg_encoding[8] = {
[0] = "UD",
[1] = "D",
[2] = "UW",
[3] = "W",
[4] = "UB",
[5] = "B",
[7] = "F"
};
char *imm_encoding[8] = {
[0] = "UD",
[1] = "D",
[2] = "UW",
[3] = "W",
[5] = "VF",
[5] = "V",
[7] = "F"
};
char *reg_file[4] = {
[0] = "A",
[1] = "g",
[2] = "m",
[3] = "imm",
};
char *writemask[16] = {
[0x0] = ".",
[0x1] = ".x",
[0x2] = ".y",
[0x3] = ".xy",
[0x4] = ".z",
[0x5] = ".xz",
[0x6] = ".yz",
[0x7] = ".xyz",
[0x8] = ".w",
[0x9] = ".xw",
[0xa] = ".yw",
[0xb] = ".xyw",
[0xc] = ".zw",
[0xd] = ".xzw",
[0xe] = ".yzw",
[0xf] = "",
};
char *end_of_thread[2] = {
[0] = "",
[1] = "EOT"
};
char *target_function[16] = {
[BRW_MESSAGE_TARGET_NULL] = "null",
[BRW_MESSAGE_TARGET_MATH] = "math",
[BRW_MESSAGE_TARGET_SAMPLER] = "sampler",
[BRW_MESSAGE_TARGET_GATEWAY] = "gateway",
[BRW_MESSAGE_TARGET_DATAPORT_READ] = "read",
[BRW_MESSAGE_TARGET_DATAPORT_WRITE] = "write",
[BRW_MESSAGE_TARGET_URB] = "urb",
[BRW_MESSAGE_TARGET_THREAD_SPAWNER] = "thread_spawner"
};
char *math_function[16] = {
[BRW_MATH_FUNCTION_INV] = "inv",
[BRW_MATH_FUNCTION_LOG] = "log",
[BRW_MATH_FUNCTION_EXP] = "exp",
[BRW_MATH_FUNCTION_SQRT] = "sqrt",
[BRW_MATH_FUNCTION_RSQ] = "rsq",
[BRW_MATH_FUNCTION_SIN] = "sin",
[BRW_MATH_FUNCTION_COS] = "cos",
[BRW_MATH_FUNCTION_SINCOS] = "sincos",
[BRW_MATH_FUNCTION_TAN] = "tan",
[BRW_MATH_FUNCTION_POW] = "pow",
[BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER] = "intdivmod",
[BRW_MATH_FUNCTION_INT_DIV_QUOTIENT] = "intdiv",
[BRW_MATH_FUNCTION_INT_DIV_REMAINDER] = "intmod",
};
char *math_saturate[2] = {
[0] = "",
[1] = "sat"
};
char *math_signed[2] = {
[0] = "",
[1] = "signed"
};
char *math_scalar[2] = {
[0] = "",
[1] = "scalar"
};
char *math_precision[2] = {
[0] = "",
[1] = "partial_precision"
};
char *urb_swizzle[4] = {
[BRW_URB_SWIZZLE_NONE] = "",
[BRW_URB_SWIZZLE_INTERLEAVE] = "interleave",
[BRW_URB_SWIZZLE_TRANSPOSE] = "transpose",
};
char *urb_allocate[2] = {
[0] = "",
[1] = "allocate"
};
char *urb_used[2] = {
[0] = "",
[1] = "used"
};
char *urb_complete[2] = {
[0] = "",
[1] = "complete"
};
char *sampler_target_format[4] = {
[0] = "F",
[2] = "UD",
[3] = "D"
};
static int column;
static int string (FILE *file, char *string)
{
fputs (string, file);
column += strlen (string);
return 0;
}
static int format (FILE *f, char *format, ...)
{
char buf[1024];
va_list args;
va_start (args, format);
vsnprintf (buf, sizeof (buf) - 1, format, args);
va_end (args);
string (f, buf);
return 0;
}
static int newline (FILE *f)
{
putc ('\n', f);
column = 0;
return 0;
}
static int pad (FILE *f, int c)
{
do
string (f, " ");
while (column < c);
return 0;
}
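/* Look up field value 'id' in the string table 'ctrl' and print its name,
 * prefixed with a space when *space is set; flags and reports any value
 * the table doesn't name.
 */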
static int control (FILE *file, char *name, char *ctrl[], GLuint id, int *space)
{
if (!ctrl[id]) {
fprintf (file, "*** invalid %s value %d ",
name, id);
return 1;
}
if (ctrl[id][0])
{
if (space && *space)
string (file, " ");
string (file, ctrl[id]);
if (space)
*space = 1;
}
return 0;
}
static int print_opcode (FILE *file, int id)
{
if (!opcode[id].name) {
format (file, "*** invalid opcode value %d ", id);
return 1;
}
string (file, opcode[id].name);
return 0;
}
static int reg (FILE *file, GLuint _reg_file, GLuint _reg_nr)
{
int err = 0;
if (_reg_file == BRW_ARCHITECTURE_REGISTER_FILE) {
switch (_reg_nr & 0xf0) {
case BRW_ARF_NULL:
string (file, "null");
return -1;
case BRW_ARF_ADDRESS:
format (file, "a%d", _reg_nr & 0x0f);
break;
case BRW_ARF_ACCUMULATOR:
format (file, "acc%d", _reg_nr & 0x0f);
break;
case BRW_ARF_MASK:
format (file, "mask%d", _reg_nr & 0x0f);
break;
case BRW_ARF_MASK_STACK:
format (file, "msd%d", _reg_nr & 0x0f);
break;
case BRW_ARF_STATE:
format (file, "sr%d", _reg_nr & 0x0f);
break;
case BRW_ARF_CONTROL:
format (file, "cr%d", _reg_nr & 0x0f);
break;
case BRW_ARF_NOTIFICATION_COUNT:
format (file, "n%d", _reg_nr & 0x0f);
break;
case BRW_ARF_IP:
string (file, "ip");
return -1;
break;
default:
format (file, "ARF%d", _reg_nr);
break;
}
} else {
err |= control (file, "src reg file", reg_file, _reg_file, NULL);
format (file, "%d", _reg_nr);
}
return err;
}
static int dest (FILE *file, struct brw_instruction *inst)
{
int err = 0;
if (inst->header.access_mode == BRW_ALIGN_1)
{
if (inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT)
{
err |= reg (file, inst->bits1.da1.dest_reg_file, inst->bits1.da1.dest_reg_nr);
if (err == -1)
return 0;
if (inst->bits1.da1.dest_subreg_nr)
format (file, ".%d", inst->bits1.da1.dest_subreg_nr);
format (file, "<%d>", inst->bits1.da1.dest_horiz_stride);
err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da1.dest_reg_type, NULL);
}
else
{
string (file, "g[a0");
if (inst->bits1.ia1.dest_subreg_nr)
format (file, ".%d", inst->bits1.ia1.dest_subreg_nr);
if (inst->bits1.ia1.dest_indirect_offset)
format (file, " %d", inst->bits1.ia1.dest_indirect_offset);
string (file, "]");
format (file, "<%d>", inst->bits1.ia1.dest_horiz_stride);
err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.ia1.dest_reg_type, NULL);
}
}
else
{
if (inst->bits1.da16.dest_address_mode == BRW_ADDRESS_DIRECT)
{
err |= reg (file, inst->bits1.da16.dest_reg_file, inst->bits1.da16.dest_reg_nr);
if (err == -1)
return 0;
if (inst->bits1.da16.dest_subreg_nr)
format (file, ".%d", inst->bits1.da16.dest_subreg_nr);
string (file, "<1>");
err |= control (file, "writemask", writemask, inst->bits1.da16.dest_writemask, NULL);
err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da16.dest_reg_type, NULL);
}
else
{
err = 1;
string (file, "Indirect align16 address mode not supported");
}
}
return 0;
}
static int src_align1_region (FILE *file,
GLuint _vert_stride, GLuint _width, GLuint _horiz_stride)
{
int err = 0;
string (file, "<");
err |= control (file, "vert stride", vert_stride, _vert_stride, NULL);
string (file, ",");
err |= control (file, "width", width, _width, NULL);
string (file, ",");
err |= control (file, "horiz_stride", horiz_stride, _horiz_stride, NULL);
string (file, ">");
return err;
}
static int src_da1 (FILE *file, GLuint type, GLuint _reg_file,
GLuint _vert_stride, GLuint _width, GLuint _horiz_stride,
GLuint reg_num, GLuint sub_reg_num, GLuint __abs, GLuint _negate)
{
int err = 0;
err |= control (file, "negate", negate, _negate, NULL);
err |= control (file, "abs", _abs, __abs, NULL);
err |= reg (file, _reg_file, reg_num);
if (err == -1)
return 0;
if (sub_reg_num)
format (file, ".%d", sub_reg_num);
src_align1_region (file, _vert_stride, _width, _horiz_stride);
err |= control (file, "src reg encoding", reg_encoding, type, NULL);
return err;
}
static int src_ia1 (FILE *file,
GLuint type,
GLuint _reg_file,
GLint _addr_imm,
GLuint _addr_subreg_nr,
GLuint _negate,
GLuint __abs,
GLuint _addr_mode,
GLuint _horiz_stride,
GLuint _width,
GLuint _vert_stride)
{
int err = 0;
err |= control (file, "negate", negate, _negate, NULL);
err |= control (file, "abs", _abs, __abs, NULL);
string (file, "g[a0");
if (_addr_subreg_nr)
format (file, ".%d", _addr_subreg_nr);
if (_addr_imm)
format (file, " %d", _addr_imm);
string (file, "]");
src_align1_region (file, _vert_stride, _width, _horiz_stride);
err |= control (file, "src reg encoding", reg_encoding, type, NULL);
return err;
}
static int src_da16 (FILE *file,
GLuint _reg_type,
GLuint _reg_file,
GLuint _vert_stride,
GLuint _reg_nr,
GLuint _subreg_nr,
GLuint __abs,
GLuint _negate,
GLuint swz_x,
GLuint swz_y,
GLuint swz_z,
GLuint swz_w)
{
int err = 0;
err |= control (file, "negate", negate, _negate, NULL);
err |= control (file, "abs", _abs, __abs, NULL);
err |= reg (file, _reg_file, _reg_nr);
if (err == -1)
return 0;
if (_subreg_nr)
format (file, ".%d", _subreg_nr);
string (file, "<");
err |= control (file, "vert stride", vert_stride, _vert_stride, NULL);
string (file, ",1,1>");
err |= control (file, "src da16 reg type", reg_encoding, _reg_type, NULL);
/*
* Three kinds of swizzle display:
* identity - nothing printed
* 1->all - print the single channel
* 1->1 - print the mapping
*/
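/* For example (operands purely illustrative): an identity swizzle prints
 * nothing, a broadcast of channel Y prints "g4<4,4,1>F.y", and a full
 * remap prints "g4<4,4,1>F.wzyx".
 */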
if (swz_x == BRW_CHANNEL_X &&
swz_y == BRW_CHANNEL_Y &&
swz_z == BRW_CHANNEL_Z &&
swz_w == BRW_CHANNEL_W)
{
;
}
else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w)
{
string (file, ".");
err |= control (file, "channel select", chan_sel, swz_x, NULL);
}
else
{
string (file, ".");
err |= control (file, "channel select", chan_sel, swz_x, NULL);
err |= control (file, "channel select", chan_sel, swz_y, NULL);
err |= control (file, "channel select", chan_sel, swz_z, NULL);
err |= control (file, "channel select", chan_sel, swz_w, NULL);
}
return err;
}
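/* Print an immediate operand according to its register type; vector-float
 * immediates are only labelled here rather than decoded.
 */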
static int imm (FILE *file, GLuint type, struct brw_instruction *inst) {
switch (type) {
case BRW_REGISTER_TYPE_UD:
format (file, "0x%08xUD", inst->bits3.ud);
break;
case BRW_REGISTER_TYPE_D:
format (file, "%dD", inst->bits3.d);
break;
case BRW_REGISTER_TYPE_UW:
format (file, "0x%04xUW", (uint16_t) inst->bits3.ud);
break;
case BRW_REGISTER_TYPE_W:
format (file, "%dW", (int16_t) inst->bits3.d);
break;
case BRW_REGISTER_TYPE_UB:
format (file, "0x%02xUB", (int8_t) inst->bits3.ud);
break;
case BRW_REGISTER_TYPE_VF:
format (file, "Vector Float");
break;
case BRW_REGISTER_TYPE_V:
format (file, "0x%08xV", inst->bits3.ud);
break;
case BRW_REGISTER_TYPE_F:
format (file, "%-gF", inst->bits3.f);
}
return 0;
}
static int src0 (FILE *file, struct brw_instruction *inst)
{
if (inst->bits1.da1.src0_reg_file == BRW_IMMEDIATE_VALUE)
return imm (file, inst->bits1.da1.src0_reg_type,
inst);
else if (inst->header.access_mode == BRW_ALIGN_1)
{
if (inst->bits2.da1.src0_address_mode == BRW_ADDRESS_DIRECT)
{
return src_da1 (file,
inst->bits1.da1.src0_reg_type,
inst->bits1.da1.src0_reg_file,
inst->bits2.da1.src0_vert_stride,
inst->bits2.da1.src0_width,
inst->bits2.da1.src0_horiz_stride,
inst->bits2.da1.src0_reg_nr,
inst->bits2.da1.src0_subreg_nr,
inst->bits2.da1.src0_abs,
inst->bits2.da1.src0_negate);
}
else
{
return src_ia1 (file,
inst->bits1.ia1.src0_reg_type,
inst->bits1.ia1.src0_reg_file,
inst->bits2.ia1.src0_indirect_offset,
inst->bits2.ia1.src0_subreg_nr,
inst->bits2.ia1.src0_negate,
inst->bits2.ia1.src0_abs,
inst->bits2.ia1.src0_address_mode,
inst->bits2.ia1.src0_horiz_stride,
inst->bits2.ia1.src0_width,
inst->bits2.ia1.src0_vert_stride);
}
}
else
{
if (inst->bits2.da16.src0_address_mode == BRW_ADDRESS_DIRECT)
{
return src_da16 (file,
inst->bits1.da16.src0_reg_type,
inst->bits1.da16.src0_reg_file,
inst->bits2.da16.src0_vert_stride,
inst->bits2.da16.src0_reg_nr,
inst->bits2.da16.src0_subreg_nr,
inst->bits2.da16.src0_abs,
inst->bits2.da16.src0_negate,
inst->bits2.da16.src0_swz_x,
inst->bits2.da16.src0_swz_y,
inst->bits2.da16.src0_swz_z,
inst->bits2.da16.src0_swz_w);
}
else
{
string (file, "Indirect align16 address mode not supported");
return 1;
}
}
}
static int src1 (FILE *file, struct brw_instruction *inst)
{
if (inst->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE)
return imm (file, inst->bits1.da1.src1_reg_type,
inst);
else if (inst->header.access_mode == BRW_ALIGN_1)
{
if (inst->bits3.da1.src1_address_mode == BRW_ADDRESS_DIRECT)
{
return src_da1 (file,
inst->bits1.da1.src1_reg_type,
inst->bits1.da1.src1_reg_file,
inst->bits3.da1.src1_vert_stride,
inst->bits3.da1.src1_width,
inst->bits3.da1.src1_horiz_stride,
inst->bits3.da1.src1_reg_nr,
inst->bits3.da1.src1_subreg_nr,
inst->bits3.da1.src1_abs,
inst->bits3.da1.src1_negate);
}
else
{
return src_ia1 (file,
inst->bits1.ia1.src1_reg_type,
inst->bits1.ia1.src1_reg_file,
inst->bits3.ia1.src1_indirect_offset,
inst->bits3.ia1.src1_subreg_nr,
inst->bits3.ia1.src1_negate,
inst->bits3.ia1.src1_abs,
inst->bits3.ia1.src1_address_mode,
inst->bits3.ia1.src1_horiz_stride,
inst->bits3.ia1.src1_width,
inst->bits3.ia1.src1_vert_stride);
}
}
else
{
if (inst->bits3.da16.src1_address_mode == BRW_ADDRESS_DIRECT)
{
return src_da16 (file,
inst->bits1.da16.src1_reg_type,
inst->bits1.da16.src1_reg_file,
inst->bits3.da16.src1_vert_stride,
inst->bits3.da16.src1_reg_nr,
inst->bits3.da16.src1_subreg_nr,
inst->bits3.da16.src1_abs,
inst->bits3.da16.src1_negate,
inst->bits3.da16.src1_swz_x,
inst->bits3.da16.src1_swz_y,
inst->bits3.da16.src1_swz_z,
inst->bits3.da16.src1_swz_w);
}
else
{
string (file, "Indirect align16 address mode not supported");
return 1;
}
}
}
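/* Disassemble a single 965 EU instruction to 'file'.  Returns nonzero if
 * any field used an encoding the tables above don't recognize.
 */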
int brw_disasm (FILE *file, struct brw_instruction *inst)
{
int err = 0;
int space = 0;
if (inst->header.predicate_control) {
string (file, "(");
err |= control (file, "predicate inverse", pred_inv, inst->header.predicate_inverse, NULL);
string (file, "f0");
if (inst->bits2.da1.flag_reg_nr)
format (file, ".%d", inst->bits2.da1.flag_reg_nr);
if (inst->header.access_mode == BRW_ALIGN_1)
err |= control (file, "predicate control align1", pred_ctrl_align1,
inst->header.predicate_control, NULL);
else
err |= control (file, "predicate control align16", pred_ctrl_align16,
inst->header.predicate_control, NULL);
string (file, ") ");
}
err |= print_opcode (file, inst->header.opcode);
err |= control (file, "saturate", saturate, inst->header.saturate, NULL);
err |= control (file, "debug control", debug_ctrl, inst->header.debug_control, NULL);
if (inst->header.opcode != BRW_OPCODE_SEND)
err |= control (file, "conditional modifier", conditional_modifier,
inst->header.destreg__conditionalmod, NULL);
if (inst->header.opcode != BRW_OPCODE_NOP) {
string (file, "(");
err |= control (file, "execution size", exec_size, inst->header.execution_size, NULL);
string (file, ")");
}
if (inst->header.opcode == BRW_OPCODE_SEND)
format (file, " %d", inst->header.destreg__conditionalmod);
if (opcode[inst->header.opcode].ndst > 0) {
pad (file, 16);
err |= dest (file, inst);
}
if (opcode[inst->header.opcode].nsrc > 0) {
pad (file, 32);
err |= src0 (file, inst);
}
if (opcode[inst->header.opcode].nsrc > 1) {
pad (file, 48);
err |= src1 (file, inst);
}
if (inst->header.opcode == BRW_OPCODE_SEND) {
newline (file);
pad (file, 16);
space = 0;
err |= control (file, "target function", target_function,
inst->bits3.generic.msg_target, &space);
switch (inst->bits3.generic.msg_target) {
case BRW_MESSAGE_TARGET_MATH:
err |= control (file, "math function", math_function,
inst->bits3.math.function, &space);
err |= control (file, "math saturate", math_saturate,
inst->bits3.math.saturate, &space);
err |= control (file, "math signed", math_signed,
inst->bits3.math.int_type, &space);
err |= control (file, "math scalar", math_scalar,
inst->bits3.math.data_type, &space);
err |= control (file, "math precision", math_precision,
inst->bits3.math.precision, &space);
break;
case BRW_MESSAGE_TARGET_SAMPLER:
format (file, " (%d, %d, ",
inst->bits3.sampler.binding_table_index,
inst->bits3.sampler.sampler);
err |= control (file, "sampler target format", sampler_target_format,
inst->bits3.sampler.return_format, NULL);
string (file, ")");
break;
case BRW_MESSAGE_TARGET_DATAPORT_WRITE:
format (file, " (%d, %d, %d, %d)",
inst->bits3.dp_write.binding_table_index,
(inst->bits3.dp_write.pixel_scoreboard_clear << 3) |
inst->bits3.dp_write.msg_control,
inst->bits3.dp_write.msg_type,
inst->bits3.dp_write.send_commit_msg);
break;
case BRW_MESSAGE_TARGET_URB:
format (file, " %d", inst->bits3.urb.offset);
space = 1;
err |= control (file, "urb swizzle", urb_swizzle,
inst->bits3.urb.swizzle_control, &space);
err |= control (file, "urb allocate", urb_allocate,
inst->bits3.urb.allocate, &space);
err |= control (file, "urb used", urb_used,
inst->bits3.urb.used, &space);
err |= control (file, "urb complete", urb_complete,
inst->bits3.urb.complete, &space);
break;
case BRW_MESSAGE_TARGET_THREAD_SPAWNER:
break;
default:
format (file, "unsupported target %d", inst->bits3.generic.msg_target);
break;
}
if (space)
string (file, " ");
format (file, "mlen %d",
inst->bits3.generic.msg_length);
format (file, " rlen %d",
inst->bits3.generic.response_length);
}
pad (file, 64);
if (inst->header.opcode != BRW_OPCODE_NOP) {
string (file, "{");
space = 1;
err |= control(file, "access mode", access_mode, inst->header.access_mode, &space);
err |= control (file, "mask control", mask_ctrl, inst->header.mask_control, &space);
err |= control (file, "dependency control", dep_ctrl, inst->header.dependency_control, &space);
err |= control (file, "compression control", compr_ctrl, inst->header.compression_control, &space);
err |= control (file, "thread control", thread_ctrl, inst->header.thread_control, &space);
if (inst->header.opcode == BRW_OPCODE_SEND)
err |= control (file, "end of thread", end_of_thread,
inst->bits3.generic.end_of_thread, &space);
if (space)
string (file, " ");
string (file, "}");
}
string (file, ";");
newline (file);
return err;
}


@ -0,0 +1,493 @@
/**************************************************************************
*
* Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
#include "main/glheader.h"
#include "main/context.h"
#include "main/state.h"
#include "main/enums.h"
#include "tnl/tnl.h"
#include "vbo/vbo_context.h"
#include "swrast/swrast.h"
#include "swrast_setup/swrast_setup.h"
#include "brw_draw.h"
#include "brw_defines.h"
#include "brw_context.h"
#include "brw_state.h"
#include "brw_fallback.h"
#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"
#define FILE_DEBUG_FLAG DEBUG_BATCH
static GLuint prim_to_hw_prim[GL_POLYGON+1] = {
_3DPRIM_POINTLIST,
_3DPRIM_LINELIST,
_3DPRIM_LINELOOP,
_3DPRIM_LINESTRIP,
_3DPRIM_TRILIST,
_3DPRIM_TRISTRIP,
_3DPRIM_TRIFAN,
_3DPRIM_QUADLIST,
_3DPRIM_QUADSTRIP,
_3DPRIM_POLYGON
};
static const GLenum reduced_prim[GL_POLYGON+1] = {
GL_POINTS,
GL_LINES,
GL_LINES,
GL_LINES,
GL_TRIANGLES,
GL_TRIANGLES,
GL_TRIANGLES,
GL_TRIANGLES,
GL_TRIANGLES,
GL_TRIANGLES
};
/* When the primitive changes, set a state bit and re-validate. This isn't
 * the nicest approach; it would be better to make all the programs immune
 * to the active primitive (i.e. cope with all possibilities). That may not
 * be realistic, however.
 */
static GLuint brw_set_prim(struct brw_context *brw, GLenum prim)
{
GLcontext *ctx = &brw->intel.ctx;
if (INTEL_DEBUG & DEBUG_PRIMS)
_mesa_printf("PRIM: %s\n", _mesa_lookup_enum_by_nr(prim));
/* Slight optimization to avoid the GS program when not needed:
*/
if (prim == GL_QUAD_STRIP &&
ctx->Light.ShadeModel != GL_FLAT &&
ctx->Polygon.FrontMode == GL_FILL &&
ctx->Polygon.BackMode == GL_FILL)
prim = GL_TRIANGLE_STRIP;
if (prim != brw->primitive) {
brw->primitive = prim;
brw->state.dirty.brw |= BRW_NEW_PRIMITIVE;
if (reduced_prim[prim] != brw->intel.reduced_primitive) {
brw->intel.reduced_primitive = reduced_prim[prim];
brw->state.dirty.brw |= BRW_NEW_REDUCED_PRIMITIVE;
}
}
return prim_to_hw_prim[prim];
}
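/* Drop trailing vertices that can't form a complete primitive: a quad strip
 * needs at least four vertices and an even count, and independent quads need
 * a multiple of four; everything else passes through unchanged.
 */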
static GLuint trim(GLenum prim, GLuint length)
{
if (prim == GL_QUAD_STRIP)
return length > 3 ? (length - length % 2) : 0;
else if (prim == GL_QUADS)
return length - length % 4;
else
return length;
}
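/* Build a 3DPRIMITIVE packet for one _mesa_prim and write it into the
 * batchbuffer, optionally bracketed by flushes when always_flush_cache
 * debugging is enabled.
 */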
static void brw_emit_prim(struct brw_context *brw,
const struct _mesa_prim *prim,
uint32_t hw_prim)
{
struct brw_3d_primitive prim_packet;
struct intel_context *intel = &brw->intel;
if (INTEL_DEBUG & DEBUG_PRIMS)
_mesa_printf("PRIM: %s %d %d\n", _mesa_lookup_enum_by_nr(prim->mode),
prim->start, prim->count);
prim_packet.header.opcode = CMD_3D_PRIM;
prim_packet.header.length = sizeof(prim_packet)/4 - 2;
prim_packet.header.pad = 0;
prim_packet.header.topology = hw_prim;
prim_packet.header.indexed = prim->indexed;
prim_packet.verts_per_instance = trim(prim->mode, prim->count);
prim_packet.start_vert_location = prim->start;
if (prim->indexed)
prim_packet.start_vert_location += brw->ib.start_vertex_offset;
prim_packet.instance_count = 1;
prim_packet.start_instance_location = 0;
prim_packet.base_vert_location = prim->basevertex;
/* Can't wrap here, since we rely on the validated state. */
brw->no_batch_wrap = GL_TRUE;
/* If we're set to always flush, do it before and after the primitive emit.
* We want to catch both missed flushes that hurt instruction/state cache
* and missed flushes of the render cache as it heads to other parts of
* the GPU besides the draw code.
*/
if (intel->always_flush_cache) {
BEGIN_BATCH(1, IGNORE_CLIPRECTS);
OUT_BATCH(intel->vtbl.flush_cmd());
ADVANCE_BATCH();
}
if (prim_packet.verts_per_instance) {
intel_batchbuffer_data( brw->intel.batch, &prim_packet,
sizeof(prim_packet), LOOP_CLIPRECTS);
}
if (intel->always_flush_cache) {
BEGIN_BATCH(1, IGNORE_CLIPRECTS);
OUT_BATCH(intel->vtbl.flush_cmd());
ADVANCE_BATCH();
}
brw->no_batch_wrap = GL_FALSE;
}
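/* Snapshot the current vertex arrays into brw->vb and flag
 * BRW_NEW_INPUT_DIMENSIONS when any attribute's size has changed.
 */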
static void brw_merge_inputs( struct brw_context *brw,
const struct gl_client_array *arrays[])
{
struct brw_vertex_info old = brw->vb.info;
GLuint i;
for (i = 0; i < VERT_ATTRIB_MAX; i++)
dri_bo_unreference(brw->vb.inputs[i].bo);
memset(&brw->vb.inputs, 0, sizeof(brw->vb.inputs));
memset(&brw->vb.info, 0, sizeof(brw->vb.info));
for (i = 0; i < VERT_ATTRIB_MAX; i++) {
brw->vb.inputs[i].glarray = arrays[i];
brw->vb.inputs[i].attrib = (gl_vert_attrib) i;
if (arrays[i]->StrideB != 0)
brw->vb.info.sizes[i/16] |= (brw->vb.inputs[i].glarray->Size - 1) <<
((i%16) * 2);
}
/* Raise statechanges if input sizes have changed. */
if (memcmp(brw->vb.info.sizes, old.sizes, sizeof(old.sizes)) != 0)
brw->state.dirty.brw |= BRW_NEW_INPUT_DIMENSIONS;
}
/* XXX: could split the primitive list to fall back only on the
* non-conformant primitives.
*/
static GLboolean check_fallbacks( struct brw_context *brw,
const struct _mesa_prim *prim,
GLuint nr_prims )
{
GLcontext *ctx = &brw->intel.ctx;
GLuint i;
/* If we don't require strict OpenGL conformance, never
* use fallbacks. If we're forcing fallbacks, always
* use fallbacks.
*/
if (brw->intel.conformance_mode == 0)
return GL_FALSE;
if (brw->intel.conformance_mode == 2)
return GL_TRUE;
if (ctx->Polygon.SmoothFlag) {
for (i = 0; i < nr_prims; i++)
if (reduced_prim[prim[i].mode] == GL_TRIANGLES)
return GL_TRUE;
}
/* BRW hardware will do AA lines, but they appear to be non-conformant.
* TBD whether we keep this fallback:
*/
if (ctx->Line.SmoothFlag) {
for (i = 0; i < nr_prims; i++)
if (reduced_prim[prim[i].mode] == GL_LINES)
return GL_TRUE;
}
/* Stipple -- these fallbacks could be resolved with a little
* bit of work?
*/
if (ctx->Line.StippleFlag) {
for (i = 0; i < nr_prims; i++) {
/* GS doesn't get enough information to know when to reset
* the stipple counter?!?
*/
if (prim[i].mode == GL_LINE_LOOP || prim[i].mode == GL_LINE_STRIP)
return GL_TRUE;
if (prim[i].mode == GL_POLYGON &&
(ctx->Polygon.FrontMode == GL_LINE ||
ctx->Polygon.BackMode == GL_LINE))
return GL_TRUE;
}
}
if (ctx->Point.SmoothFlag) {
for (i = 0; i < nr_prims; i++)
if (prim[i].mode == GL_POINTS)
return GL_TRUE;
}
/* BRW hardware doesn't handle GL_CLAMP texturing correctly;
* brw_wm_sampler_state:translate_wrap_mode() treats GL_CLAMP
* as GL_CLAMP_TO_EDGE instead. If we're using GL_CLAMP, and
* we want strict conformance, force the fallback.
* Right now, we only do this for 2D textures.
*/
{
int u;
for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
struct gl_texture_unit *texUnit = &ctx->Texture.Unit[u];
if (texUnit->Enabled) {
if (texUnit->Enabled & TEXTURE_1D_BIT) {
if (texUnit->CurrentTex[TEXTURE_1D_INDEX]->WrapS == GL_CLAMP) {
return GL_TRUE;
}
}
if (texUnit->Enabled & TEXTURE_2D_BIT) {
if (texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapS == GL_CLAMP ||
texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapT == GL_CLAMP) {
return GL_TRUE;
}
}
if (texUnit->Enabled & TEXTURE_3D_BIT) {
if (texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapS == GL_CLAMP ||
texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapT == GL_CLAMP ||
texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapR == GL_CLAMP) {
return GL_TRUE;
}
}
}
}
}
/* Nothing stopping us from the fast path now */
return GL_FALSE;
}
/* May fail if out of video memory for texture or vbo upload, or on
* fallback conditions.
*/
static GLboolean brw_try_draw_prims( GLcontext *ctx,
const struct gl_client_array *arrays[],
const struct _mesa_prim *prim,
GLuint nr_prims,
const struct _mesa_index_buffer *ib,
GLuint min_index,
GLuint max_index )
{
struct intel_context *intel = intel_context(ctx);
struct brw_context *brw = brw_context(ctx);
GLboolean retval = GL_FALSE;
GLboolean warn = GL_FALSE;
GLboolean first_time = GL_TRUE;
GLuint i;
if (ctx->NewState)
_mesa_update_state( ctx );
/* We have to validate the textures *before* checking for fallbacks;
* otherwise, the software fallback won't be able to rely on the
* texture state, the firstLevel and lastLevel fields won't be
* set in the intel texture object (they'll both be 0), and the
* software fallback will segfault if it attempts to access any
* texture level other than level 0.
*/
brw_validate_textures( brw );
if (check_fallbacks(brw, prim, nr_prims))
return GL_FALSE;
/* Bind all inputs, derive varying and size information:
*/
brw_merge_inputs( brw, arrays );
brw->ib.ib = ib;
brw->state.dirty.brw |= BRW_NEW_INDICES;
brw->vb.min_index = min_index;
brw->vb.max_index = max_index;
brw->state.dirty.brw |= BRW_NEW_VERTICES;
/* Have to validate state quite late. Will rebuild tnl_program,
* which depends on varying information.
*
* Note this is where brw->vs->prog_data.inputs_read is calculated,
* so can't access it earlier.
*/
LOCK_HARDWARE(intel);
if (!intel->constant_cliprect && intel->driDrawable->numClipRects == 0) {
UNLOCK_HARDWARE(intel);
return GL_TRUE;
}
for (i = 0; i < nr_prims; i++) {
uint32_t hw_prim;
/* Flush the batch if it's approaching full, so that we don't wrap while
* we've got validated state that needs to be in the same batch as the
* primitives. This fraction is just a guess (minimal full state plus
* a primitive is around 512 bytes), and would be better if we had
* an upper bound of how much we might emit in a single
* brw_try_draw_prims().
*/
intel_batchbuffer_require_space(intel->batch, intel->batch->size / 4,
LOOP_CLIPRECTS);
hw_prim = brw_set_prim(brw, prim[i].mode);
if (first_time || (brw->state.dirty.brw & BRW_NEW_PRIMITIVE)) {
first_time = GL_FALSE;
brw_validate_state(brw);
/* Various fallback checks: */
if (brw->intel.Fallback)
goto out;
/* Check that we can fit our state in with our existing batchbuffer, or
* flush otherwise.
*/
if (dri_bufmgr_check_aperture_space(brw->state.validated_bos,
brw->state.validated_bo_count)) {
static GLboolean warned;
intel_batchbuffer_flush(intel->batch);
/* Validate the state after we flushed the batch (which would have
* changed the set of dirty state). If we still fail to
* check_aperture, warn of what's happening, but attempt to continue
* on since it may succeed anyway, and the user would probably rather
* see a failure and a warning than a fallback.
*/
brw_validate_state(brw);
if (!warned &&
dri_bufmgr_check_aperture_space(brw->state.validated_bos,
brw->state.validated_bo_count)) {
warn = GL_TRUE;
warned = GL_TRUE;
}
}
brw_upload_state(brw);
}
brw_emit_prim(brw, &prim[i], hw_prim);
retval = GL_TRUE;
}
if (intel->always_flush_batch)
intel_batchbuffer_flush(intel->batch);
out:
UNLOCK_HARDWARE(intel);
brw_state_cache_check_size(brw);
if (warn)
fprintf(stderr, "i965: Single primitive emit potentially exceeded "
"available aperture space\n");
if (!retval)
DBG("%s failed\n", __FUNCTION__);
return retval;
}
void brw_draw_prims( GLcontext *ctx,
const struct gl_client_array *arrays[],
const struct _mesa_prim *prim,
GLuint nr_prims,
const struct _mesa_index_buffer *ib,
GLboolean index_bounds_valid,
GLuint min_index,
GLuint max_index )
{
GLboolean retval;
if (!vbo_all_varyings_in_vbos(arrays)) {
if (!index_bounds_valid)
vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index);
/* Decide if we want to rebase. If so we end up recursing once
* only into this function.
*/
if (min_index != 0) {
vbo_rebase_prims(ctx, arrays,
prim, nr_prims,
ib, min_index, max_index,
brw_draw_prims );
return;
}
}
/* Make a first attempt at drawing:
*/
retval = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
/* Otherwise, we really are out of memory. Pass the drawing
* command to the software tnl module, which will in turn call
* swrast to do the drawing.
*/
if (!retval) {
_swsetup_Wakeup(ctx);
_tnl_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
}
}
void brw_draw_init( struct brw_context *brw )
{
GLcontext *ctx = &brw->intel.ctx;
struct vbo_context *vbo = vbo_context(ctx);
/* Register our drawing function:
*/
vbo->draw_prims = brw_draw_prims;
}
void brw_draw_destroy( struct brw_context *brw )
{
int i;
if (brw->vb.upload.bo != NULL) {
dri_bo_unreference(brw->vb.upload.bo);
brw->vb.upload.bo = NULL;
}
for (i = 0; i < VERT_ATTRIB_MAX; i++) {
dri_bo_unreference(brw->vb.inputs[i].bo);
brw->vb.inputs[i].bo = NULL;
}
dri_bo_unreference(brw->ib.bo);
brw->ib.bo = NULL;
}

@ -0,0 +1,54 @@
/**************************************************************************
*
* Copyright 2005 Tungsten Graphics, Inc., Cedar Park, Texas.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
#ifndef BRW_DRAW_H
#define BRW_DRAW_H
#include "main/mtypes.h" /* for GLcontext... */
#include "vbo/vbo.h"
struct brw_context;
void brw_draw_prims( GLcontext *ctx,
const struct gl_client_array *arrays[],
const struct _mesa_prim *prims,
GLuint nr_prims,
const struct _mesa_index_buffer *ib,
GLboolean index_bounds_valid,
GLuint min_index,
GLuint max_index );
void brw_draw_init( struct brw_context *brw );
void brw_draw_destroy( struct brw_context *brw );
/* brw_draw_current.c
*/
void brw_init_current_values(GLcontext *ctx,
struct gl_client_array *arrays);
#endif

@ -0,0 +1,742 @@
/**************************************************************************
*
* Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
#include "main/glheader.h"
#include "main/bufferobj.h"
#include "main/context.h"
#include "main/state.h"
#include "main/api_validate.h"
#include "main/enums.h"
#include "brw_draw.h"
#include "brw_defines.h"
#include "brw_context.h"
#include "brw_state.h"
#include "brw_fallback.h"
#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"
#include "intel_tex.h"
static GLuint double_types[5] = {
0,
BRW_SURFACEFORMAT_R64_FLOAT,
BRW_SURFACEFORMAT_R64G64_FLOAT,
BRW_SURFACEFORMAT_R64G64B64_FLOAT,
BRW_SURFACEFORMAT_R64G64B64A64_FLOAT
};
static GLuint float_types[5] = {
0,
BRW_SURFACEFORMAT_R32_FLOAT,
BRW_SURFACEFORMAT_R32G32_FLOAT,
BRW_SURFACEFORMAT_R32G32B32_FLOAT,
BRW_SURFACEFORMAT_R32G32B32A32_FLOAT
};
static GLuint uint_types_norm[5] = {
0,
BRW_SURFACEFORMAT_R32_UNORM,
BRW_SURFACEFORMAT_R32G32_UNORM,
BRW_SURFACEFORMAT_R32G32B32_UNORM,
BRW_SURFACEFORMAT_R32G32B32A32_UNORM
};
static GLuint uint_types_scale[5] = {
0,
BRW_SURFACEFORMAT_R32_USCALED,
BRW_SURFACEFORMAT_R32G32_USCALED,
BRW_SURFACEFORMAT_R32G32B32_USCALED,
BRW_SURFACEFORMAT_R32G32B32A32_USCALED
};
static GLuint int_types_norm[5] = {
0,
BRW_SURFACEFORMAT_R32_SNORM,
BRW_SURFACEFORMAT_R32G32_SNORM,
BRW_SURFACEFORMAT_R32G32B32_SNORM,
BRW_SURFACEFORMAT_R32G32B32A32_SNORM
};
static GLuint int_types_scale[5] = {
0,
BRW_SURFACEFORMAT_R32_SSCALED,
BRW_SURFACEFORMAT_R32G32_SSCALED,
BRW_SURFACEFORMAT_R32G32B32_SSCALED,
BRW_SURFACEFORMAT_R32G32B32A32_SSCALED
};
static GLuint ushort_types_norm[5] = {
0,
BRW_SURFACEFORMAT_R16_UNORM,
BRW_SURFACEFORMAT_R16G16_UNORM,
BRW_SURFACEFORMAT_R16G16B16_UNORM,
BRW_SURFACEFORMAT_R16G16B16A16_UNORM
};
static GLuint ushort_types_scale[5] = {
0,
BRW_SURFACEFORMAT_R16_USCALED,
BRW_SURFACEFORMAT_R16G16_USCALED,
BRW_SURFACEFORMAT_R16G16B16_USCALED,
BRW_SURFACEFORMAT_R16G16B16A16_USCALED
};
static GLuint short_types_norm[5] = {
0,
BRW_SURFACEFORMAT_R16_SNORM,
BRW_SURFACEFORMAT_R16G16_SNORM,
BRW_SURFACEFORMAT_R16G16B16_SNORM,
BRW_SURFACEFORMAT_R16G16B16A16_SNORM
};
static GLuint short_types_scale[5] = {
0,
BRW_SURFACEFORMAT_R16_SSCALED,
BRW_SURFACEFORMAT_R16G16_SSCALED,
BRW_SURFACEFORMAT_R16G16B16_SSCALED,
BRW_SURFACEFORMAT_R16G16B16A16_SSCALED
};
static GLuint ubyte_types_norm[5] = {
0,
BRW_SURFACEFORMAT_R8_UNORM,
BRW_SURFACEFORMAT_R8G8_UNORM,
BRW_SURFACEFORMAT_R8G8B8_UNORM,
BRW_SURFACEFORMAT_R8G8B8A8_UNORM
};
static GLuint ubyte_types_scale[5] = {
0,
BRW_SURFACEFORMAT_R8_USCALED,
BRW_SURFACEFORMAT_R8G8_USCALED,
BRW_SURFACEFORMAT_R8G8B8_USCALED,
BRW_SURFACEFORMAT_R8G8B8A8_USCALED
};
static GLuint byte_types_norm[5] = {
0,
BRW_SURFACEFORMAT_R8_SNORM,
BRW_SURFACEFORMAT_R8G8_SNORM,
BRW_SURFACEFORMAT_R8G8B8_SNORM,
BRW_SURFACEFORMAT_R8G8B8A8_SNORM
};
static GLuint byte_types_scale[5] = {
0,
BRW_SURFACEFORMAT_R8_SSCALED,
BRW_SURFACEFORMAT_R8G8_SSCALED,
BRW_SURFACEFORMAT_R8G8B8_SSCALED,
BRW_SURFACEFORMAT_R8G8B8A8_SSCALED
};
/**
* Given vertex array type/size/format/normalized info, return
* the appropriate hardware surface type.
* Format will be GL_RGBA or possibly GL_BGRA for GLubyte[4] color arrays.
*/
static GLuint get_surface_type( GLenum type, GLuint size,
GLenum format, GLboolean normalized )
{
if (INTEL_DEBUG & DEBUG_VERTS)
_mesa_printf("type %s size %d normalized %d\n",
_mesa_lookup_enum_by_nr(type), size, normalized);
if (normalized) {
switch (type) {
case GL_DOUBLE: return double_types[size];
case GL_FLOAT: return float_types[size];
case GL_INT: return int_types_norm[size];
case GL_SHORT: return short_types_norm[size];
case GL_BYTE: return byte_types_norm[size];
case GL_UNSIGNED_INT: return uint_types_norm[size];
case GL_UNSIGNED_SHORT: return ushort_types_norm[size];
case GL_UNSIGNED_BYTE:
if (format == GL_BGRA) {
/* See GL_EXT_vertex_array_bgra */
assert(size == 4);
return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
}
else {
return ubyte_types_norm[size];
}
default: assert(0); return 0;
}
}
else {
assert(format == GL_RGBA); /* sanity check */
switch (type) {
case GL_DOUBLE: return double_types[size];
case GL_FLOAT: return float_types[size];
case GL_INT: return int_types_scale[size];
case GL_SHORT: return short_types_scale[size];
case GL_BYTE: return byte_types_scale[size];
case GL_UNSIGNED_INT: return uint_types_scale[size];
case GL_UNSIGNED_SHORT: return ushort_types_scale[size];
case GL_UNSIGNED_BYTE: return ubyte_types_scale[size];
default: assert(0); return 0;
}
}
}
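/* For example, a 3-component GL_FLOAT array maps to
* BRW_SURFACEFORMAT_R32G32B32_FLOAT, while a normalized 4-component
* GL_UNSIGNED_BYTE color array with GL_BGRA ordering maps to
* BRW_SURFACEFORMAT_B8G8R8A8_UNORM.
*/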
static GLuint get_size( GLenum type )
{
switch (type) {
case GL_DOUBLE: return sizeof(GLdouble);
case GL_FLOAT: return sizeof(GLfloat);
case GL_INT: return sizeof(GLint);
case GL_SHORT: return sizeof(GLshort);
case GL_BYTE: return sizeof(GLbyte);
case GL_UNSIGNED_INT: return sizeof(GLuint);
case GL_UNSIGNED_SHORT: return sizeof(GLushort);
case GL_UNSIGNED_BYTE: return sizeof(GLubyte);
default: return 0;
}
}
static GLuint get_index_type(GLenum type)
{
switch (type) {
case GL_UNSIGNED_BYTE: return BRW_INDEX_BYTE;
case GL_UNSIGNED_SHORT: return BRW_INDEX_WORD;
case GL_UNSIGNED_INT: return BRW_INDEX_DWORD;
default: assert(0); return 0;
}
}
static void wrap_buffers( struct brw_context *brw,
GLuint size )
{
if (size < BRW_UPLOAD_INIT_SIZE)
size = BRW_UPLOAD_INIT_SIZE;
brw->vb.upload.offset = 0;
if (brw->vb.upload.bo != NULL)
dri_bo_unreference(brw->vb.upload.bo);
brw->vb.upload.bo = dri_bo_alloc(brw->intel.bufmgr, "temporary VBO",
size, 1);
/* Set the internal VBO to no-backing-store.  We only use it as a
* temporary within brw_try_draw_prims() while the lock is held.
*/
/* Don't do this: if we have to re-organize memory we need somewhere
* (under the fake bufmgr) to push this data.
*/
// if (!brw->intel.ttm)
// dri_bo_fake_disable_backing_store(brw->vb.upload.bo, NULL, NULL);
}
static void get_space( struct brw_context *brw,
GLuint size,
dri_bo **bo_return,
GLuint *offset_return )
{
size = ALIGN(size, 64);
if (brw->vb.upload.bo == NULL ||
brw->vb.upload.offset + size > brw->vb.upload.bo->size) {
wrap_buffers(brw, size);
}
assert(*bo_return == NULL);
dri_bo_reference(brw->vb.upload.bo);
*bo_return = brw->vb.upload.bo;
*offset_return = brw->vb.upload.offset;
brw->vb.upload.offset += size;
}
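/* get_space() suballocates from the shared upload buffer: requests are
* rounded up to 64 bytes, a fresh upload BO is allocated once the
* current one would overflow, and the caller gets back a new reference
* plus the offset of its slice.
*/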
static void
copy_array_to_vbo_array( struct brw_context *brw,
struct brw_vertex_element *element,
GLuint dst_stride)
{
struct intel_context *intel = &brw->intel;
GLuint size = element->count * dst_stride;
get_space(brw, size, &element->bo, &element->offset);
if (element->glarray->StrideB == 0) {
assert(element->count == 1);
element->stride = 0;
} else {
element->stride = dst_stride;
}
if (dst_stride == element->glarray->StrideB) {
if (intel->intelScreen->kernel_exec_fencing) {
drm_intel_gem_bo_map_gtt(element->bo);
memcpy((char *)element->bo->virtual + element->offset,
element->glarray->Ptr, size);
drm_intel_gem_bo_unmap_gtt(element->bo);
} else {
dri_bo_subdata(element->bo,
element->offset,
size,
element->glarray->Ptr);
}
} else {
char *dest;
const unsigned char *src = element->glarray->Ptr;
int i;
if (intel->intelScreen->kernel_exec_fencing) {
drm_intel_gem_bo_map_gtt(element->bo);
dest = element->bo->virtual;
dest += element->offset;
for (i = 0; i < element->count; i++) {
memcpy(dest, src, dst_stride);
src += element->glarray->StrideB;
dest += dst_stride;
}
drm_intel_gem_bo_unmap_gtt(element->bo);
} else {
void *data;
data = _mesa_malloc(dst_stride * element->count);
dest = data;
for (i = 0; i < element->count; i++) {
memcpy(dest, src, dst_stride);
src += element->glarray->StrideB;
dest += dst_stride;
}
dri_bo_subdata(element->bo,
element->offset,
size,
data);
_mesa_free(data);
}
}
}
static void brw_prepare_vertices(struct brw_context *brw)
{
GLcontext *ctx = &brw->intel.ctx;
struct intel_context *intel = intel_context(ctx);
GLbitfield vs_inputs = brw->vs.prog_data->inputs_read;
GLuint i;
const unsigned char *ptr = NULL;
GLuint interleave = 0;
unsigned int min_index = brw->vb.min_index;
unsigned int max_index = brw->vb.max_index;
struct brw_vertex_element *upload[VERT_ATTRIB_MAX];
GLuint nr_uploads = 0;
/* First build an array of pointers to ve's in vb.inputs_read
*/
if (0)
_mesa_printf("%s %d..%d\n", __FUNCTION__, min_index, max_index);
/* Accumulate the list of enabled arrays. */
brw->vb.nr_enabled = 0;
while (vs_inputs) {
GLuint i = _mesa_ffsll(vs_inputs) - 1;
struct brw_vertex_element *input = &brw->vb.inputs[i];
vs_inputs &= ~(1 << i);
brw->vb.enabled[brw->vb.nr_enabled++] = input;
}
/* XXX: In the rare cases where this happens we fall back all
* the way to software rasterization, although a tnl fallback
* would be sufficient. I don't know of *any* real world
* cases with > 17 vertex attributes enabled, so it probably
* isn't an issue at this point.
*/
if (brw->vb.nr_enabled >= BRW_VEP_MAX) {
intel->Fallback = 1;
return;
}
for (i = 0; i < brw->vb.nr_enabled; i++) {
struct brw_vertex_element *input = brw->vb.enabled[i];
input->element_size = get_size(input->glarray->Type) * input->glarray->Size;
if (_mesa_is_bufferobj(input->glarray->BufferObj)) {
struct intel_buffer_object *intel_buffer =
intel_buffer_object(input->glarray->BufferObj);
/* Named buffer object: Just reference its contents directly. */
dri_bo_unreference(input->bo);
input->bo = intel_bufferobj_buffer(intel, intel_buffer,
INTEL_READ);
dri_bo_reference(input->bo);
input->offset = (unsigned long)input->glarray->Ptr;
input->stride = input->glarray->StrideB;
input->count = input->glarray->_MaxElement;
/* This is a common place to reach if the user mistakenly supplies
* a pointer in place of a VBO offset. If we just let it go through,
* we may end up dereferencing a pointer beyond the bounds of the
* GTT. We would hope that the VBO's max_index would save us, but
* Mesa appears to hand us min/max values not clipped to the
* array object's _MaxElement, and _MaxElement frequently appears
* to be wrong anyway.
*
* The VBO spec allows application termination in this case, and it's
* probably a service to the poor programmer to do so rather than
* trying to just not render.
*/
assert(input->offset < input->bo->size);
} else {
input->count = input->glarray->StrideB ? max_index + 1 - min_index : 1;
if (input->bo != NULL) {
/* Already-uploaded vertex data is present from a previous
* prepare_vertices, but we had to re-validate state due to
* check_aperture failing and a new batch being produced.
*/
continue;
}
/* Queue the buffer object up to be uploaded in the next pass,
* when we've decided if we're doing interleaved or not.
*/
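/* Interleave detection: the position array's stride seeds 'interleave',
* and any other array whose stride differs, or whose pointer doesn't
* fall within one stride of the position pointer, forces the arrays to
* be uploaded separately.
*/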
if (input->attrib == VERT_ATTRIB_POS) {
/* Position array not properly enabled:
*/
if (input->glarray->StrideB == 0) {
intel->Fallback = 1;
return;
}
interleave = input->glarray->StrideB;
ptr = input->glarray->Ptr;
}
else if (interleave != input->glarray->StrideB ||
(const unsigned char *)input->glarray->Ptr - ptr < 0 ||
(const unsigned char *)input->glarray->Ptr - ptr > interleave)
{
interleave = 0;
}
upload[nr_uploads++] = input;
/* We rebase drawing to start at element zero only when
* varyings are not in vbos, which means we can end up
* uploading non-varying arrays (stride != 0) when min_index
* is zero. This doesn't matter as the amount to upload is
* the same for these arrays whether the draw call is rebased
* or not - we just have to upload the one element.
*/
assert(min_index == 0 || input->glarray->StrideB == 0);
}
}
/* Handle any arrays to be uploaded. */
if (nr_uploads > 1 && interleave && interleave <= 256) {
/* All uploads are interleaved, so upload the arrays together as
* interleaved. First, upload the contents and set up upload[0].
*/
copy_array_to_vbo_array(brw, upload[0], interleave);
for (i = 1; i < nr_uploads; i++) {
/* Then, just point upload[i] at upload[0]'s buffer. */
upload[i]->stride = interleave;
upload[i]->offset = upload[0]->offset +
((const unsigned char *)upload[i]->glarray->Ptr - ptr);
upload[i]->bo = upload[0]->bo;
dri_bo_reference(upload[i]->bo);
}
}
else {
/* Upload non-interleaved arrays */
for (i = 0; i < nr_uploads; i++) {
copy_array_to_vbo_array(brw, upload[i], upload[i]->element_size);
}
}
brw_prepare_query_begin(brw);
for (i = 0; i < brw->vb.nr_enabled; i++) {
struct brw_vertex_element *input = brw->vb.enabled[i];
brw_add_validated_bo(brw, input->bo);
}
}
static void brw_emit_vertices(struct brw_context *brw)
{
GLcontext *ctx = &brw->intel.ctx;
struct intel_context *intel = intel_context(ctx);
GLuint i;
brw_emit_query_begin(brw);
/* If the VS doesn't read any inputs (calculating vertex position from
* a state variable for some reason, for example), emit a single pad
* VERTEX_ELEMENT struct and bail.
*
* The stale vertex buffer state stays in place, but those buffers do
* nothing unless a VE loads from them.
*/
if (brw->vb.nr_enabled == 0) {
BEGIN_BATCH(3, IGNORE_CLIPRECTS);
OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | 1);
OUT_BATCH((0 << BRW_VE0_INDEX_SHIFT) |
BRW_VE0_VALID |
(BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT) |
(0 << BRW_VE0_SRC_OFFSET_SHIFT));
OUT_BATCH((BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_0_SHIFT) |
(BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
(BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
(BRW_VE1_COMPONENT_STORE_1_FLT << BRW_VE1_COMPONENT_3_SHIFT));
ADVANCE_BATCH();
return;
}
/* Now emit VB and VEP state packets.
*
* This still defines a hardware VB for each input, even if they
* are interleaved or from the same VBO. TBD if this makes a
* performance difference.
*/
BEGIN_BATCH(1 + brw->vb.nr_enabled * 4, IGNORE_CLIPRECTS);
OUT_BATCH((CMD_VERTEX_BUFFER << 16) |
((1 + brw->vb.nr_enabled * 4) - 2));
for (i = 0; i < brw->vb.nr_enabled; i++) {
struct brw_vertex_element *input = brw->vb.enabled[i];
OUT_BATCH((i << BRW_VB0_INDEX_SHIFT) |
BRW_VB0_ACCESS_VERTEXDATA |
(input->stride << BRW_VB0_PITCH_SHIFT));
OUT_RELOC(input->bo,
I915_GEM_DOMAIN_VERTEX, 0,
input->offset);
if (BRW_IS_IGDNG(brw)) {
if (input->stride) {
OUT_RELOC(input->bo,
I915_GEM_DOMAIN_VERTEX, 0,
input->offset + input->stride * input->count - 1);
} else {
assert(input->count == 1);
OUT_RELOC(input->bo,
I915_GEM_DOMAIN_VERTEX, 0,
input->offset + input->element_size - 1);
}
} else
OUT_BATCH(input->stride ? input->count : 0);
OUT_BATCH(0); /* Instance data step rate */
}
ADVANCE_BATCH();
BEGIN_BATCH(1 + brw->vb.nr_enabled * 2, IGNORE_CLIPRECTS);
OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | ((1 + brw->vb.nr_enabled * 2) - 2));
for (i = 0; i < brw->vb.nr_enabled; i++) {
struct brw_vertex_element *input = brw->vb.enabled[i];
uint32_t format = get_surface_type(input->glarray->Type,
input->glarray->Size,
input->glarray->Format,
input->glarray->Normalized);
uint32_t comp0 = BRW_VE1_COMPONENT_STORE_SRC;
uint32_t comp1 = BRW_VE1_COMPONENT_STORE_SRC;
uint32_t comp2 = BRW_VE1_COMPONENT_STORE_SRC;
uint32_t comp3 = BRW_VE1_COMPONENT_STORE_SRC;
switch (input->glarray->Size) {
case 0: comp0 = BRW_VE1_COMPONENT_STORE_0;
case 1: comp1 = BRW_VE1_COMPONENT_STORE_0;
case 2: comp2 = BRW_VE1_COMPONENT_STORE_0;
case 3: comp3 = BRW_VE1_COMPONENT_STORE_1_FLT;
break;
}
OUT_BATCH((i << BRW_VE0_INDEX_SHIFT) |
BRW_VE0_VALID |
(format << BRW_VE0_FORMAT_SHIFT) |
(0 << BRW_VE0_SRC_OFFSET_SHIFT));
if (BRW_IS_IGDNG(brw))
OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
(comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
(comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
(comp3 << BRW_VE1_COMPONENT_3_SHIFT));
else
OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
(comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
(comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
(comp3 << BRW_VE1_COMPONENT_3_SHIFT) |
((i * 4) << BRW_VE1_DST_OFFSET_SHIFT));
}
ADVANCE_BATCH();
}
const struct brw_tracked_state brw_vertices = {
.dirty = {
.mesa = 0,
.brw = BRW_NEW_BATCH | BRW_NEW_VERTICES,
.cache = 0,
},
.prepare = brw_prepare_vertices,
.emit = brw_emit_vertices,
};
static void brw_prepare_indices(struct brw_context *brw)
{
GLcontext *ctx = &brw->intel.ctx;
struct intel_context *intel = &brw->intel;
const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
GLuint ib_size;
dri_bo *bo = NULL;
struct gl_buffer_object *bufferobj;
GLuint offset;
GLuint ib_type_size;
if (index_buffer == NULL)
return;
ib_type_size = get_size(index_buffer->type);
ib_size = ib_type_size * index_buffer->count;
bufferobj = index_buffer->obj;
/* Turn into a proper VBO:
*/
if (!_mesa_is_bufferobj(bufferobj)) {
brw->ib.start_vertex_offset = 0;
/* Get new bufferobj, offset:
*/
get_space(brw, ib_size, &bo, &offset);
/* Straight upload
*/
if (intel->intelScreen->kernel_exec_fencing) {
drm_intel_gem_bo_map_gtt(bo);
memcpy((char *)bo->virtual + offset, index_buffer->ptr, ib_size);
drm_intel_gem_bo_unmap_gtt(bo);
} else {
dri_bo_subdata(bo, offset, ib_size, index_buffer->ptr);
}
} else {
offset = (GLuint) (unsigned long) index_buffer->ptr;
brw->ib.start_vertex_offset = 0;
/* If the index buffer isn't aligned to its element size, we have to
* rebase it into a temporary.
*/
if ((get_size(index_buffer->type) - 1) & offset) {
GLubyte *map = ctx->Driver.MapBuffer(ctx,
GL_ELEMENT_ARRAY_BUFFER_ARB,
GL_DYNAMIC_DRAW_ARB,
bufferobj);
map += offset;
get_space(brw, ib_size, &bo, &offset);
dri_bo_subdata(bo, offset, ib_size, map);
ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER_ARB, bufferobj);
} else {
bo = intel_bufferobj_buffer(intel, intel_buffer_object(bufferobj),
INTEL_READ);
dri_bo_reference(bo);
/* Use CMD_3D_PRIM's start_vertex_offset to avoid re-uploading
* the index buffer state when we're just moving the start index
* of our drawing.
*/
brw->ib.start_vertex_offset = offset / ib_type_size;
offset = 0;
ib_size = bo->size;
}
}
if (brw->ib.bo != bo ||
brw->ib.offset != offset ||
brw->ib.size != ib_size)
{
drm_intel_bo_unreference(brw->ib.bo);
brw->ib.bo = bo;
brw->ib.offset = offset;
brw->ib.size = ib_size;
brw->state.dirty.brw |= BRW_NEW_INDEX_BUFFER;
} else {
drm_intel_bo_unreference(bo);
}
brw_add_validated_bo(brw, brw->ib.bo);
}
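/* For example, GL_UNSIGNED_SHORT indices at byte offset 6 within a VBO
* are already element-aligned, so the buffer is referenced in place and
* start_vertex_offset becomes 3; the same indices at byte offset 5
* would instead be copied into an aligned temporary upload buffer.
*/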
const struct brw_tracked_state brw_indices = {
.dirty = {
.mesa = 0,
.brw = BRW_NEW_INDICES,
.cache = 0,
},
.prepare = brw_prepare_indices,
};
static void brw_emit_index_buffer(struct brw_context *brw)
{
struct intel_context *intel = &brw->intel;
const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
if (index_buffer == NULL)
return;
/* Emit the indexbuffer packet:
*/
{
struct brw_indexbuffer ib;
memset(&ib, 0, sizeof(ib));
ib.header.bits.opcode = CMD_INDEX_BUFFER;
ib.header.bits.length = sizeof(ib)/4 - 2;
ib.header.bits.index_format = get_index_type(index_buffer->type);
ib.header.bits.cut_index_enable = 0;
BEGIN_BATCH(4, IGNORE_CLIPRECTS);
OUT_BATCH( ib.header.dword );
OUT_RELOC(brw->ib.bo,
I915_GEM_DOMAIN_VERTEX, 0,
brw->ib.offset);
OUT_RELOC(brw->ib.bo,
I915_GEM_DOMAIN_VERTEX, 0,
brw->ib.offset + brw->ib.size - 1);
OUT_BATCH( 0 );
ADVANCE_BATCH();
}
}
const struct brw_tracked_state brw_index_buffer = {
.dirty = {
.mesa = 0,
.brw = BRW_NEW_BATCH | BRW_NEW_INDEX_BUFFER,
.cache = 0,
},
.emit = brw_emit_index_buffer,
};

@ -0,0 +1,254 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "brw_context.h"
#include "brw_defines.h"
#include "brw_eu.h"
/* How does predicate control work when execution_size != 8? Do I
* need to test/set for 0xffff when execution_size is 16?
*/
void brw_set_predicate_control_flag_value( struct brw_compile *p, GLuint value )
{
p->current->header.predicate_control = BRW_PREDICATE_NONE;
if (value != 0xff) {
if (value != p->flag_value) {
brw_push_insn_state(p);
brw_MOV(p, brw_flag_reg(), brw_imm_uw(value));
p->flag_value = value;
brw_pop_insn_state(p);
}
p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
}
}
void brw_set_predicate_control( struct brw_compile *p, GLuint pc )
{
p->current->header.predicate_control = pc;
}
void brw_set_conditionalmod( struct brw_compile *p, GLuint conditional )
{
p->current->header.destreg__conditionalmod = conditional;
}
void brw_set_access_mode( struct brw_compile *p, GLuint access_mode )
{
p->current->header.access_mode = access_mode;
}
void brw_set_compression_control( struct brw_compile *p, GLboolean compression_control )
{
p->current->header.compression_control = compression_control;
}
void brw_set_mask_control( struct brw_compile *p, GLuint value )
{
p->current->header.mask_control = value;
}
void brw_set_saturate( struct brw_compile *p, GLuint value )
{
p->current->header.saturate = value;
}
void brw_push_insn_state( struct brw_compile *p )
{
assert(p->current != &p->stack[BRW_EU_MAX_INSN_STACK-1]);
memcpy(p->current+1, p->current, sizeof(struct brw_instruction));
p->current++;
}
void brw_pop_insn_state( struct brw_compile *p )
{
assert(p->current != p->stack);
p->current--;
}
/***********************************************************************
*/
void brw_init_compile( struct brw_context *brw, struct brw_compile *p )
{
p->brw = brw;
p->nr_insn = 0;
p->current = p->stack;
memset(p->current, 0, sizeof(p->current[0]));
/* Some defaults?
*/
brw_set_mask_control(p, BRW_MASK_ENABLE); /* what does this do? */
brw_set_saturate(p, 0);
brw_set_compression_control(p, BRW_COMPRESSION_NONE);
brw_set_predicate_control_flag_value(p, 0xff);
}
const GLuint *brw_get_program( struct brw_compile *p,
GLuint *sz )
{
GLuint i;
for (i = 0; i < 8; i++)
brw_NOP(p);
*sz = p->nr_insn * sizeof(struct brw_instruction);
return (const GLuint *)p->store;
}
/**
* Subroutine calls require special attention.
* Mesa instructions may be expanded into multiple hardware instructions
* so the prog_instruction::BranchTarget field can't be used as an index
* into the hardware instructions.
*
* The BranchTarget field isn't needed, however. Mesa's GLSL compiler
* emits CAL and BGNSUB instructions with labels that can be used to map
* subroutine calls to actual subroutine code blocks.
*
* The structures and function here implement patching of CAL instructions
* so they jump to the right subroutine code...
*/
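/* Roughly, the expected flow is: the code generator calls
* brw_save_label() for each OPCODE_BGNSUB and brw_save_call() for each
* OPCODE_CAL as it emits instructions, then calls brw_resolve_cals()
* at the end so every recorded CAL is patched to jump to its matching
* label.
*/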
/**
* For each OPCODE_BGNSUB we create one of these.
*/
struct brw_glsl_label
{
const char *name; /**< the label string */
GLuint position; /**< the position of the brw instruction for this label */
struct brw_glsl_label *next; /**< next in linked list */
};
/**
* For each OPCODE_CAL we create one of these.
*/
struct brw_glsl_call
{
GLuint call_inst_pos; /**< location of the CAL instruction */
const char *sub_name; /**< name of subroutine to call */
struct brw_glsl_call *next; /**< next in linked list */
};
/**
* Called for each OPCODE_BGNSUB.
*/
void
brw_save_label(struct brw_compile *c, const char *name, GLuint position)
{
struct brw_glsl_label *label = CALLOC_STRUCT(brw_glsl_label);
label->name = name;
label->position = position;
label->next = c->first_label;
c->first_label = label;
}
/**
* Called for each OPCODE_CAL.
*/
void
brw_save_call(struct brw_compile *c, const char *name, GLuint call_pos)
{
struct brw_glsl_call *call = CALLOC_STRUCT(brw_glsl_call);
call->call_inst_pos = call_pos;
call->sub_name = name;
call->next = c->first_call;
c->first_call = call;
}
/**
* Lookup a label, return label's position/offset.
*/
static GLuint
brw_lookup_label(struct brw_compile *c, const char *name)
{
const struct brw_glsl_label *label;
for (label = c->first_label; label; label = label->next) {
if (strcmp(name, label->name) == 0) {
return label->position;
}
}
abort(); /* should never happen */
return ~0;
}
/**
* When we're done generating code, this function is called to resolve
* subroutine calls.
*/
void
brw_resolve_cals(struct brw_compile *c)
{
const struct brw_glsl_call *call;
for (call = c->first_call; call; call = call->next) {
const GLuint sub_loc = brw_lookup_label(c, call->sub_name);
struct brw_instruction *brw_call_inst = &c->store[call->call_inst_pos];
struct brw_instruction *brw_sub_inst = &c->store[sub_loc];
GLint offset = brw_sub_inst - brw_call_inst;
/* Patch the CAL instruction to jump to the subroutine; the offset is
* converted from instructions to bytes (16 bytes per brw_instruction).
*/
brw_set_src1(brw_call_inst, brw_imm_d(offset * 16));
}
/* free linked list of calls */
{
struct brw_glsl_call *call, *next;
for (call = c->first_call; call; call = next) {
next = call->next;
_mesa_free(call);
}
c->first_call = NULL;
}
/* free linked list of labels */
{
struct brw_glsl_label *label, *next;
for (label = c->first_label; label; label = next) {
next = label->next;
_mesa_free(label);
}
c->first_label = NULL;
}
}

@ -0,0 +1,968 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#ifndef BRW_EU_H
#define BRW_EU_H
#include "brw_structs.h"
#include "brw_defines.h"
#include "shader/prog_instruction.h"
#define BRW_SWIZZLE4(a,b,c,d) (((a)<<0) | ((b)<<2) | ((c)<<4) | ((d)<<6))
#define BRW_GET_SWZ(swz, idx) (((swz) >> ((idx)*2)) & 0x3)
#define BRW_SWIZZLE_NOOP BRW_SWIZZLE4(0,1,2,3)
#define BRW_SWIZZLE_XYZW BRW_SWIZZLE4(0,1,2,3)
#define BRW_SWIZZLE_XXXX BRW_SWIZZLE4(0,0,0,0)
#define BRW_SWIZZLE_XYXY BRW_SWIZZLE4(0,1,0,1)
#define REG_SIZE (8*4)
/* These aren't hardware structs, just something useful for us to pass around:
*
* Align1 operation has a lot of control over input ranges. Used in
* WM programs to implement shaders decomposed into "channel serial"
* or "structure of array" form:
*/
struct brw_reg
{
GLuint type:4;
GLuint file:2;
GLuint nr:8;
GLuint subnr:5; /* :1 in align16 */
GLuint negate:1; /* source only */
GLuint abs:1; /* source only */
GLuint vstride:4; /* source only */
GLuint width:3; /* src only, align1 only */
GLuint hstride:2; /* align1 only */
GLuint address_mode:1; /* relative addressing, hopefully! */
GLuint pad0:1;
union {
struct {
GLuint swizzle:8; /* src only, align16 only */
GLuint writemask:4; /* dest only, align16 only */
GLint indirect_offset:10; /* relative addressing offset */
GLuint pad1:10; /* two dwords total */
} bits;
GLfloat f;
GLint d;
GLuint ud;
} dw1;
};
struct brw_indirect {
GLuint addr_subnr:4;
GLint addr_offset:10;
GLuint pad:18;
};
struct brw_glsl_label;
struct brw_glsl_call;
#define BRW_EU_MAX_INSN_STACK 5
#define BRW_EU_MAX_INSN 10000
struct brw_compile {
struct brw_instruction store[BRW_EU_MAX_INSN];
GLuint nr_insn;
/* Allow clients to push/pop instruction state:
*/
struct brw_instruction stack[BRW_EU_MAX_INSN_STACK];
struct brw_instruction *current;
GLuint flag_value;
GLboolean single_program_flow;
struct brw_context *brw;
struct brw_glsl_label *first_label; /**< linked list of labels */
struct brw_glsl_call *first_call; /**< linked list of CALs */
};
void
brw_save_label(struct brw_compile *c, const char *name, GLuint position);
void
brw_save_call(struct brw_compile *c, const char *name, GLuint call_pos);
void
brw_resolve_cals(struct brw_compile *c);
static INLINE int type_sz( GLuint type )
{
switch( type ) {
case BRW_REGISTER_TYPE_UD:
case BRW_REGISTER_TYPE_D:
case BRW_REGISTER_TYPE_F:
return 4;
case BRW_REGISTER_TYPE_HF:
case BRW_REGISTER_TYPE_UW:
case BRW_REGISTER_TYPE_W:
return 2;
case BRW_REGISTER_TYPE_UB:
case BRW_REGISTER_TYPE_B:
return 1;
default:
return 0;
}
}
/**
* Construct a brw_reg.
* \param file one of the BRW_x_REGISTER_FILE values
* \param nr register number/index
* \param subnr register sub number
* \param type one of BRW_REGISTER_TYPE_x
* \param vstride one of BRW_VERTICAL_STRIDE_x
* \param width one of BRW_WIDTH_x
* \param hstride one of BRW_HORIZONTAL_STRIDE_x
* \param swizzle one of BRW_SWIZZLE_x
* \param writemask WRITEMASK_X/Y/Z/W bitfield
*/
static INLINE struct brw_reg brw_reg( GLuint file,
GLuint nr,
GLuint subnr,
GLuint type,
GLuint vstride,
GLuint width,
GLuint hstride,
GLuint swizzle,
GLuint writemask )
{
struct brw_reg reg;
if (file == BRW_GENERAL_REGISTER_FILE)
assert(nr < BRW_MAX_GRF);
else if (file == BRW_MESSAGE_REGISTER_FILE)
assert(nr < BRW_MAX_MRF);
else if (file == BRW_ARCHITECTURE_REGISTER_FILE)
assert(nr <= BRW_ARF_IP);
reg.type = type;
reg.file = file;
reg.nr = nr;
reg.subnr = subnr * type_sz(type);
reg.negate = 0;
reg.abs = 0;
reg.vstride = vstride;
reg.width = width;
reg.hstride = hstride;
reg.address_mode = BRW_ADDRESS_DIRECT;
reg.pad0 = 0;
/* Could do better: If the reg is r5.3<0;1,0>, we probably want to
* set swizzle and writemask to W, as the lower bits of subnr will
* be lost when converted to align16. This is probably too much to
* keep track of as you'd want it adjusted by suboffset(), etc.
* Perhaps fix up when converting to align16?
*/
reg.dw1.bits.swizzle = swizzle;
reg.dw1.bits.writemask = writemask;
reg.dw1.bits.indirect_offset = 0;
reg.dw1.bits.pad1 = 0;
return reg;
}
/** Construct float[16] register */
static INLINE struct brw_reg brw_vec16_reg( GLuint file,
GLuint nr,
GLuint subnr )
{
return brw_reg(file,
nr,
subnr,
BRW_REGISTER_TYPE_F,
BRW_VERTICAL_STRIDE_16,
BRW_WIDTH_16,
BRW_HORIZONTAL_STRIDE_1,
BRW_SWIZZLE_XYZW,
WRITEMASK_XYZW);
}
/** Construct float[8] register */
static INLINE struct brw_reg brw_vec8_reg( GLuint file,
GLuint nr,
GLuint subnr )
{
return brw_reg(file,
nr,
subnr,
BRW_REGISTER_TYPE_F,
BRW_VERTICAL_STRIDE_8,
BRW_WIDTH_8,
BRW_HORIZONTAL_STRIDE_1,
BRW_SWIZZLE_XYZW,
WRITEMASK_XYZW);
}
/** Construct float[4] register */
static INLINE struct brw_reg brw_vec4_reg( GLuint file,
GLuint nr,
GLuint subnr )
{
return brw_reg(file,
nr,
subnr,
BRW_REGISTER_TYPE_F,
BRW_VERTICAL_STRIDE_4,
BRW_WIDTH_4,
BRW_HORIZONTAL_STRIDE_1,
BRW_SWIZZLE_XYZW,
WRITEMASK_XYZW);
}
/** Construct float[2] register */
static INLINE struct brw_reg brw_vec2_reg( GLuint file,
GLuint nr,
GLuint subnr )
{
return brw_reg(file,
nr,
subnr,
BRW_REGISTER_TYPE_F,
BRW_VERTICAL_STRIDE_2,
BRW_WIDTH_2,
BRW_HORIZONTAL_STRIDE_1,
BRW_SWIZZLE_XYXY,
WRITEMASK_XY);
}
/** Construct float[1] register */
static INLINE struct brw_reg brw_vec1_reg( GLuint file,
GLuint nr,
GLuint subnr )
{
return brw_reg(file,
nr,
subnr,
BRW_REGISTER_TYPE_F,
BRW_VERTICAL_STRIDE_0,
BRW_WIDTH_1,
BRW_HORIZONTAL_STRIDE_0,
BRW_SWIZZLE_XXXX,
WRITEMASK_X);
}
static INLINE struct brw_reg retype( struct brw_reg reg,
GLuint type )
{
reg.type = type;
return reg;
}
static INLINE struct brw_reg suboffset( struct brw_reg reg,
GLuint delta )
{
reg.subnr += delta * type_sz(reg.type);
return reg;
}
static INLINE struct brw_reg offset( struct brw_reg reg,
GLuint delta )
{
reg.nr += delta;
return reg;
}
static INLINE struct brw_reg byte_offset( struct brw_reg reg,
GLuint bytes )
{
GLuint newoffset = reg.nr * REG_SIZE + reg.subnr + bytes;
reg.nr = newoffset / REG_SIZE;
reg.subnr = newoffset % REG_SIZE;
return reg;
}
/** Construct unsigned word[16] register */
static INLINE struct brw_reg brw_uw16_reg( GLuint file,
GLuint nr,
GLuint subnr )
{
return suboffset(retype(brw_vec16_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr);
}
/** Construct unsigned word[8] register */
static INLINE struct brw_reg brw_uw8_reg( GLuint file,
GLuint nr,
GLuint subnr )
{
return suboffset(retype(brw_vec8_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr);
}
/** Construct unsigned word[1] register */
static INLINE struct brw_reg brw_uw1_reg( GLuint file,
GLuint nr,
GLuint subnr )
{
return suboffset(retype(brw_vec1_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr);
}
static INLINE struct brw_reg brw_imm_reg( GLuint type )
{
return brw_reg( BRW_IMMEDIATE_VALUE,
0,
0,
type,
BRW_VERTICAL_STRIDE_0,
BRW_WIDTH_1,
BRW_HORIZONTAL_STRIDE_0,
0,
0);
}
/** Construct float immediate register */
static INLINE struct brw_reg brw_imm_f( GLfloat f )
{
struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_F);
imm.dw1.f = f;
return imm;
}
/** Construct integer immediate register */
static INLINE struct brw_reg brw_imm_d( GLint d )
{
struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_D);
imm.dw1.d = d;
return imm;
}
/** Construct uint immediate register */
static INLINE struct brw_reg brw_imm_ud( GLuint ud )
{
struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UD);
imm.dw1.ud = ud;
return imm;
}
/** Construct ushort immediate register */
static INLINE struct brw_reg brw_imm_uw( GLushort uw )
{
struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UW);
imm.dw1.ud = uw | (uw << 16);
return imm;
}
/** Construct short immediate register */
static INLINE struct brw_reg brw_imm_w( GLshort w )
{
struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_W);
imm.dw1.d = w | (w << 16);
return imm;
}
/* brw_imm_b and brw_imm_ub aren't supported by hardware - the type
* numbers alias with _V and _VF below:
*/
/** Construct vector of eight signed half-byte values */
static INLINE struct brw_reg brw_imm_v( GLuint v )
{
struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_V);
imm.vstride = BRW_VERTICAL_STRIDE_0;
imm.width = BRW_WIDTH_8;
imm.hstride = BRW_HORIZONTAL_STRIDE_1;
imm.dw1.ud = v;
return imm;
}
/** Construct vector of four 8-bit float values */
static INLINE struct brw_reg brw_imm_vf( GLuint v )
{
struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_VF);
imm.vstride = BRW_VERTICAL_STRIDE_0;
imm.width = BRW_WIDTH_4;
imm.hstride = BRW_HORIZONTAL_STRIDE_1;
imm.dw1.ud = v;
return imm;
}
#define VF_ZERO 0x0
#define VF_ONE 0x30
#define VF_NEG (1<<7)
static INLINE struct brw_reg brw_imm_vf4( GLuint v0,
GLuint v1,
GLuint v2,
GLuint v3)
{
struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_VF);
imm.vstride = BRW_VERTICAL_STRIDE_0;
imm.width = BRW_WIDTH_4;
imm.hstride = BRW_HORIZONTAL_STRIDE_1;
imm.dw1.ud = ((v0 << 0) |
(v1 << 8) |
(v2 << 16) |
(v3 << 24));
return imm;
}
static INLINE struct brw_reg brw_address( struct brw_reg reg )
{
return brw_imm_uw(reg.nr * REG_SIZE + reg.subnr);
}
/** Construct float[1] general-purpose register */
static INLINE struct brw_reg brw_vec1_grf( GLuint nr, GLuint subnr )
{
return brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
}
/** Construct float[2] general-purpose register */
static INLINE struct brw_reg brw_vec2_grf( GLuint nr, GLuint subnr )
{
return brw_vec2_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
}
/** Construct float[4] general-purpose register */
static INLINE struct brw_reg brw_vec4_grf( GLuint nr, GLuint subnr )
{
return brw_vec4_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
}
/** Construct float[8] general-purpose register */
static INLINE struct brw_reg brw_vec8_grf( GLuint nr, GLuint subnr )
{
return brw_vec8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
}
static INLINE struct brw_reg brw_uw8_grf( GLuint nr, GLuint subnr )
{
return brw_uw8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
}
static INLINE struct brw_reg brw_uw16_grf( GLuint nr, GLuint subnr )
{
return brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
}
/** Construct null register (usually used for setting condition codes) */
static INLINE struct brw_reg brw_null_reg( void )
{
return brw_vec8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
BRW_ARF_NULL,
0);
}
static INLINE struct brw_reg brw_address_reg( GLuint subnr )
{
return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
BRW_ARF_ADDRESS,
subnr);
}
/* If/else instructions break in align16 mode if writemask & swizzle
* aren't xyzw. This goes against the convention for other scalar
* regs:
*/
static INLINE struct brw_reg brw_ip_reg( void )
{
return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
BRW_ARF_IP,
0,
BRW_REGISTER_TYPE_UD,
BRW_VERTICAL_STRIDE_4, /* ? */
BRW_WIDTH_1,
BRW_HORIZONTAL_STRIDE_0,
BRW_SWIZZLE_XYZW, /* NOTE! */
WRITEMASK_XYZW); /* NOTE! */
}
static INLINE struct brw_reg brw_acc_reg( void )
{
return brw_vec8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
BRW_ARF_ACCUMULATOR,
0);
}
static INLINE struct brw_reg brw_flag_reg( void )
{
return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
BRW_ARF_FLAG,
0);
}
static INLINE struct brw_reg brw_mask_reg( GLuint subnr )
{
return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
BRW_ARF_MASK,
subnr);
}
static INLINE struct brw_reg brw_message_reg( GLuint nr )
{
assert(nr < BRW_MAX_MRF);
return brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE,
nr,
0);
}
/* This is almost always called with a numeric constant argument, so
* make things easy to evaluate at compile time:
*/
static INLINE GLuint cvt( GLuint val )
{
switch (val) {
case 0: return 0;
case 1: return 1;
case 2: return 2;
case 4: return 3;
case 8: return 4;
case 16: return 5;
case 32: return 6;
}
return 0;
}
static INLINE struct brw_reg stride( struct brw_reg reg,
GLuint vstride,
GLuint width,
GLuint hstride )
{
reg.vstride = cvt(vstride);
reg.width = cvt(width) - 1;
reg.hstride = cvt(hstride);
return reg;
}
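/* Example: vec8() below is stride(reg, 8, 8, 1), i.e. the usual
* <8;8,1> region.  cvt() maps the element counts 0,1,2,4,8,16,32 onto
* the 0..6 hardware encodings, and the width field is stored with a
* bias of one (cvt(width) - 1).
*/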
static INLINE struct brw_reg vec16( struct brw_reg reg )
{
return stride(reg, 16,16,1);
}
static INLINE struct brw_reg vec8( struct brw_reg reg )
{
return stride(reg, 8,8,1);
}
static INLINE struct brw_reg vec4( struct brw_reg reg )
{
return stride(reg, 4,4,1);
}
static INLINE struct brw_reg vec2( struct brw_reg reg )
{
return stride(reg, 2,2,1);
}
static INLINE struct brw_reg vec1( struct brw_reg reg )
{
return stride(reg, 0,1,0);
}
static INLINE struct brw_reg get_element( struct brw_reg reg, GLuint elt )
{
return vec1(suboffset(reg, elt));
}
static INLINE struct brw_reg get_element_ud( struct brw_reg reg, GLuint elt )
{
return vec1(suboffset(retype(reg, BRW_REGISTER_TYPE_UD), elt));
}
static INLINE struct brw_reg brw_swizzle( struct brw_reg reg,
GLuint x,
GLuint y,
GLuint z,
GLuint w)
{
reg.dw1.bits.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(reg.dw1.bits.swizzle, x),
BRW_GET_SWZ(reg.dw1.bits.swizzle, y),
BRW_GET_SWZ(reg.dw1.bits.swizzle, z),
BRW_GET_SWZ(reg.dw1.bits.swizzle, w));
return reg;
}
static INLINE struct brw_reg brw_swizzle1( struct brw_reg reg,
GLuint x )
{
return brw_swizzle(reg, x, x, x, x);
}
static INLINE struct brw_reg brw_writemask( struct brw_reg reg,
GLuint mask )
{
reg.dw1.bits.writemask &= mask;
return reg;
}
static INLINE struct brw_reg brw_set_writemask( struct brw_reg reg,
GLuint mask )
{
reg.dw1.bits.writemask = mask;
return reg;
}
static INLINE struct brw_reg negate( struct brw_reg reg )
{
reg.negate ^= 1;
return reg;
}
static INLINE struct brw_reg brw_abs( struct brw_reg reg )
{
reg.abs = 1;
return reg;
}
/***********************************************************************
*/
static INLINE struct brw_reg brw_vec4_indirect( GLuint subnr,
GLint offset )
{
struct brw_reg reg = brw_vec4_grf(0, 0);
reg.subnr = subnr;
reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER;
reg.dw1.bits.indirect_offset = offset;
return reg;
}
static INLINE struct brw_reg brw_vec1_indirect( GLuint subnr,
GLint offset )
{
struct brw_reg reg = brw_vec1_grf(0, 0);
reg.subnr = subnr;
reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER;
reg.dw1.bits.indirect_offset = offset;
return reg;
}
static INLINE struct brw_reg deref_4f(struct brw_indirect ptr, GLint offset)
{
return brw_vec4_indirect(ptr.addr_subnr, ptr.addr_offset + offset);
}
static INLINE struct brw_reg deref_1f(struct brw_indirect ptr, GLint offset)
{
return brw_vec1_indirect(ptr.addr_subnr, ptr.addr_offset + offset);
}
static INLINE struct brw_reg deref_4b(struct brw_indirect ptr, GLint offset)
{
return retype(deref_4f(ptr, offset), BRW_REGISTER_TYPE_B);
}
static INLINE struct brw_reg deref_1uw(struct brw_indirect ptr, GLint offset)
{
return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_UW);
}
static INLINE struct brw_reg deref_1d(struct brw_indirect ptr, GLint offset)
{
return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_D);
}
static INLINE struct brw_reg deref_1ud(struct brw_indirect ptr, GLint offset)
{
return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_UD);
}
static INLINE struct brw_reg get_addr_reg(struct brw_indirect ptr)
{
return brw_address_reg(ptr.addr_subnr);
}
static INLINE struct brw_indirect brw_indirect_offset( struct brw_indirect ptr, GLint offset )
{
ptr.addr_offset += offset;
return ptr;
}
static INLINE struct brw_indirect brw_indirect( GLuint addr_subnr, GLint offset )
{
struct brw_indirect ptr;
ptr.addr_subnr = addr_subnr;
ptr.addr_offset = offset;
ptr.pad = 0;
return ptr;
}
/** Do two brw_regs refer to the same register? */
static INLINE GLboolean
brw_same_reg(struct brw_reg r1, struct brw_reg r2)
{
return r1.file == r2.file && r1.nr == r2.nr;
}
static INLINE struct brw_instruction *current_insn( struct brw_compile *p)
{
return &p->store[p->nr_insn];
}
void brw_pop_insn_state( struct brw_compile *p );
void brw_push_insn_state( struct brw_compile *p );
void brw_set_mask_control( struct brw_compile *p, GLuint value );
void brw_set_saturate( struct brw_compile *p, GLuint value );
void brw_set_access_mode( struct brw_compile *p, GLuint access_mode );
void brw_set_compression_control( struct brw_compile *p, GLboolean control );
void brw_set_predicate_control_flag_value( struct brw_compile *p, GLuint value );
void brw_set_predicate_control( struct brw_compile *p, GLuint pc );
void brw_set_conditionalmod( struct brw_compile *p, GLuint conditional );
void brw_init_compile( struct brw_context *, struct brw_compile *p );
const GLuint *brw_get_program( struct brw_compile *p, GLuint *sz );
/* Helpers for regular instructions:
*/
#define ALU1(OP) \
struct brw_instruction *brw_##OP(struct brw_compile *p, \
struct brw_reg dest, \
struct brw_reg src0);
#define ALU2(OP) \
struct brw_instruction *brw_##OP(struct brw_compile *p, \
struct brw_reg dest, \
struct brw_reg src0, \
struct brw_reg src1);
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU2(JMPI)
ALU2(ADD)
ALU2(MUL)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDZ)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
#undef ALU1
#undef ALU2
/* Helpers for SEND instruction:
*/
void brw_urb_WRITE(struct brw_compile *p,
struct brw_reg dest,
GLuint msg_reg_nr,
struct brw_reg src0,
GLboolean allocate,
GLboolean used,
GLuint msg_length,
GLuint response_length,
GLboolean eot,
GLboolean writes_complete,
GLuint offset,
GLuint swizzle);
void brw_ff_sync(struct brw_compile *p,
struct brw_reg dest,
GLuint msg_reg_nr,
struct brw_reg src0,
GLboolean allocate,
GLboolean used,
GLuint msg_length,
GLuint response_length,
GLboolean eot,
GLboolean writes_complete,
GLuint offset,
GLuint swizzle);
void brw_fb_WRITE(struct brw_compile *p,
struct brw_reg dest,
GLuint msg_reg_nr,
struct brw_reg src0,
GLuint binding_table_index,
GLuint msg_length,
GLuint response_length,
GLboolean eot);
void brw_SAMPLE(struct brw_compile *p,
struct brw_reg dest,
GLuint msg_reg_nr,
struct brw_reg src0,
GLuint binding_table_index,
GLuint sampler,
GLuint writemask,
GLuint msg_type,
GLuint response_length,
GLuint msg_length,
GLboolean eot,
GLuint header_present,
GLuint simd_mode);
void brw_math_16( struct brw_compile *p,
struct brw_reg dest,
GLuint function,
GLuint saturate,
GLuint msg_reg_nr,
struct brw_reg src,
GLuint precision );
void brw_math( struct brw_compile *p,
struct brw_reg dest,
GLuint function,
GLuint saturate,
GLuint msg_reg_nr,
struct brw_reg src,
GLuint data_type,
GLuint precision );
void brw_dp_READ_16( struct brw_compile *p,
struct brw_reg dest,
GLuint scratch_offset );
void brw_dp_READ_4( struct brw_compile *p,
struct brw_reg dest,
GLboolean relAddr,
GLuint location,
GLuint bind_table_index );
void brw_dp_READ_4_vs( struct brw_compile *p,
struct brw_reg dest,
GLuint oword,
GLboolean relAddr,
struct brw_reg addrReg,
GLuint location,
GLuint bind_table_index );
void brw_dp_WRITE_16( struct brw_compile *p,
struct brw_reg src,
GLuint scratch_offset );
/* If/else/endif. Works by manipulating the execution flags on each
* channel.
*/
struct brw_instruction *brw_IF(struct brw_compile *p,
GLuint execute_size);
struct brw_instruction *brw_ELSE(struct brw_compile *p,
struct brw_instruction *if_insn);
void brw_ENDIF(struct brw_compile *p,
struct brw_instruction *if_or_else_insn);
/* DO/WHILE loops:
*/
struct brw_instruction *brw_DO(struct brw_compile *p,
GLuint execute_size);
struct brw_instruction *brw_WHILE(struct brw_compile *p,
struct brw_instruction *patch_insn);
struct brw_instruction *brw_BREAK(struct brw_compile *p);
struct brw_instruction *brw_CONT(struct brw_compile *p);
/* Forward jumps:
*/
void brw_land_fwd_jump(struct brw_compile *p,
struct brw_instruction *jmp_insn);
void brw_NOP(struct brw_compile *p);
/* Special case: there is never a destination, execution size will be
* taken from src0:
*/
void brw_CMP(struct brw_compile *p,
struct brw_reg dest,
GLuint conditional,
struct brw_reg src0,
struct brw_reg src1);
void brw_print_reg( struct brw_reg reg );
/***********************************************************************
* brw_eu_util.c:
*/
void brw_copy_indirect_to_indirect(struct brw_compile *p,
struct brw_indirect dst_ptr,
struct brw_indirect src_ptr,
GLuint count);
void brw_copy_from_indirect(struct brw_compile *p,
struct brw_reg dst,
struct brw_indirect ptr,
GLuint count);
void brw_copy4(struct brw_compile *p,
struct brw_reg dst,
struct brw_reg src,
GLuint count);
void brw_copy8(struct brw_compile *p,
struct brw_reg dst,
struct brw_reg src,
GLuint count);
void brw_math_invert( struct brw_compile *p,
struct brw_reg dst,
struct brw_reg src);
void brw_set_src1( struct brw_instruction *insn,
struct brw_reg reg );
#endif


@ -0,0 +1,95 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "main/mtypes.h"
#include "main/imports.h"
#include "brw_eu.h"
void brw_print_reg( struct brw_reg hwreg )
{
static const char *file[] = {
"arf",
"grf",
"msg",
"imm"
};
static const char *type[] = {
"ud",
"d",
"uw",
"w",
"ub",
"vf",
"hf",
"f"
};
_mesa_printf("%s%s",
hwreg.abs ? "abs/" : "",
hwreg.negate ? "-" : "");
if (hwreg.file == BRW_GENERAL_REGISTER_FILE &&
hwreg.nr % 2 == 0 &&
hwreg.subnr == 0 &&
hwreg.vstride == BRW_VERTICAL_STRIDE_8 &&
hwreg.width == BRW_WIDTH_8 &&
hwreg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
hwreg.type == BRW_REGISTER_TYPE_F) {
/* vector register */
_mesa_printf("vec%d", hwreg.nr);
}
else if (hwreg.file == BRW_GENERAL_REGISTER_FILE &&
hwreg.vstride == BRW_VERTICAL_STRIDE_0 &&
hwreg.width == BRW_WIDTH_1 &&
hwreg.hstride == BRW_HORIZONTAL_STRIDE_0 &&
hwreg.type == BRW_REGISTER_TYPE_F) {
/* "scalar" register */
_mesa_printf("scl%d.%d", hwreg.nr, hwreg.subnr / 4);
}
else if (hwreg.file == BRW_IMMEDIATE_VALUE) {
_mesa_printf("imm %f", hwreg.dw1.f);
}
else {
_mesa_printf("%s%d.%d<%d;%d,%d>:%s",
file[hwreg.file],
hwreg.nr,
hwreg.subnr / type_sz(hwreg.type),
hwreg.vstride ? (1<<(hwreg.vstride-1)) : 0,
1<<hwreg.width,
hwreg.hstride ? (1<<(hwreg.hstride-1)) : 0,
type[hwreg.type]);
}
}
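A hedged example of the fall-through format above (example_print is a hypothetical helper; the region fields assumed here are the ones brw_vec8_grf() normally builds):

static void example_print(void)
{
   /* An odd-numbered 8-wide float GRF misses the "vec"/"scl" shortcuts
    * above and is printed in full region notation, i.e. roughly:
    *
    *     grf3.0<8;8,1>:f
    *
    * meaning file + reg.subreg, then <vstride;width,hstride> and type.
    */
   brw_print_reg(brw_vec8_grf(3, 0));
}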

File diff suppressed because it is too large


@ -0,0 +1,126 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "brw_context.h"
#include "brw_defines.h"
#include "brw_eu.h"
void brw_math_invert( struct brw_compile *p,
struct brw_reg dst,
struct brw_reg src)
{
brw_math( p,
dst,
BRW_MATH_FUNCTION_INV,
BRW_MATH_SATURATE_NONE,
0,
src,
BRW_MATH_PRECISION_FULL,
BRW_MATH_DATA_VECTOR );
}
void brw_copy4(struct brw_compile *p,
struct brw_reg dst,
struct brw_reg src,
GLuint count)
{
GLuint i;
dst = vec4(dst);
src = vec4(src);
for (i = 0; i < count; i++)
{
GLuint delta = i*32;
brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta));
brw_MOV(p, byte_offset(dst, delta+16), byte_offset(src, delta+16));
}
}
void brw_copy8(struct brw_compile *p,
struct brw_reg dst,
struct brw_reg src,
GLuint count)
{
GLuint i;
dst = vec8(dst);
src = vec8(src);
for (i = 0; i < count; i++)
{
GLuint delta = i*32;
brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta));
}
}
void brw_copy_indirect_to_indirect(struct brw_compile *p,
struct brw_indirect dst_ptr,
struct brw_indirect src_ptr,
GLuint count)
{
GLuint i;
for (i = 0; i < count; i++)
{
GLuint delta = i*32;
brw_MOV(p, deref_4f(dst_ptr, delta), deref_4f(src_ptr, delta));
brw_MOV(p, deref_4f(dst_ptr, delta+16), deref_4f(src_ptr, delta+16));
}
}
void brw_copy_from_indirect(struct brw_compile *p,
struct brw_reg dst,
struct brw_indirect ptr,
GLuint count)
{
GLuint i;
dst = vec4(dst);
for (i = 0; i < count; i++)
{
GLuint delta = i*32;
brw_MOV(p, byte_offset(dst, delta), deref_4f(ptr, delta));
brw_MOV(p, byte_offset(dst, delta+16), deref_4f(ptr, delta+16));
}
}


@ -0,0 +1,201 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "main/glheader.h"
#include "main/macros.h"
#include "main/enums.h"
#include "intel_batchbuffer.h"
#include "brw_defines.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_util.h"
#include "brw_state.h"
#include "brw_gs.h"
static void compile_gs_prog( struct brw_context *brw,
struct brw_gs_prog_key *key )
{
struct brw_gs_compile c;
const GLuint *program;
GLuint program_size;
memset(&c, 0, sizeof(c));
c.key = *key;
c.need_ff_sync = BRW_IS_IGDNG(brw);
/* Need to locate the two positions present in vertex + header.
* These are currently hardcoded:
*/
c.nr_attrs = brw_count_bits(c.key.attrs);
if (BRW_IS_IGDNG(brw))
c.nr_regs = (c.nr_attrs + 1) / 2 + 3; /* are vertices packed, or reg-aligned? */
else
c.nr_regs = (c.nr_attrs + 1) / 2 + 1; /* are vertices packed, or reg-aligned? */
c.nr_bytes = c.nr_regs * REG_SIZE;
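/* Worked example (illustrative): with 7 attributes, nr_regs is
* (7 + 1) / 2 + 1 = 5 on pre-IGDNG parts (or + 3 = 7 on IGDNG), and
* nr_bytes = nr_regs * REG_SIZE.
*/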
/* Begin the compilation:
*/
brw_init_compile(brw, &c.func);
c.func.single_program_flow = 1;
/* For some reason the thread is spawned with only 4 channels
* unmasked.
*/
brw_set_mask_control(&c.func, BRW_MASK_DISABLE);
/* Note that primitives which don't require a GS program have
* already been weeded out by this stage:
*/
switch (key->primitive) {
case GL_QUADS:
brw_gs_quads( &c );
break;
case GL_QUAD_STRIP:
brw_gs_quad_strip( &c );
break;
case GL_LINE_LOOP:
brw_gs_lines( &c );
break;
case GL_LINES:
if (key->hint_gs_always)
brw_gs_lines( &c );
else {
return;
}
break;
case GL_TRIANGLES:
if (key->hint_gs_always)
brw_gs_tris( &c );
else {
return;
}
break;
case GL_POINTS:
if (key->hint_gs_always)
brw_gs_points( &c );
else {
return;
}
break;
default:
return;
}
/* get the program
*/
program = brw_get_program(&c.func, &program_size);
/* Upload
*/
dri_bo_unreference(brw->gs.prog_bo);
brw->gs.prog_bo = brw_upload_cache( &brw->cache, BRW_GS_PROG,
&c.key, sizeof(c.key),
NULL, 0,
program, program_size,
&c.prog_data,
&brw->gs.prog_data );
}
static const GLenum gs_prim[GL_POLYGON+1] = {
GL_POINTS,
GL_LINES,
GL_LINE_LOOP,
GL_LINES,
GL_TRIANGLES,
GL_TRIANGLES,
GL_TRIANGLES,
GL_QUADS,
GL_QUAD_STRIP,
GL_TRIANGLES
};
static void populate_key( struct brw_context *brw,
struct brw_gs_prog_key *key )
{
memset(key, 0, sizeof(*key));
/* CACHE_NEW_VS_PROG */
key->attrs = brw->vs.prog_data->outputs_written;
/* BRW_NEW_PRIMITIVE */
key->primitive = gs_prim[brw->primitive];
key->hint_gs_always = 0; /* debug code? */
key->need_gs_prog = (key->hint_gs_always ||
brw->primitive == GL_QUADS ||
brw->primitive == GL_QUAD_STRIP ||
brw->primitive == GL_LINE_LOOP);
}
/* Select, and if necessary compile, the GS program for the current primitive.
*/
static void prepare_gs_prog(struct brw_context *brw)
{
struct brw_gs_prog_key key;
/* Populate the key:
*/
populate_key(brw, &key);
if (brw->gs.prog_active != key.need_gs_prog) {
brw->state.dirty.cache |= CACHE_NEW_GS_PROG;
brw->gs.prog_active = key.need_gs_prog;
}
if (brw->gs.prog_active) {
dri_bo_unreference(brw->gs.prog_bo);
brw->gs.prog_bo = brw_search_cache(&brw->cache, BRW_GS_PROG,
&key, sizeof(key),
NULL, 0,
&brw->gs.prog_data);
if (brw->gs.prog_bo == NULL)
compile_gs_prog( brw, &key );
}
}
const struct brw_tracked_state brw_gs_prog = {
.dirty = {
.mesa = 0,
.brw = BRW_NEW_PRIMITIVE,
.cache = CACHE_NEW_VS_PROG
},
.prepare = prepare_gs_prog
};


@ -0,0 +1,76 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#ifndef BRW_GS_H
#define BRW_GS_H
#include "brw_context.h"
#include "brw_eu.h"
#define MAX_GS_VERTS (4)
struct brw_gs_prog_key {
GLuint attrs:32;
GLuint primitive:4;
GLuint hint_gs_always:1;
GLuint need_gs_prog:1;
GLuint pad:26;
};
struct brw_gs_compile {
struct brw_compile func;
struct brw_gs_prog_key key;
struct brw_gs_prog_data prog_data;
struct {
struct brw_reg R0;
struct brw_reg vertex[MAX_GS_VERTS];
} reg;
/* 3 different ways of expressing vertex size:
*/
GLuint nr_attrs;
GLuint nr_regs;
GLuint nr_bytes;
GLboolean need_ff_sync;
};
#define ATTR_SIZE (4*4)
void brw_gs_quads( struct brw_gs_compile *c );
void brw_gs_quad_strip( struct brw_gs_compile *c );
void brw_gs_tris( struct brw_gs_compile *c );
void brw_gs_lines( struct brw_gs_compile *c );
void brw_gs_points( struct brw_gs_compile *c );
#endif


@ -0,0 +1,186 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "main/glheader.h"
#include "main/macros.h"
#include "main/enums.h"
#include "shader/program.h"
#include "intel_batchbuffer.h"
#include "brw_defines.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_util.h"
#include "brw_gs.h"
static void brw_gs_alloc_regs( struct brw_gs_compile *c,
GLuint nr_verts )
{
GLuint i = 0,j;
/* Register usage is static, precompute here:
*/
c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
/* Payload vertices plus space for more generated vertices:
*/
for (j = 0; j < nr_verts; j++) {
c->reg.vertex[j] = brw_vec4_grf(i, 0);
i += c->nr_regs;
}
c->prog_data.urb_read_length = c->nr_regs;
c->prog_data.total_grf = i;
}
static void brw_gs_emit_vue(struct brw_gs_compile *c,
struct brw_reg vert,
GLboolean last,
GLuint header)
{
struct brw_compile *p = &c->func;
GLboolean allocate = !last;
/* Overwrite PrimType and PrimStart in the message header, for
* each vertex in turn:
*/
brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header));
/* Copy the vertex from vertn into m1..mN+1:
*/
brw_copy8(p, brw_message_reg(1), vert, c->nr_regs);
/* Send each vertex as a separate write to the urb. This is
* different from the approach in brw_sf_emit.c, where subsequent
* writes are used to build up a single urb entry. Each of these
* writes instantiates a separate urb entry, and a new one must be
* allocated each time.
*/
brw_urb_WRITE(p,
allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
0,
c->reg.R0,
allocate,
1, /* used */
c->nr_regs + 1, /* msg length */
allocate ? 1 : 0, /* response length */
allocate ? 0 : 1, /* eot */
1, /* writes_complete */
0, /* urb offset */
BRW_URB_SWIZZLE_NONE);
}
static void brw_gs_ff_sync(struct brw_gs_compile *c, int num_prim)
{
struct brw_compile *p = &c->func;
brw_MOV(p, get_element_ud(c->reg.R0, 1), brw_imm_ud(num_prim));
brw_ff_sync(p,
c->reg.R0,
0,
c->reg.R0,
1,
1, /* used */
1, /* msg length */
1, /* response length */
0, /* eot */
1, /* write complete */
0, /* urb offset */
BRW_URB_SWIZZLE_NONE);
}
void brw_gs_quads( struct brw_gs_compile *c )
{
brw_gs_alloc_regs(c, 4);
/* Use polygons for correct edgeflag behaviour. Note that vertex 3
* is the PV for quads, but vertex 0 for polygons:
*/
if (c->need_ff_sync)
brw_gs_ff_sync(c, 1);
brw_gs_emit_vue(c, c->reg.vertex[3], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2));
brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_POLYGON << 2));
brw_gs_emit_vue(c, c->reg.vertex[2], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
}
void brw_gs_quad_strip( struct brw_gs_compile *c )
{
brw_gs_alloc_regs(c, 4);
if (c->need_ff_sync)
brw_gs_ff_sync(c, 1);
brw_gs_emit_vue(c, c->reg.vertex[2], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
brw_gs_emit_vue(c, c->reg.vertex[3], 0, (_3DPRIM_POLYGON << 2));
brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2));
brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
}
void brw_gs_tris( struct brw_gs_compile *c )
{
brw_gs_alloc_regs(c, 3);
if (c->need_ff_sync)
brw_gs_ff_sync(c, 1);
brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_TRILIST << 2) | R02_PRIM_START));
brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_TRILIST << 2));
brw_gs_emit_vue(c, c->reg.vertex[2], 1, ((_3DPRIM_TRILIST << 2) | R02_PRIM_END));
}
void brw_gs_lines( struct brw_gs_compile *c )
{
brw_gs_alloc_regs(c, 2);
if (c->need_ff_sync)
brw_gs_ff_sync(c, 1);
brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_START));
brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_END));
}
void brw_gs_points( struct brw_gs_compile *c )
{
brw_gs_alloc_regs(c, 1);
if (c->need_ff_sync)
brw_gs_ff_sync(c, 1);
brw_gs_emit_vue(c, c->reg.vertex[0], 1, ((_3DPRIM_POINTLIST << 2) | R02_PRIM_START | R02_PRIM_END));
}


@ -0,0 +1,149 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "brw_context.h"
#include "brw_state.h"
#include "brw_defines.h"
#include "main/macros.h"
struct brw_gs_unit_key {
unsigned int total_grf;
unsigned int urb_entry_read_length;
unsigned int curbe_offset;
unsigned int nr_urb_entries, urb_size;
GLboolean prog_active;
};
static void
gs_unit_populate_key(struct brw_context *brw, struct brw_gs_unit_key *key)
{
memset(key, 0, sizeof(*key));
/* CACHE_NEW_GS_PROG */
key->prog_active = brw->gs.prog_active;
if (key->prog_active) {
key->total_grf = brw->gs.prog_data->total_grf;
key->urb_entry_read_length = brw->gs.prog_data->urb_read_length;
} else {
key->total_grf = 1;
key->urb_entry_read_length = 1;
}
/* BRW_NEW_CURBE_OFFSETS */
key->curbe_offset = brw->curbe.clip_start;
/* BRW_NEW_URB_FENCE */
key->nr_urb_entries = brw->urb.nr_gs_entries;
key->urb_size = brw->urb.vsize;
}
static dri_bo *
gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key)
{
struct brw_gs_unit_state gs;
dri_bo *bo;
memset(&gs, 0, sizeof(gs));
gs.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1;
if (key->prog_active) /* reloc */
gs.thread0.kernel_start_pointer = brw->gs.prog_bo->offset >> 6;
gs.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
gs.thread1.single_program_flow = 1;
gs.thread3.dispatch_grf_start_reg = 1;
gs.thread3.const_urb_entry_read_offset = 0;
gs.thread3.const_urb_entry_read_length = 0;
gs.thread3.urb_entry_read_offset = 0;
gs.thread3.urb_entry_read_length = key->urb_entry_read_length;
gs.thread4.nr_urb_entries = key->nr_urb_entries;
gs.thread4.urb_entry_allocation_size = key->urb_size - 1;
if (key->nr_urb_entries >= 8)
gs.thread4.max_threads = 1;
else
gs.thread4.max_threads = 0;
if (BRW_IS_IGDNG(brw))
gs.thread4.rendering_enable = 1;
if (INTEL_DEBUG & DEBUG_STATS)
gs.thread4.stats_enable = 1;
bo = brw_upload_cache(&brw->cache, BRW_GS_UNIT,
key, sizeof(*key),
&brw->gs.prog_bo, 1,
&gs, sizeof(gs),
NULL, NULL);
if (key->prog_active) {
/* Emit GS program relocation */
dri_bo_emit_reloc(bo,
I915_GEM_DOMAIN_INSTRUCTION, 0,
gs.thread0.grf_reg_count << 1,
offsetof(struct brw_gs_unit_state, thread0),
brw->gs.prog_bo);
}
return bo;
}
static void prepare_gs_unit(struct brw_context *brw)
{
struct brw_gs_unit_key key;
gs_unit_populate_key(brw, &key);
dri_bo_unreference(brw->gs.state_bo);
brw->gs.state_bo = brw_search_cache(&brw->cache, BRW_GS_UNIT,
&key, sizeof(key),
&brw->gs.prog_bo, 1,
NULL);
if (brw->gs.state_bo == NULL) {
brw->gs.state_bo = gs_unit_create_from_key(brw, &key);
}
}
const struct brw_tracked_state brw_gs_unit = {
.dirty = {
.mesa = 0,
.brw = (BRW_NEW_CURBE_OFFSETS |
BRW_NEW_URB_FENCE),
.cache = CACHE_NEW_GS_PROG
},
.prepare = prepare_gs_unit,
};


@ -0,0 +1,545 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "intel_batchbuffer.h"
#include "intel_regions.h"
#include "brw_context.h"
#include "brw_state.h"
#include "brw_defines.h"
/***********************************************************************
* Blend color
*/
static void upload_blend_constant_color(struct brw_context *brw)
{
GLcontext *ctx = &brw->intel.ctx;
struct brw_blend_constant_color bcc;
memset(&bcc, 0, sizeof(bcc));
bcc.header.opcode = CMD_BLEND_CONSTANT_COLOR;
bcc.header.length = sizeof(bcc)/4-2;
bcc.blend_constant_color[0] = ctx->Color.BlendColor[0];
bcc.blend_constant_color[1] = ctx->Color.BlendColor[1];
bcc.blend_constant_color[2] = ctx->Color.BlendColor[2];
bcc.blend_constant_color[3] = ctx->Color.BlendColor[3];
BRW_CACHED_BATCH_STRUCT(brw, &bcc);
}
const struct brw_tracked_state brw_blend_constant_color = {
.dirty = {
.mesa = _NEW_COLOR,
.brw = 0,
.cache = 0
},
.emit = upload_blend_constant_color
};
/* Constant single cliprect for framebuffer object or DRI2 drawing */
static void upload_drawing_rect(struct brw_context *brw)
{
struct intel_context *intel = &brw->intel;
GLcontext *ctx = &intel->ctx;
if (!intel->constant_cliprect)
return;
BEGIN_BATCH(4, NO_LOOP_CLIPRECTS);
OUT_BATCH(_3DSTATE_DRAWRECT_INFO_I965);
OUT_BATCH(0); /* xmin, ymin */
OUT_BATCH(((ctx->DrawBuffer->Width - 1) & 0xffff) |
((ctx->DrawBuffer->Height - 1) << 16));
OUT_BATCH(0);
ADVANCE_BATCH();
}
const struct brw_tracked_state brw_drawing_rect = {
.dirty = {
.mesa = _NEW_BUFFERS,
.brw = 0,
.cache = 0
},
.emit = upload_drawing_rect
};
static void prepare_binding_table_pointers(struct brw_context *brw)
{
brw_add_validated_bo(brw, brw->vs.bind_bo);
brw_add_validated_bo(brw, brw->wm.bind_bo);
}
/**
* Upload the binding table pointers, which point to each stage's array of surface
* state pointers.
*
* The binding table pointers are relative to the surface state base address,
* which is 0.
*/
static void upload_binding_table_pointers(struct brw_context *brw)
{
struct intel_context *intel = &brw->intel;
BEGIN_BATCH(6, IGNORE_CLIPRECTS);
OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2));
if (brw->vs.bind_bo != NULL)
OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* vs */
else
OUT_BATCH(0);
OUT_BATCH(0); /* gs */
OUT_BATCH(0); /* clip */
OUT_BATCH(0); /* sf */
OUT_RELOC(brw->wm.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* wm/ps */
ADVANCE_BATCH();
}
const struct brw_tracked_state brw_binding_table_pointers = {
.dirty = {
.mesa = 0,
.brw = BRW_NEW_BATCH,
.cache = CACHE_NEW_SURF_BIND,
},
.prepare = prepare_binding_table_pointers,
.emit = upload_binding_table_pointers,
};
/**
* Upload pointers to the per-stage state.
*
* The state pointers in this packet are all relative to the general state
* base address set by CMD_STATE_BASE_ADDRESS, which is 0.
*/
static void upload_pipelined_state_pointers(struct brw_context *brw )
{
struct intel_context *intel = &brw->intel;
BEGIN_BATCH(7, IGNORE_CLIPRECTS);
OUT_BATCH(CMD_PIPELINED_STATE_POINTERS << 16 | (7 - 2));
OUT_RELOC(brw->vs.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
if (brw->gs.prog_active)
OUT_RELOC(brw->gs.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1);
else
OUT_BATCH(0);
OUT_RELOC(brw->clip.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1);
OUT_RELOC(brw->sf.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
OUT_RELOC(brw->wm.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
OUT_RELOC(brw->cc.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
ADVANCE_BATCH();
brw->state.dirty.brw |= BRW_NEW_PSP;
}
static void prepare_psp_urb_cbs(struct brw_context *brw)
{
brw_add_validated_bo(brw, brw->vs.state_bo);
brw_add_validated_bo(brw, brw->gs.state_bo);
brw_add_validated_bo(brw, brw->clip.state_bo);
brw_add_validated_bo(brw, brw->sf.state_bo);
brw_add_validated_bo(brw, brw->wm.state_bo);
brw_add_validated_bo(brw, brw->cc.state_bo);
}
static void upload_psp_urb_cbs(struct brw_context *brw )
{
upload_pipelined_state_pointers(brw);
brw_upload_urb_fence(brw);
brw_upload_cs_urb_state(brw);
}
const struct brw_tracked_state brw_psp_urb_cbs = {
.dirty = {
.mesa = 0,
.brw = BRW_NEW_URB_FENCE | BRW_NEW_BATCH,
.cache = (CACHE_NEW_VS_UNIT |
CACHE_NEW_GS_UNIT |
CACHE_NEW_GS_PROG |
CACHE_NEW_CLIP_UNIT |
CACHE_NEW_SF_UNIT |
CACHE_NEW_WM_UNIT |
CACHE_NEW_CC_UNIT)
},
.prepare = prepare_psp_urb_cbs,
.emit = upload_psp_urb_cbs,
};
static void prepare_depthbuffer(struct brw_context *brw)
{
struct intel_region *region = brw->state.depth_region;
if (region != NULL)
brw_add_validated_bo(brw, region->buffer);
}
static void emit_depthbuffer(struct brw_context *brw)
{
struct intel_context *intel = &brw->intel;
struct intel_region *region = brw->state.depth_region;
unsigned int len = (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? 6 : 5;
if (region == NULL) {
BEGIN_BATCH(len, IGNORE_CLIPRECTS);
OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT << 18) |
(BRW_SURFACE_NULL << 29));
OUT_BATCH(0);
OUT_BATCH(0);
OUT_BATCH(0);
if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
OUT_BATCH(0);
ADVANCE_BATCH();
} else {
unsigned int format;
switch (region->cpp) {
case 2:
format = BRW_DEPTHFORMAT_D16_UNORM;
break;
case 4:
if (intel->depth_buffer_is_float)
format = BRW_DEPTHFORMAT_D32_FLOAT;
else
format = BRW_DEPTHFORMAT_D24_UNORM_S8_UINT;
break;
default:
assert(0);
return;
}
assert(region->tiling != I915_TILING_X);
BEGIN_BATCH(len, IGNORE_CLIPRECTS);
OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
OUT_BATCH(((region->pitch * region->cpp) - 1) |
(format << 18) |
(BRW_TILEWALK_YMAJOR << 26) |
((region->tiling != I915_TILING_NONE) << 27) |
(BRW_SURFACE_2D << 29));
OUT_RELOC(region->buffer,
I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
0);
OUT_BATCH((BRW_SURFACE_MIPMAPLAYOUT_BELOW << 1) |
((region->pitch - 1) << 6) |
((region->height - 1) << 19));
OUT_BATCH(0);
if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
OUT_BATCH(0);
ADVANCE_BATCH();
}
}
const struct brw_tracked_state brw_depthbuffer = {
.dirty = {
.mesa = 0,
.brw = BRW_NEW_DEPTH_BUFFER | BRW_NEW_BATCH,
.cache = 0,
},
.prepare = prepare_depthbuffer,
.emit = emit_depthbuffer,
};
/***********************************************************************
* Polygon stipple packet
*/
static void upload_polygon_stipple(struct brw_context *brw)
{
GLcontext *ctx = &brw->intel.ctx;
struct brw_polygon_stipple bps;
GLuint i;
memset(&bps, 0, sizeof(bps));
bps.header.opcode = CMD_POLY_STIPPLE_PATTERN;
bps.header.length = sizeof(bps)/4-2;
/* Polygon stipple is provided in OpenGL order, i.e. bottom
* row first. If we're rendering to a window (i.e. the
* default frame buffer object, 0), then we need to invert
* it to match our pixel layout. But if we're rendering
* to a FBO (i.e. any named frame buffer object), we *don't*
* need to invert - we already match the layout.
*/
if (ctx->DrawBuffer->Name == 0) {
for (i = 0; i < 32; i++)
bps.stipple[i] = ctx->PolygonStipple[31 - i]; /* invert */
}
else {
for (i = 0; i < 32; i++)
bps.stipple[i] = ctx->PolygonStipple[i]; /* don't invert */
}
BRW_CACHED_BATCH_STRUCT(brw, &bps);
}
const struct brw_tracked_state brw_polygon_stipple = {
.dirty = {
.mesa = _NEW_POLYGONSTIPPLE,
.brw = 0,
.cache = 0
},
.emit = upload_polygon_stipple
};
/***********************************************************************
* Polygon stipple offset packet
*/
static void upload_polygon_stipple_offset(struct brw_context *brw)
{
__DRIdrawablePrivate *dPriv = brw->intel.driDrawable;
struct brw_polygon_stipple_offset bpso;
memset(&bpso, 0, sizeof(bpso));
bpso.header.opcode = CMD_POLY_STIPPLE_OFFSET;
bpso.header.length = sizeof(bpso)/4-2;
/* If we're drawing to a system window (ctx->DrawBuffer->Name == 0),
* we have to invert the Y axis in order to match the OpenGL
* pixel coordinate system, and our offset must be matched
* to the window position. If we're drawing to a FBO
* (ctx->DrawBuffer->Name != 0), then our native pixel coordinate
* system works just fine, and there's no window system to
* worry about.
*/
if (brw->intel.ctx.DrawBuffer->Name == 0) {
bpso.bits0.x_offset = (32 - (dPriv->x & 31)) & 31;
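/* Worked example (illustrative): for a window at dPriv->x = 100,
* x_offset = (32 - (100 & 31)) & 31 = 28, matching the 32x32 pattern
* to the window position as described above.
*/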
bpso.bits0.y_offset = (32 - ((dPriv->y + dPriv->h) & 31)) & 31;
}
else {
bpso.bits0.y_offset = 0;
bpso.bits0.x_offset = 0;
}
BRW_CACHED_BATCH_STRUCT(brw, &bpso);
}
#define _NEW_WINDOW_POS 0x40000000
const struct brw_tracked_state brw_polygon_stipple_offset = {
.dirty = {
.mesa = _NEW_WINDOW_POS,
.brw = 0,
.cache = 0
},
.emit = upload_polygon_stipple_offset
};
/**********************************************************************
* AA Line parameters
*/
static void upload_aa_line_parameters(struct brw_context *brw)
{
struct brw_aa_line_parameters balp;
if (BRW_IS_965(brw))
return;
/* use legacy aa line coverage computation */
memset(&balp, 0, sizeof(balp));
balp.header.opcode = CMD_AA_LINE_PARAMETERS;
balp.header.length = sizeof(balp) / 4 - 2;
BRW_CACHED_BATCH_STRUCT(brw, &balp);
}
const struct brw_tracked_state brw_aa_line_parameters = {
.dirty = {
.mesa = 0,
.brw = BRW_NEW_CONTEXT,
.cache = 0
},
.emit = upload_aa_line_parameters
};
/***********************************************************************
* Line stipple packet
*/
static void upload_line_stipple(struct brw_context *brw)
{
GLcontext *ctx = &brw->intel.ctx;
struct brw_line_stipple bls;
GLfloat tmp;
GLint tmpi;
memset(&bls, 0, sizeof(bls));
bls.header.opcode = CMD_LINE_STIPPLE_PATTERN;
bls.header.length = sizeof(bls)/4 - 2;
bls.bits0.pattern = ctx->Line.StipplePattern;
bls.bits1.repeat_count = ctx->Line.StippleFactor;
tmp = 1.0 / (GLfloat) ctx->Line.StippleFactor;
tmpi = tmp * (1<<13);
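/* The scale by 1<<13 stores the reciprocal as a fixed-point value with
* 13 fractional bits; e.g. a stipple factor of 4 gives
* tmpi = (1.0 / 4) * 8192 = 2048.
*/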
bls.bits1.inverse_repeat_count = tmpi;
BRW_CACHED_BATCH_STRUCT(brw, &bls);
}
const struct brw_tracked_state brw_line_stipple = {
.dirty = {
.mesa = _NEW_LINE,
.brw = 0,
.cache = 0
},
.emit = upload_line_stipple
};
/***********************************************************************
* Misc invariant state packets
*/
static void upload_invarient_state( struct brw_context *brw )
{
{
/* 0x61040000 Pipeline Select */
/* PipelineSelect : 0 */
struct brw_pipeline_select ps;
memset(&ps, 0, sizeof(ps));
ps.header.opcode = CMD_PIPELINE_SELECT(brw);
ps.header.pipeline_select = 0;
BRW_BATCH_STRUCT(brw, &ps);
}
{
struct brw_global_depth_offset_clamp gdo;
memset(&gdo, 0, sizeof(gdo));
/* Disable depth offset clamping.
*/
gdo.header.opcode = CMD_GLOBAL_DEPTH_OFFSET_CLAMP;
gdo.header.length = sizeof(gdo)/4 - 2;
gdo.depth_offset_clamp = 0.0;
BRW_BATCH_STRUCT(brw, &gdo);
}
/* 0x61020000 State Instruction Pointer */
{
struct brw_system_instruction_pointer sip;
memset(&sip, 0, sizeof(sip));
sip.header.opcode = CMD_STATE_INSN_POINTER;
sip.header.length = 0;
sip.bits0.pad = 0;
sip.bits0.system_instruction_pointer = 0;
BRW_BATCH_STRUCT(brw, &sip);
}
{
struct brw_vf_statistics vfs;
memset(&vfs, 0, sizeof(vfs));
vfs.opcode = CMD_VF_STATISTICS(brw);
if (INTEL_DEBUG & DEBUG_STATS)
vfs.statistics_enable = 1;
BRW_BATCH_STRUCT(brw, &vfs);
}
}
const struct brw_tracked_state brw_invarient_state = {
.dirty = {
.mesa = 0,
.brw = BRW_NEW_CONTEXT,
.cache = 0
},
.emit = upload_invarient_state
};
/**
* Define the base addresses which some state is referenced from.
*
* This allows us to avoid having to emit relocations in many places for
* cached state, and instead emit pointers inside of large, mostly-static
* state pools. This comes at the expense of memory, and more expensive cache
* misses.
*/
static void upload_state_base_address( struct brw_context *brw )
{
struct intel_context *intel = &brw->intel;
/* Output the structure (brw_state_base_address) directly to the
* batchbuffer, so we can emit relocations inline.
*/
if (BRW_IS_IGDNG(brw)) {
BEGIN_BATCH(8, IGNORE_CLIPRECTS);
OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (8 - 2));
OUT_BATCH(1); /* General state base address */
OUT_BATCH(1); /* Surface state base address */
OUT_BATCH(1); /* Indirect object base address */
OUT_BATCH(1); /* Instruction base address */
OUT_BATCH(1); /* General state upper bound */
OUT_BATCH(1); /* Indirect object upper bound */
OUT_BATCH(1); /* Instruction access upper bound */
ADVANCE_BATCH();
} else {
BEGIN_BATCH(6, IGNORE_CLIPRECTS);
OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2));
OUT_BATCH(1); /* General state base address */
OUT_BATCH(1); /* Surface state base address */
OUT_BATCH(1); /* Indirect object base address */
OUT_BATCH(1); /* General state upper bound */
OUT_BATCH(1); /* Indirect object upper bound */
ADVANCE_BATCH();
}
}
const struct brw_tracked_state brw_state_base_address = {
.dirty = {
.mesa = 0,
.brw = BRW_NEW_CONTEXT,
.cache = 0,
},
.emit = upload_state_base_address
};


@ -0,0 +1,166 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "main/imports.h"
#include "main/enums.h"
#include "shader/prog_parameter.h"
#include "shader/program.h"
#include "shader/programopt.h"
#include "tnl/tnl.h"
#include "brw_context.h"
#include "brw_util.h"
#include "brw_wm.h"
static void brwBindProgram( GLcontext *ctx,
GLenum target,
struct gl_program *prog )
{
struct brw_context *brw = brw_context(ctx);
switch (target) {
case GL_VERTEX_PROGRAM_ARB:
brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
break;
case GL_FRAGMENT_PROGRAM_ARB:
brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
break;
}
}
static struct gl_program *brwNewProgram( GLcontext *ctx,
GLenum target,
GLuint id )
{
struct brw_context *brw = brw_context(ctx);
switch (target) {
case GL_VERTEX_PROGRAM_ARB: {
struct brw_vertex_program *prog = CALLOC_STRUCT(brw_vertex_program);
if (prog) {
prog->id = brw->program_id++;
return _mesa_init_vertex_program( ctx, &prog->program,
target, id );
}
else
return NULL;
}
case GL_FRAGMENT_PROGRAM_ARB: {
struct brw_fragment_program *prog = CALLOC_STRUCT(brw_fragment_program);
if (prog) {
prog->id = brw->program_id++;
return _mesa_init_fragment_program( ctx, &prog->program,
target, id );
}
else
return NULL;
}
default:
return _mesa_new_program(ctx, target, id);
}
}
static void brwDeleteProgram( GLcontext *ctx,
struct gl_program *prog )
{
if (prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
struct gl_fragment_program *fprog = (struct gl_fragment_program *) prog;
struct brw_fragment_program *brw_fprog = brw_fragment_program(fprog);
dri_bo_unreference(brw_fprog->const_buffer);
}
_mesa_delete_program( ctx, prog );
}
static GLboolean brwIsProgramNative( GLcontext *ctx,
GLenum target,
struct gl_program *prog )
{
return GL_TRUE;
}
static void brwProgramStringNotify( GLcontext *ctx,
GLenum target,
struct gl_program *prog )
{
struct brw_context *brw = brw_context(ctx);
if (target == GL_FRAGMENT_PROGRAM_ARB) {
struct gl_fragment_program *fprog = (struct gl_fragment_program *) prog;
struct brw_fragment_program *newFP = brw_fragment_program(fprog);
const struct brw_fragment_program *curFP =
brw_fragment_program_const(brw->fragment_program);
if (fprog->FogOption) {
_mesa_append_fog_code(ctx, fprog);
fprog->FogOption = GL_NONE;
}
if (newFP == curFP)
brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
newFP->id = brw->program_id++;
newFP->isGLSL = brw_wm_is_glsl(fprog);
}
else if (target == GL_VERTEX_PROGRAM_ARB) {
struct gl_vertex_program *vprog = (struct gl_vertex_program *) prog;
struct brw_vertex_program *newVP = brw_vertex_program(vprog);
const struct brw_vertex_program *curVP =
brw_vertex_program_const(brw->vertex_program);
if (newVP == curVP)
brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
if (newVP->program.IsPositionInvariant) {
_mesa_insert_mvp_code(ctx, &newVP->program);
}
newVP->id = brw->program_id++;
/* Also tell tnl about it:
*/
_tnl_program_string(ctx, target, prog);
}
}
void brwInitFragProgFuncs( struct dd_function_table *functions )
{
assert(functions->ProgramStringNotify == _tnl_program_string);
functions->BindProgram = brwBindProgram;
functions->NewProgram = brwNewProgram;
functions->DeleteProgram = brwDeleteProgram;
functions->IsProgramNative = brwIsProgramNative;
functions->ProgramStringNotify = brwProgramStringNotify;
}


@ -0,0 +1,254 @@
/*
* Copyright © 2008 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Authors:
* Eric Anholt <eric@anholt.net>
*
*/
/** @file support for ARB_query_object
*
* ARB_query_object is implemented by using the PIPE_CONTROL command to stall
* execution on the completion of previous depth tests, and write the
* current PS_DEPTH_COUNT to a buffer object.
*
* We use before and after counts when drawing during a query so that
* we don't pick up other clients' query data in ours. To reduce overhead,
* a single BO is used to record the query data for all active queries at
* once. This also gives us a simple bound on how much batchbuffer space is
* required for handling queries, so that we can be sure that we won't
* have to emit a batchbuffer without getting the ending PS_DEPTH_COUNT.
*/
#include "main/simple_list.h"
#include "main/imports.h"
#include "brw_context.h"
#include "brw_state.h"
#include "intel_batchbuffer.h"
#include "intel_reg.h"
/** Waits on the query object's BO and totals the results for this query */
static void
brw_queryobj_get_results(struct brw_query_object *query)
{
int i;
uint64_t *results;
if (query->bo == NULL)
return;
/* Map and count the pixels from the current query BO */
dri_bo_map(query->bo, GL_FALSE);
results = query->bo->virtual;
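/* The BO holds (begin, end) PS_DEPTH_COUNT pairs at results[i*2] and
* results[i*2 + 1] for each interval this query was active; summing the
* differences gives the query result.
*/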
for (i = query->first_index; i <= query->last_index; i++) {
query->Base.Result += results[i * 2 + 1] - results[i * 2];
}
dri_bo_unmap(query->bo);
dri_bo_unreference(query->bo);
query->bo = NULL;
}
static struct gl_query_object *
brw_new_query_object(GLcontext *ctx, GLuint id)
{
struct brw_query_object *query;
query = _mesa_calloc(sizeof(struct brw_query_object));
query->Base.Id = id;
query->Base.Result = 0;
query->Base.Active = GL_FALSE;
query->Base.Ready = GL_TRUE;
return &query->Base;
}
static void
brw_delete_query(GLcontext *ctx, struct gl_query_object *q)
{
struct brw_query_object *query = (struct brw_query_object *)q;
dri_bo_unreference(query->bo);
_mesa_free(query);
}
static void
brw_begin_query(GLcontext *ctx, struct gl_query_object *q)
{
struct brw_context *brw = brw_context(ctx);
struct intel_context *intel = intel_context(ctx);
struct brw_query_object *query = (struct brw_query_object *)q;
/* Reset our driver's tracking of query state. */
dri_bo_unreference(query->bo);
query->bo = NULL;
query->first_index = -1;
query->last_index = -1;
insert_at_head(&brw->query.active_head, query);
intel->stats_wm++;
}
/**
* End the ARB_occlusion_query query on a query object.
*/
static void
brw_end_query(GLcontext *ctx, struct gl_query_object *q)
{
struct brw_context *brw = brw_context(ctx);
struct intel_context *intel = intel_context(ctx);
struct brw_query_object *query = (struct brw_query_object *)q;
/* Flush the batchbuffer in case it has writes to our query BO.
* Have later queries write to a new query BO so that further rendering
* doesn't delay the collection of our results.
*/
if (query->bo) {
brw_emit_query_end(brw);
intel_batchbuffer_flush(intel->batch);
dri_bo_unreference(brw->query.bo);
brw->query.bo = NULL;
}
remove_from_list(query);
intel->stats_wm--;
}
static void brw_wait_query(GLcontext *ctx, struct gl_query_object *q)
{
struct brw_query_object *query = (struct brw_query_object *)q;
brw_queryobj_get_results(query);
query->Base.Ready = GL_TRUE;
}
static void brw_check_query(GLcontext *ctx, struct gl_query_object *q)
{
struct brw_query_object *query = (struct brw_query_object *)q;
if (query->bo == NULL || !drm_intel_bo_busy(query->bo)) {
brw_queryobj_get_results(query);
query->Base.Ready = GL_TRUE;
}
}
/** Called to set up the query BO and account for its aperture space */
void
brw_prepare_query_begin(struct brw_context *brw)
{
struct intel_context *intel = &brw->intel;
/* Skip if we're not doing any queries. */
if (is_empty_list(&brw->query.active_head))
return;
/* Get a new query BO if we're going to need it. */
if (brw->query.bo == NULL ||
brw->query.index * 2 + 1 >= 4096 / sizeof(uint64_t)) {
dri_bo_unreference(brw->query.bo);
brw->query.bo = NULL;
brw->query.bo = dri_bo_alloc(intel->bufmgr, "query", 4096, 1);
brw->query.index = 0;
}
brw_add_validated_bo(brw, brw->query.bo);
}
/** Called just before primitive drawing to get a beginning PS_DEPTH_COUNT. */
void
brw_emit_query_begin(struct brw_context *brw)
{
struct intel_context *intel = &brw->intel;
struct brw_query_object *query;
/* Skip if we're not doing any queries, or we've emitted the start. */
if (brw->query.active || is_empty_list(&brw->query.active_head))
return;
BEGIN_BATCH(4, IGNORE_CLIPRECTS);
OUT_BATCH(_3DSTATE_PIPE_CONTROL |
PIPE_CONTROL_DEPTH_STALL |
PIPE_CONTROL_WRITE_DEPTH_COUNT);
/* This object could be mapped cacheable, but we don't have an exposed
* mechanism to support that. Since it's going uncached, tell GEM that
* we're writing to it. The usual clflush should be all that's required
* to pick up the results.
*/
OUT_RELOC(brw->query.bo,
I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
PIPE_CONTROL_GLOBAL_GTT_WRITE |
((brw->query.index * 2) * sizeof(uint64_t)));
OUT_BATCH(0);
OUT_BATCH(0);
ADVANCE_BATCH();
foreach(query, &brw->query.active_head) {
if (query->bo != brw->query.bo) {
if (query->bo != NULL)
brw_queryobj_get_results(query);
dri_bo_reference(brw->query.bo);
query->bo = brw->query.bo;
query->first_index = brw->query.index;
}
query->last_index = brw->query.index;
}
brw->query.active = GL_TRUE;
}
/** Called at batchbuffer flush to get an ending PS_DEPTH_COUNT */
void
brw_emit_query_end(struct brw_context *brw)
{
struct intel_context *intel = &brw->intel;
if (!brw->query.active)
return;
BEGIN_BATCH(4, IGNORE_CLIPRECTS);
OUT_BATCH(_3DSTATE_PIPE_CONTROL |
PIPE_CONTROL_DEPTH_STALL |
PIPE_CONTROL_WRITE_DEPTH_COUNT);
OUT_RELOC(brw->query.bo,
I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
PIPE_CONTROL_GLOBAL_GTT_WRITE |
((brw->query.index * 2 + 1) * sizeof(uint64_t)));
OUT_BATCH(0);
OUT_BATCH(0);
ADVANCE_BATCH();
brw->query.active = GL_FALSE;
brw->query.index++;
}
void brw_init_queryobj_functions(struct dd_function_table *functions)
{
functions->NewQueryObject = brw_new_query_object;
functions->DeleteQuery = brw_delete_query;
functions->BeginQuery = brw_begin_query;
functions->EndQuery = brw_end_query;
functions->CheckQuery = brw_check_query;
functions->WaitQuery = brw_wait_query;
}


@ -0,0 +1,200 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "main/glheader.h"
#include "main/macros.h"
#include "main/enums.h"
#include "intel_batchbuffer.h"
#include "brw_defines.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_util.h"
#include "brw_sf.h"
#include "brw_state.h"
static void compile_sf_prog( struct brw_context *brw,
struct brw_sf_prog_key *key )
{
GLcontext *ctx = &brw->intel.ctx;
struct brw_sf_compile c;
const GLuint *program;
GLuint program_size;
GLuint i, idx;
memset(&c, 0, sizeof(c));
/* Begin the compilation:
*/
brw_init_compile(brw, &c.func);
c.key = *key;
c.nr_attrs = brw_count_bits(c.key.attrs);
c.nr_attr_regs = (c.nr_attrs+1)/2;
c.nr_setup_attrs = brw_count_bits(c.key.attrs & DO_SETUP_BITS);
c.nr_setup_regs = (c.nr_setup_attrs+1)/2;
c.prog_data.urb_read_length = c.nr_attr_regs;
c.prog_data.urb_entry_size = c.nr_setup_regs * 2;
/* Construct map from attribute number to position in the vertex.
*/
for (i = idx = 0; i < VERT_RESULT_MAX; i++)
if (c.key.attrs & (1<<i)) {
c.attr_to_idx[i] = idx;
c.idx_to_attr[idx] = i;
if (i >= VERT_RESULT_TEX0 && i <= VERT_RESULT_TEX7) {
c.point_attrs[i].CoordReplace =
ctx->Point.CoordReplace[i - VERT_RESULT_TEX0];
}
else {
c.point_attrs[i].CoordReplace = GL_FALSE;
}
idx++;
}
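/* Attributes present in the key get consecutive slots in ascending
* VERT_RESULT order; e.g. a vertex with position, one color and one
* texcoord ends up with those attributes at idx 0, 1 and 2.
*/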
/* Which primitive? Or all three?
*/
switch (key->primitive) {
case SF_TRIANGLES:
c.nr_verts = 3;
brw_emit_tri_setup( &c, GL_TRUE );
break;
case SF_LINES:
c.nr_verts = 2;
brw_emit_line_setup( &c, GL_TRUE );
break;
case SF_POINTS:
c.nr_verts = 1;
if (key->do_point_sprite)
brw_emit_point_sprite_setup( &c, GL_TRUE );
else
brw_emit_point_setup( &c, GL_TRUE );
break;
case SF_UNFILLED_TRIS:
c.nr_verts = 3;
brw_emit_anyprim_setup( &c );
break;
default:
assert(0);
return;
}
/* get the program
*/
program = brw_get_program(&c.func, &program_size);
/* Upload
*/
dri_bo_unreference(brw->sf.prog_bo);
brw->sf.prog_bo = brw_upload_cache( &brw->cache, BRW_SF_PROG,
&c.key, sizeof(c.key),
NULL, 0,
program, program_size,
&c.prog_data,
&brw->sf.prog_data );
}
/* Calculate interpolants for triangle and line rasterization.
*/
static void upload_sf_prog(struct brw_context *brw)
{
GLcontext *ctx = &brw->intel.ctx;
struct brw_sf_prog_key key;
memset(&key, 0, sizeof(key));
/* Populate the key, noting state dependencies:
*/
/* CACHE_NEW_VS_PROG */
key.attrs = brw->vs.prog_data->outputs_written;
/* BRW_NEW_REDUCED_PRIMITIVE */
switch (brw->intel.reduced_primitive) {
case GL_TRIANGLES:
/* NOTE: We just use the edgeflag attribute as an indicator that
* unfilled triangles are active. We don't actually do the
* edgeflag testing here, it is already done in the clip
* program.
*/
if (key.attrs & (1<<VERT_RESULT_EDGE))
key.primitive = SF_UNFILLED_TRIS;
else
key.primitive = SF_TRIANGLES;
break;
case GL_LINES:
key.primitive = SF_LINES;
break;
case GL_POINTS:
key.primitive = SF_POINTS;
break;
}
key.do_point_sprite = ctx->Point.PointSprite;
key.SpriteOrigin = ctx->Point.SpriteOrigin;
/* _NEW_LIGHT */
key.do_flat_shading = (ctx->Light.ShadeModel == GL_FLAT);
key.do_twoside_color = (ctx->Light.Enabled && ctx->Light.Model.TwoSide);
/* _NEW_HINT */
key.linear_color = (ctx->Hint.PerspectiveCorrection == GL_FASTEST);
/* _NEW_POLYGON */
if (key.do_twoside_color) {
/* If we're rendering to a FBO, we have to invert the polygon
* face orientation, just as we invert the viewport in
* sf_unit_create_from_key(). ctx->DrawBuffer->Name will be
* nonzero if we're rendering to such an FBO.
*/
key.frontface_ccw = (ctx->Polygon.FrontFace == GL_CCW) ^ (ctx->DrawBuffer->Name != 0);
}
dri_bo_unreference(brw->sf.prog_bo);
brw->sf.prog_bo = brw_search_cache(&brw->cache, BRW_SF_PROG,
&key, sizeof(key),
NULL, 0,
&brw->sf.prog_data);
if (brw->sf.prog_bo == NULL)
compile_sf_prog( brw, &key );
}
const struct brw_tracked_state brw_sf_prog = {
.dirty = {
.mesa = (_NEW_HINT | _NEW_LIGHT | _NEW_POLYGON | _NEW_POINT),
.brw = (BRW_NEW_REDUCED_PRIMITIVE),
.cache = CACHE_NEW_VS_PROG
},
.prepare = upload_sf_prog
};


@ -0,0 +1,113 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#ifndef BRW_SF_H
#define BRW_SF_H
#include "shader/program.h"
#include "brw_context.h"
#include "brw_eu.h"
#define SF_POINTS 0
#define SF_LINES 1
#define SF_TRIANGLES 2
#define SF_UNFILLED_TRIS 3
struct brw_sf_prog_key {
GLuint attrs:32;
GLuint primitive:2;
GLuint do_twoside_color:1;
GLuint do_flat_shading:1;
GLuint frontface_ccw:1;
GLuint do_point_sprite:1;
GLuint linear_color:1; /**< linear interp vs. perspective interp */
GLuint pad:25;
GLenum SpriteOrigin;
};
struct brw_sf_point_tex {
GLboolean CoordReplace;
};
struct brw_sf_compile {
struct brw_compile func;
struct brw_sf_prog_key key;
struct brw_sf_prog_data prog_data;
struct brw_reg pv;
struct brw_reg det;
struct brw_reg dx0;
struct brw_reg dx2;
struct brw_reg dy0;
struct brw_reg dy2;
/* z and 1/w passed in separately:
*/
struct brw_reg z[3];
struct brw_reg inv_w[3];
/* The vertices:
*/
struct brw_reg vert[3];
/* Temporaries, allocated after last vertex reg.
*/
struct brw_reg inv_det;
struct brw_reg a1_sub_a0;
struct brw_reg a2_sub_a0;
struct brw_reg tmp;
struct brw_reg m1Cx;
struct brw_reg m2Cy;
struct brw_reg m3C0;
GLuint nr_verts;
GLuint nr_attrs;
GLuint nr_attr_regs;
GLuint nr_setup_attrs;
GLuint nr_setup_regs;
GLubyte attr_to_idx[VERT_RESULT_MAX];
GLubyte idx_to_attr[VERT_RESULT_MAX];
struct brw_sf_point_tex point_attrs[VERT_RESULT_MAX];
};
void brw_emit_tri_setup( struct brw_sf_compile *c, GLboolean allocate );
void brw_emit_line_setup( struct brw_sf_compile *c, GLboolean allocate );
void brw_emit_point_setup( struct brw_sf_compile *c, GLboolean allocate );
void brw_emit_point_sprite_setup( struct brw_sf_compile *c, GLboolean allocate );
void brw_emit_anyprim_setup( struct brw_sf_compile *c );
#endif


@ -0,0 +1,739 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "main/glheader.h"
#include "main/macros.h"
#include "main/enums.h"
#include "intel_batchbuffer.h"
#include "brw_defines.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_util.h"
#include "brw_sf.h"
static struct brw_reg get_vert_attr(struct brw_sf_compile *c,
struct brw_reg vert,
GLuint attr)
{
GLuint off = c->attr_to_idx[attr] / 2;
GLuint sub = c->attr_to_idx[attr] % 2;
return brw_vec4_grf(vert.nr + off, sub * 4);
}
static GLboolean have_attr(struct brw_sf_compile *c,
GLuint attr)
{
return (c->key.attrs & (1<<attr)) ? 1 : 0;
}
/***********************************************************************
* Twoside lighting
*/
static void copy_bfc( struct brw_sf_compile *c,
struct brw_reg vert )
{
struct brw_compile *p = &c->func;
GLuint i;
for (i = 0; i < 2; i++) {
if (have_attr(c, VERT_RESULT_COL0+i) &&
have_attr(c, VERT_RESULT_BFC0+i))
brw_MOV(p,
get_vert_attr(c, vert, VERT_RESULT_COL0+i),
get_vert_attr(c, vert, VERT_RESULT_BFC0+i));
}
}
static void do_twoside_color( struct brw_sf_compile *c )
{
struct brw_compile *p = &c->func;
struct brw_instruction *if_insn;
GLuint backface_conditional = c->key.frontface_ccw ? BRW_CONDITIONAL_G : BRW_CONDITIONAL_L;
/* Already done in clip program:
*/
if (c->key.primitive == SF_UNFILLED_TRIS)
return;
/* XXX: What happens if BFC isn't present? This could only happen
* for user-supplied vertex programs, as t_vp_build.c always does
* the right thing.
*/
if (!(have_attr(c, VERT_RESULT_COL0) && have_attr(c, VERT_RESULT_BFC0)) &&
!(have_attr(c, VERT_RESULT_COL1) && have_attr(c, VERT_RESULT_BFC1)))
return;
/* Need to use BRW_EXECUTE_4 and also do a 4-wide compare in order
* to get all channels active inside the IF. In the clipping code
* we run with NoMask, so it's not an option and we can use
* BRW_EXECUTE_1 for all comparisons.
*/
brw_push_insn_state(p);
brw_CMP(p, vec4(brw_null_reg()), backface_conditional, c->det, brw_imm_f(0));
if_insn = brw_IF(p, BRW_EXECUTE_4);
{
switch (c->nr_verts) {
case 3: copy_bfc(c, c->vert[2]);
case 2: copy_bfc(c, c->vert[1]);
case 1: copy_bfc(c, c->vert[0]);
}
}
brw_ENDIF(p, if_insn);
brw_pop_insn_state(p);
}
/***********************************************************************
* Flat shading
*/
#define VERT_RESULT_COLOR_BITS ((1<<VERT_RESULT_COL0) | \
(1<<VERT_RESULT_COL1))
static void copy_colors( struct brw_sf_compile *c,
struct brw_reg dst,
struct brw_reg src)
{
struct brw_compile *p = &c->func;
GLuint i;
for (i = VERT_RESULT_COL0; i <= VERT_RESULT_COL1; i++) {
if (have_attr(c,i))
brw_MOV(p,
get_vert_attr(c, dst, i),
get_vert_attr(c, src, i));
}
}
/* Need to use a computed jump to copy flatshaded attributes as the
* vertices are ordered according to y-coordinate before reaching this
* point, so the PV could be anywhere.
*/
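/* Layout of the jump table below: the first two cases each emit 2*nr
* colour MOVs plus a trailing JMPI, so the provoking-vertex index is
* scaled by jmpi*(nr*2+1) to select its case, and the forward jumps of
* jmpi*(nr*4+1) and jmpi*nr*2 skip over the remaining cases (jmpi
* doubles the offsets on IGDNG).
*/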
static void do_flatshade_triangle( struct brw_sf_compile *c )
{
struct brw_compile *p = &c->func;
struct brw_reg ip = brw_ip_reg();
GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
GLuint jmpi = 1;
if (!nr)
return;
/* Already done in clip program:
*/
if (c->key.primitive == SF_UNFILLED_TRIS)
return;
if (BRW_IS_IGDNG(p->brw))
jmpi = 2;
brw_push_insn_state(p);
brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr*2+1)));
brw_JMPI(p, ip, ip, c->pv);
copy_colors(c, c->vert[1], c->vert[0]);
copy_colors(c, c->vert[2], c->vert[0]);
brw_JMPI(p, ip, ip, brw_imm_d(jmpi*(nr*4+1)));
copy_colors(c, c->vert[0], c->vert[1]);
copy_colors(c, c->vert[2], c->vert[1]);
brw_JMPI(p, ip, ip, brw_imm_d(jmpi*nr*2));
copy_colors(c, c->vert[0], c->vert[2]);
copy_colors(c, c->vert[1], c->vert[2]);
brw_pop_insn_state(p);
}
static void do_flatshade_line( struct brw_sf_compile *c )
{
struct brw_compile *p = &c->func;
struct brw_reg ip = brw_ip_reg();
GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
GLuint jmpi = 1;
if (!nr)
return;
/* Already done in clip program:
*/
if (c->key.primitive == SF_UNFILLED_TRIS)
return;
if (BRW_IS_IGDNG(p->brw))
jmpi = 2;
brw_push_insn_state(p);
brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr+1)));
brw_JMPI(p, ip, ip, c->pv);
copy_colors(c, c->vert[1], c->vert[0]);
brw_JMPI(p, ip, ip, brw_imm_ud(jmpi*nr));
copy_colors(c, c->vert[0], c->vert[1]);
brw_pop_insn_state(p);
}
/***********************************************************************
* Triangle setup.
*/
static void alloc_regs( struct brw_sf_compile *c )
{
GLuint reg, i;
/* Values computed by fixed function unit:
*/
c->pv = retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_D);
c->det = brw_vec1_grf(1, 2);
c->dx0 = brw_vec1_grf(1, 3);
c->dx2 = brw_vec1_grf(1, 4);
c->dy0 = brw_vec1_grf(1, 5);
c->dy2 = brw_vec1_grf(1, 6);
/* z and 1/w passed in separately:
*/
c->z[0] = brw_vec1_grf(2, 0);
c->inv_w[0] = brw_vec1_grf(2, 1);
c->z[1] = brw_vec1_grf(2, 2);
c->inv_w[1] = brw_vec1_grf(2, 3);
c->z[2] = brw_vec1_grf(2, 4);
c->inv_w[2] = brw_vec1_grf(2, 5);
/* The vertices:
*/
reg = 3;
for (i = 0; i < c->nr_verts; i++) {
c->vert[i] = brw_vec8_grf(reg, 0);
reg += c->nr_attr_regs;
}
/* Temporaries, allocated after last vertex reg.
*/
c->inv_det = brw_vec1_grf(reg, 0); reg++;
c->a1_sub_a0 = brw_vec8_grf(reg, 0); reg++;
c->a2_sub_a0 = brw_vec8_grf(reg, 0); reg++;
c->tmp = brw_vec8_grf(reg, 0); reg++;
/* Note grf allocation:
*/
c->prog_data.total_grf = reg;
/* Outputs of this program - interpolation coefficients for
* rasterization:
*/
c->m1Cx = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 1, 0);
c->m2Cy = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 2, 0);
c->m3C0 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 3, 0);
}
static void copy_z_inv_w( struct brw_sf_compile *c )
{
struct brw_compile *p = &c->func;
GLuint i;
brw_push_insn_state(p);
/* Copy both scalars with a single MOV:
*/
for (i = 0; i < c->nr_verts; i++)
brw_MOV(p, vec2(suboffset(c->vert[i], 2)), vec2(c->z[i]));
brw_pop_insn_state(p);
}
static void invert_det( struct brw_sf_compile *c)
{
/* Looks like we invert all 8 elements just to get 1/det in
* position 2 !?!
*/
brw_math(&c->func,
c->inv_det,
BRW_MATH_FUNCTION_INV,
BRW_MATH_SATURATE_NONE,
0,
c->det,
BRW_MATH_DATA_SCALAR,
BRW_MATH_PRECISION_FULL);
}
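/* Compute predicate flag values for one register pair: the low nibble of
* each mask covers the four channels of the first attribute in the pair,
* the high nibble the second when present. pc selects the channels
* written at all, pc_persp those premultiplied by 1/w, and pc_linear
* those for which interpolation coefficients are emitted.
*/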
static GLboolean calculate_masks( struct brw_sf_compile *c,
GLuint reg,
GLushort *pc,
GLushort *pc_persp,
GLushort *pc_linear)
{
GLboolean is_last_attr = (reg == c->nr_setup_regs - 1);
GLuint persp_mask;
GLuint linear_mask;
if (c->key.do_flat_shading || c->key.linear_color)
persp_mask = c->key.attrs & ~(FRAG_BIT_WPOS |
FRAG_BIT_COL0 |
FRAG_BIT_COL1);
else
persp_mask = c->key.attrs & ~(FRAG_BIT_WPOS);
if (c->key.do_flat_shading)
linear_mask = c->key.attrs & ~(FRAG_BIT_COL0|FRAG_BIT_COL1);
else
linear_mask = c->key.attrs;
*pc_persp = 0;
*pc_linear = 0;
*pc = 0xf;
if (persp_mask & (1 << c->idx_to_attr[reg*2]))
*pc_persp = 0xf;
if (linear_mask & (1 << c->idx_to_attr[reg*2]))
*pc_linear = 0xf;
/* Maybe only process one attribute on the final round:
*/
if (reg*2+1 < c->nr_setup_attrs) {
*pc |= 0xf0;
if (persp_mask & (1 << c->idx_to_attr[reg*2+1]))
*pc_persp |= 0xf0;
if (linear_mask & (1 << c->idx_to_attr[reg*2+1]))
*pc_linear |= 0xf0;
}
return is_last_attr;
}
void brw_emit_tri_setup( struct brw_sf_compile *c, GLboolean allocate)
{
struct brw_compile *p = &c->func;
GLuint i;
c->nr_verts = 3;
if (allocate)
alloc_regs(c);
invert_det(c);
copy_z_inv_w(c);
if (c->key.do_twoside_color)
do_twoside_color(c);
if (c->key.do_flat_shading)
do_flatshade_triangle(c);
for (i = 0; i < c->nr_setup_regs; i++)
{
/* Pair of incoming attributes:
*/
struct brw_reg a0 = offset(c->vert[0], i);
struct brw_reg a1 = offset(c->vert[1], i);
struct brw_reg a2 = offset(c->vert[2], i);
GLushort pc, pc_persp, pc_linear;
GLboolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
if (pc_persp)
{
brw_set_predicate_control_flag_value(p, pc_persp);
brw_MUL(p, a0, a0, c->inv_w[0]);
brw_MUL(p, a1, a1, c->inv_w[1]);
brw_MUL(p, a2, a2, c->inv_w[2]);
}
/* Calculate coefficients for interpolated values:
*/
if (pc_linear)
{
brw_set_predicate_control_flag_value(p, pc_linear);
brw_ADD(p, c->a1_sub_a0, a1, negate(a0));
brw_ADD(p, c->a2_sub_a0, a2, negate(a0));
/* calculate dA/dx
*/
brw_MUL(p, brw_null_reg(), c->a1_sub_a0, c->dy2);
brw_MAC(p, c->tmp, c->a2_sub_a0, negate(c->dy0));
brw_MUL(p, c->m1Cx, c->tmp, c->inv_det);
/* calculate dA/dy
*/
brw_MUL(p, brw_null_reg(), c->a2_sub_a0, c->dx0);
brw_MAC(p, c->tmp, c->a1_sub_a0, negate(c->dx2));
brw_MUL(p, c->m2Cy, c->tmp, c->inv_det);
}
{
brw_set_predicate_control_flag_value(p, pc);
/* start point for interpolation
*/
brw_MOV(p, c->m3C0, a0);
/* Copy m0..m3 to URB. m0 is implicitly copied from r0 in
* the send instruction:
*/
brw_urb_WRITE(p,
brw_null_reg(),
0,
brw_vec8_grf(0, 0), /* r0, will be copied to m0 */
0, /* allocate */
1, /* used */
4, /* msg len */
0, /* response len */
last, /* eot */
last, /* writes complete */
i*4, /* offset */
BRW_URB_SWIZZLE_TRANSPOSE); /* XXX: Swizzle control "SF to windower" */
}
}
}
void brw_emit_line_setup( struct brw_sf_compile *c, GLboolean allocate)
{
struct brw_compile *p = &c->func;
GLuint i;
c->nr_verts = 2;
if (allocate)
alloc_regs(c);
invert_det(c);
copy_z_inv_w(c);
if (c->key.do_flat_shading)
do_flatshade_line(c);
for (i = 0; i < c->nr_setup_regs; i++)
{
/* Pair of incoming attributes:
*/
struct brw_reg a0 = offset(c->vert[0], i);
struct brw_reg a1 = offset(c->vert[1], i);
GLushort pc, pc_persp, pc_linear;
GLboolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
if (pc_persp)
{
brw_set_predicate_control_flag_value(p, pc_persp);
brw_MUL(p, a0, a0, c->inv_w[0]);
brw_MUL(p, a1, a1, c->inv_w[1]);
}
/* Calculate coefficients for position, color:
*/
if (pc_linear) {
brw_set_predicate_control_flag_value(p, pc_linear);
brw_ADD(p, c->a1_sub_a0, a1, negate(a0));
brw_MUL(p, c->tmp, c->a1_sub_a0, c->dx0);
brw_MUL(p, c->m1Cx, c->tmp, c->inv_det);
brw_MUL(p, c->tmp, c->a1_sub_a0, c->dy0);
brw_MUL(p, c->m2Cy, c->tmp, c->inv_det);
}
{
brw_set_predicate_control_flag_value(p, pc);
/* start point for interpolation
*/
brw_MOV(p, c->m3C0, a0);
/* Copy m0..m3 to URB.
*/
brw_urb_WRITE(p,
brw_null_reg(),
0,
brw_vec8_grf(0, 0),
0, /* allocate */
1, /* used */
4, /* msg len */
0, /* response len */
last, /* eot */
last, /* writes complete */
i*4, /* urb destination offset */
BRW_URB_SWIZZLE_TRANSPOSE);
}
}
}
void brw_emit_point_sprite_setup( struct brw_sf_compile *c, GLboolean allocate)
{
struct brw_compile *p = &c->func;
GLuint i;
c->nr_verts = 1;
if (allocate)
alloc_regs(c);
copy_z_inv_w(c);
for (i = 0; i < c->nr_setup_regs; i++)
{
struct brw_sf_point_tex *tex = &c->point_attrs[c->idx_to_attr[2*i]];
struct brw_reg a0 = offset(c->vert[0], i);
GLushort pc, pc_persp, pc_linear;
GLboolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
if (pc_persp)
{
if (!tex->CoordReplace) {
brw_set_predicate_control_flag_value(p, pc_persp);
brw_MUL(p, a0, a0, c->inv_w[0]);
}
}
if (tex->CoordReplace) {
/* Calculate 1.0/PointWidth */
brw_math(&c->func,
c->tmp,
BRW_MATH_FUNCTION_INV,
BRW_MATH_SATURATE_NONE,
0,
c->dx0,
BRW_MATH_DATA_SCALAR,
BRW_MATH_PRECISION_FULL);
if (c->key.SpriteOrigin == GL_LOWER_LEFT) {
brw_MUL(p, c->m1Cx, c->tmp, c->inv_w[0]);
brw_MOV(p, vec1(suboffset(c->m1Cx, 1)), brw_imm_f(0.0));
brw_MUL(p, c->m2Cy, c->tmp, negate(c->inv_w[0]));
brw_MOV(p, vec1(suboffset(c->m2Cy, 0)), brw_imm_f(0.0));
} else {
brw_MUL(p, c->m1Cx, c->tmp, c->inv_w[0]);
brw_MOV(p, vec1(suboffset(c->m1Cx, 1)), brw_imm_f(0.0));
brw_MUL(p, c->m2Cy, c->tmp, c->inv_w[0]);
brw_MOV(p, vec1(suboffset(c->m2Cy, 0)), brw_imm_f(0.0));
}
} else {
brw_MOV(p, c->m1Cx, brw_imm_ud(0));
brw_MOV(p, c->m2Cy, brw_imm_ud(0));
}
{
brw_set_predicate_control_flag_value(p, pc);
if (tex->CoordReplace) {
if (c->key.SpriteOrigin == GL_LOWER_LEFT) {
brw_MUL(p, c->m3C0, c->inv_w[0], brw_imm_f(1.0));
brw_MOV(p, vec1(suboffset(c->m3C0, 0)), brw_imm_f(0.0));
}
else
brw_MOV(p, c->m3C0, brw_imm_f(0.0));
} else {
brw_MOV(p, c->m3C0, a0); /* constant value */
}
/* Copy m0..m3 to URB.
*/
brw_urb_WRITE(p,
brw_null_reg(),
0,
brw_vec8_grf(0, 0),
0, /* allocate */
1, /* used */
4, /* msg len */
0, /* response len */
last, /* eot */
last, /* writes complete */
i*4, /* urb destination offset */
BRW_URB_SWIZZLE_TRANSPOSE);
}
}
}
/* Points setup - several simplifications as all attributes are
* constant across the face of the point (point sprites excluded!)
*/
void brw_emit_point_setup( struct brw_sf_compile *c, GLboolean allocate)
{
struct brw_compile *p = &c->func;
GLuint i;
c->nr_verts = 1;
if (allocate)
alloc_regs(c);
copy_z_inv_w(c);
brw_MOV(p, c->m1Cx, brw_imm_ud(0)); /* zero - move out of loop */
brw_MOV(p, c->m2Cy, brw_imm_ud(0)); /* zero - move out of loop */
for (i = 0; i < c->nr_setup_regs; i++)
{
struct brw_reg a0 = offset(c->vert[0], i);
GLushort pc, pc_persp, pc_linear;
GLboolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
if (pc_persp)
{
/* This seems odd as the values are all constant, but the
* fragment shader will be expecting it:
*/
brw_set_predicate_control_flag_value(p, pc_persp);
brw_MUL(p, a0, a0, c->inv_w[0]);
}
/* The delta values are always zero, just send the starting
* coordinate. Again, this is to fit in with the interpolation
* code in the fragment shader.
*/
{
brw_set_predicate_control_flag_value(p, pc);
brw_MOV(p, c->m3C0, a0); /* constant value */
/* Copy m0..m3 to URB.
*/
brw_urb_WRITE(p,
brw_null_reg(),
0,
brw_vec8_grf(0, 0),
0, /* allocate */
1, /* used */
4, /* msg len */
0, /* response len */
last, /* eot */
last, /* writes complete */
i*4, /* urb destination offset */
BRW_URB_SWIZZLE_TRANSPOSE);
}
}
}
void brw_emit_anyprim_setup( struct brw_sf_compile *c )
{
struct brw_compile *p = &c->func;
struct brw_reg ip = brw_ip_reg();
struct brw_reg payload_prim = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0);
struct brw_reg payload_attr = get_element_ud(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0), 0);
struct brw_reg primmask;
struct brw_instruction *jmp;
struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
GLuint saveflag;
c->nr_verts = 3;
alloc_regs(c);
primmask = retype(get_element(c->tmp, 0), BRW_REGISTER_TYPE_UD);
brw_MOV(p, primmask, brw_imm_ud(1));
brw_SHL(p, primmask, primmask, payload_prim);
brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_TRILIST) |
(1<<_3DPRIM_TRISTRIP) |
(1<<_3DPRIM_TRIFAN) |
(1<<_3DPRIM_TRISTRIP_REVERSE) |
(1<<_3DPRIM_POLYGON) |
(1<<_3DPRIM_RECTLIST) |
(1<<_3DPRIM_TRIFAN_NOSTIPPLE)));
jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
{
saveflag = p->flag_value;
brw_push_insn_state(p);
brw_emit_tri_setup( c, GL_FALSE );
brw_pop_insn_state(p);
p->flag_value = saveflag;
/* note - thread killed in subroutine, so must
* restore the flag which is changed when building
* the subroutine. fix #13240
*/
}
brw_land_fwd_jump(p, jmp);
brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_LINELIST) |
(1<<_3DPRIM_LINESTRIP) |
(1<<_3DPRIM_LINELOOP) |
(1<<_3DPRIM_LINESTRIP_CONT) |
(1<<_3DPRIM_LINESTRIP_BF) |
(1<<_3DPRIM_LINESTRIP_CONT_BF)));
jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
{
saveflag = p->flag_value;
brw_push_insn_state(p);
brw_emit_line_setup( c, GL_FALSE );
brw_pop_insn_state(p);
p->flag_value = saveflag;
/* note - thread killed in subroutine */
}
brw_land_fwd_jump(p, jmp);
brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
brw_AND(p, v1_null_ud, payload_attr, brw_imm_ud(1<<BRW_SPRITE_POINT_ENABLE));
jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
{
saveflag = p->flag_value;
brw_push_insn_state(p);
brw_emit_point_sprite_setup( c, GL_FALSE );
brw_pop_insn_state(p);
p->flag_value = saveflag;
}
brw_land_fwd_jump(p, jmp);
brw_emit_point_setup( c, GL_FALSE );
}

View File

@ -0,0 +1,365 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "brw_context.h"
#include "brw_state.h"
#include "brw_defines.h"
#include "main/macros.h"
#include "intel_fbo.h"
static void upload_sf_vp(struct brw_context *brw)
{
GLcontext *ctx = &brw->intel.ctx;
const GLfloat depth_scale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
struct brw_sf_viewport sfv;
GLfloat y_scale, y_bias;
const GLboolean render_to_fbo = (ctx->DrawBuffer->Name != 0);
const GLfloat *v = ctx->Viewport._WindowMap.m;
memset(&sfv, 0, sizeof(sfv));
if (render_to_fbo) {
y_scale = 1.0;
y_bias = 0;
}
else {
y_scale = -1.0;
y_bias = ctx->DrawBuffer->Height;
}
/* _NEW_VIEWPORT */
sfv.viewport.m00 = v[MAT_SX];
sfv.viewport.m11 = v[MAT_SY] * y_scale;
sfv.viewport.m22 = v[MAT_SZ] * depth_scale;
sfv.viewport.m30 = v[MAT_TX];
sfv.viewport.m31 = v[MAT_TY] * y_scale + y_bias;
sfv.viewport.m32 = v[MAT_TZ] * depth_scale;
/* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT
* for DrawBuffer->_[XY]{min,max}
*/
/* The scissor only needs to handle the intersection of drawable and
* scissor rect. Clipping to the boundaries of static shared buffers
* for front/back/depth is covered by looping over cliprects in brw_draw.c.
*
* Note that the hardware's coordinates are inclusive, while Mesa's min is
* inclusive but max is exclusive.
*/
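/* For example, a scissor-disabled 640x480 window drawable (_Xmin=0,
* _Xmax=640, _Ymin=0, _Ymax=480) becomes the inclusive hardware rect
* xmin=0, xmax=639, ymin=480-480=0, ymax=480-0-1=479 after the Y flip.
*/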
if (render_to_fbo) {
/* texmemory: Y=0=bottom */
sfv.scissor.xmin = ctx->DrawBuffer->_Xmin;
sfv.scissor.xmax = ctx->DrawBuffer->_Xmax - 1;
sfv.scissor.ymin = ctx->DrawBuffer->_Ymin;
sfv.scissor.ymax = ctx->DrawBuffer->_Ymax - 1;
}
else {
/* memory: Y=0=top */
sfv.scissor.xmin = ctx->DrawBuffer->_Xmin;
sfv.scissor.xmax = ctx->DrawBuffer->_Xmax - 1;
sfv.scissor.ymin = ctx->DrawBuffer->Height - ctx->DrawBuffer->_Ymax;
sfv.scissor.ymax = ctx->DrawBuffer->Height - ctx->DrawBuffer->_Ymin - 1;
}
dri_bo_unreference(brw->sf.vp_bo);
brw->sf.vp_bo = brw_cache_data( &brw->cache, BRW_SF_VP, &sfv, NULL, 0 );
}
const struct brw_tracked_state brw_sf_vp = {
.dirty = {
.mesa = (_NEW_VIEWPORT |
_NEW_SCISSOR |
_NEW_BUFFERS),
.brw = 0,
.cache = 0
},
.prepare = upload_sf_vp
};
struct brw_sf_unit_key {
unsigned int total_grf;
unsigned int urb_entry_read_length;
unsigned int nr_urb_entries, urb_size, sfsize;
GLenum front_face, cull_face, provoking_vertex;
unsigned scissor:1;
unsigned line_smooth:1;
unsigned point_sprite:1;
unsigned point_attenuated:1;
unsigned render_to_fbo:1;
float line_width;
float point_size;
};
static void
sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key)
{
GLcontext *ctx = &brw->intel.ctx;
memset(key, 0, sizeof(*key));
/* CACHE_NEW_SF_PROG */
key->total_grf = brw->sf.prog_data->total_grf;
key->urb_entry_read_length = brw->sf.prog_data->urb_read_length;
/* BRW_NEW_URB_FENCE */
key->nr_urb_entries = brw->urb.nr_sf_entries;
key->urb_size = brw->urb.vsize;
key->sfsize = brw->urb.sfsize;
key->scissor = ctx->Scissor.Enabled;
key->front_face = ctx->Polygon.FrontFace;
if (ctx->Polygon.CullFlag)
key->cull_face = ctx->Polygon.CullFaceMode;
else
key->cull_face = GL_NONE;
key->line_width = ctx->Line.Width;
key->line_smooth = ctx->Line.SmoothFlag;
key->point_sprite = ctx->Point.PointSprite;
key->point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
key->point_attenuated = ctx->Point._Attenuated;
/* _NEW_LIGHT */
key->provoking_vertex = ctx->Light.ProvokingVertex;
key->render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
}
static dri_bo *
sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
dri_bo **reloc_bufs)
{
struct brw_sf_unit_state sf;
dri_bo *bo;
int chipset_max_threads;
memset(&sf, 0, sizeof(sf));
sf.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1;
sf.thread0.kernel_start_pointer = brw->sf.prog_bo->offset >> 6; /* reloc */
sf.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
sf.thread3.dispatch_grf_start_reg = 3;
if (BRW_IS_IGDNG(brw))
sf.thread3.urb_entry_read_offset = 3;
else
sf.thread3.urb_entry_read_offset = 1;
sf.thread3.urb_entry_read_length = key->urb_entry_read_length;
sf.thread4.nr_urb_entries = key->nr_urb_entries;
sf.thread4.urb_entry_allocation_size = key->sfsize - 1;
/* Each SF thread produces 1 PUE, and there can be up to 24 (pre-IGDNG) or
* 48 (IGDNG) threads.
*/
if (BRW_IS_IGDNG(brw))
chipset_max_threads = 48;
else
chipset_max_threads = 24;
sf.thread4.max_threads = MIN2(chipset_max_threads, key->nr_urb_entries) - 1;
if (INTEL_DEBUG & DEBUG_SINGLE_THREAD)
sf.thread4.max_threads = 0;
if (INTEL_DEBUG & DEBUG_STATS)
sf.thread4.stats_enable = 1;
/* CACHE_NEW_SF_VP */
sf.sf5.sf_viewport_state_offset = brw->sf.vp_bo->offset >> 5; /* reloc */
sf.sf5.viewport_transform = 1;
/* _NEW_SCISSOR */
if (key->scissor)
sf.sf6.scissor = 1;
/* _NEW_POLYGON */
if (key->front_face == GL_CCW)
sf.sf5.front_winding = BRW_FRONTWINDING_CCW;
else
sf.sf5.front_winding = BRW_FRONTWINDING_CW;
/* The viewport is inverted for rendering to a FBO, and that inverts
* polygon front/back orientation.
*/
sf.sf5.front_winding ^= key->render_to_fbo;
switch (key->cull_face) {
case GL_FRONT:
sf.sf6.cull_mode = BRW_CULLMODE_FRONT;
break;
case GL_BACK:
sf.sf6.cull_mode = BRW_CULLMODE_BACK;
break;
case GL_FRONT_AND_BACK:
sf.sf6.cull_mode = BRW_CULLMODE_BOTH;
break;
case GL_NONE:
sf.sf6.cull_mode = BRW_CULLMODE_NONE;
break;
default:
assert(0);
break;
}
/* _NEW_LINE */
/* XXX use ctx->Const.Min/MaxLineWidth here */
sf.sf6.line_width = CLAMP(key->line_width, 1.0, 5.0) * (1<<1);
sf.sf6.line_endcap_aa_region_width = 1;
if (key->line_smooth)
sf.sf6.aa_enable = 1;
else if (sf.sf6.line_width <= 0x2)
sf.sf6.line_width = 0;
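/* The multiply by (1<<1) above suggests the width is taken in fixed point
* with one fractional bit; non-smoothed lines of width <= 1.0 (encoded
* value <= 0x2) are reset to 0, presumably requesting the hardware's
* nominal one-pixel line.
*/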
/* _NEW_BUFFERS */
key->render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
if (!key->render_to_fbo) {
/* Rendering to an OpenGL window */
sf.sf6.point_rast_rule = BRW_RASTRULE_UPPER_RIGHT;
}
else {
/* If rendering to an FBO, the pixel coordinate system is
* inverted with respect to the normal OpenGL coordinate
* system, so BRW_RASTRULE_LOWER_RIGHT is correct.
* But this value is listed as "Reserved, but not seen as useful"
* in Intel documentation (page 212, "Point Rasterization Rule",
* section 7.4 "SF Pipeline State Summary", of document
* "Intel® 965 Express Chipset Family and Intel® G35 Express
* Chipset Graphics Controller Programmer's Reference Manual,
* Volume 2: 3D/Media", Revision 1.0b as of January 2008,
* available at
* http://intellinuxgraphics.org/documentation.html
* at the time of this writing).
*
* It does work on at least some devices, if not all;
* if devices that don't support it can be identified,
* the likely failure case is that points are rasterized
* incorrectly, which is no worse than occurs without
* the value, so we're using it here.
*/
sf.sf6.point_rast_rule = BRW_RASTRULE_LOWER_RIGHT;
}
/* XXX clamp max depends on AA vs. non-AA */
/* _NEW_POINT */
sf.sf7.sprite_point = key->point_sprite;
sf.sf7.point_size = CLAMP(rint(key->point_size), 1, 255) * (1<<3);
sf.sf7.use_point_size_state = !key->point_attenuated;
sf.sf7.aa_line_distance_mode = 0;
/* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons:
*/
if (key->provoking_vertex == GL_LAST_VERTEX_CONVENTION) {
sf.sf7.trifan_pv = 2;
sf.sf7.linestrip_pv = 1;
sf.sf7.tristrip_pv = 2;
} else {
sf.sf7.trifan_pv = 1;
sf.sf7.linestrip_pv = 0;
sf.sf7.tristrip_pv = 0;
}
sf.sf7.line_last_pixel_enable = 0;
/* Set bias for OpenGL rasterization rules:
*/
sf.sf6.dest_org_vbias = 0x8;
sf.sf6.dest_org_hbias = 0x8;
bo = brw_upload_cache(&brw->cache, BRW_SF_UNIT,
key, sizeof(*key),
reloc_bufs, 2,
&sf, sizeof(sf),
NULL, NULL);
/* STATE_PREFETCH command description describes this state as being
* something loaded through the GPE (L2 ISC), so it's INSTRUCTION domain.
*/
/* Emit SF program relocation */
dri_bo_emit_reloc(bo,
I915_GEM_DOMAIN_INSTRUCTION, 0,
sf.thread0.grf_reg_count << 1,
offsetof(struct brw_sf_unit_state, thread0),
brw->sf.prog_bo);
/* Emit SF viewport relocation */
dri_bo_emit_reloc(bo,
I915_GEM_DOMAIN_INSTRUCTION, 0,
sf.sf5.front_winding | (sf.sf5.viewport_transform << 1),
offsetof(struct brw_sf_unit_state, sf5),
brw->sf.vp_bo);
return bo;
}
static void upload_sf_unit( struct brw_context *brw )
{
struct brw_sf_unit_key key;
dri_bo *reloc_bufs[2];
sf_unit_populate_key(brw, &key);
reloc_bufs[0] = brw->sf.prog_bo;
reloc_bufs[1] = brw->sf.vp_bo;
dri_bo_unreference(brw->sf.state_bo);
brw->sf.state_bo = brw_search_cache(&brw->cache, BRW_SF_UNIT,
&key, sizeof(key),
reloc_bufs, 2,
NULL);
if (brw->sf.state_bo == NULL) {
brw->sf.state_bo = sf_unit_create_from_key(brw, &key, reloc_bufs);
}
}
const struct brw_tracked_state brw_sf_unit = {
.dirty = {
.mesa = (_NEW_POLYGON |
_NEW_LIGHT |
_NEW_LINE |
_NEW_POINT |
_NEW_SCISSOR |
_NEW_BUFFERS),
.brw = BRW_NEW_URB_FENCE,
.cache = (CACHE_NEW_SF_VP |
CACHE_NEW_SF_PROG)
},
.prepare = upload_sf_unit,
};

View File

@ -0,0 +1,173 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#ifndef BRW_STATE_H
#define BRW_STATE_H
#include "brw_context.h"
static inline void
brw_add_validated_bo(struct brw_context *brw, dri_bo *bo)
{
assert(brw->state.validated_bo_count < ARRAY_SIZE(brw->state.validated_bos));
if (bo != NULL) {
dri_bo_reference(bo);
brw->state.validated_bos[brw->state.validated_bo_count++] = bo;
}
}
const struct brw_tracked_state brw_blend_constant_color;
const struct brw_tracked_state brw_cc_unit;
const struct brw_tracked_state brw_cc_vp;
const struct brw_tracked_state brw_check_fallback;
const struct brw_tracked_state brw_clip_prog;
const struct brw_tracked_state brw_clip_unit;
const struct brw_tracked_state brw_constant_buffer;
const struct brw_tracked_state brw_curbe_offsets;
const struct brw_tracked_state brw_invarient_state;
const struct brw_tracked_state brw_gs_prog;
const struct brw_tracked_state brw_gs_unit;
const struct brw_tracked_state brw_line_stipple;
const struct brw_tracked_state brw_aa_line_parameters;
const struct brw_tracked_state brw_pipelined_state_pointers;
const struct brw_tracked_state brw_binding_table_pointers;
const struct brw_tracked_state brw_depthbuffer;
const struct brw_tracked_state brw_polygon_stipple_offset;
const struct brw_tracked_state brw_polygon_stipple;
const struct brw_tracked_state brw_program_parameters;
const struct brw_tracked_state brw_recalculate_urb_fence;
const struct brw_tracked_state brw_sf_prog;
const struct brw_tracked_state brw_sf_unit;
const struct brw_tracked_state brw_sf_vp;
const struct brw_tracked_state brw_state_base_address;
const struct brw_tracked_state brw_urb_fence;
const struct brw_tracked_state brw_vertex_state;
const struct brw_tracked_state brw_vs_surfaces;
const struct brw_tracked_state brw_vs_prog;
const struct brw_tracked_state brw_vs_unit;
const struct brw_tracked_state brw_wm_input_sizes;
const struct brw_tracked_state brw_wm_prog;
const struct brw_tracked_state brw_wm_samplers;
const struct brw_tracked_state brw_wm_constant_surface;
const struct brw_tracked_state brw_wm_surfaces;
const struct brw_tracked_state brw_wm_unit;
const struct brw_tracked_state brw_psp_urb_cbs;
const struct brw_tracked_state brw_pipe_control;
const struct brw_tracked_state brw_drawing_rect;
const struct brw_tracked_state brw_indices;
const struct brw_tracked_state brw_vertices;
const struct brw_tracked_state brw_index_buffer;
/**
* Use same key for WM and VS surfaces.
*/
struct brw_surface_key {
GLenum target, depthmode;
dri_bo *bo;
GLint format, internal_format;
GLint first_level, last_level;
GLint width, height, depth;
GLint pitch, cpp;
uint32_t tiling;
GLuint offset;
};
/***********************************************************************
* brw_state.c
*/
void brw_validate_state(struct brw_context *brw);
void brw_upload_state(struct brw_context *brw);
void brw_init_state(struct brw_context *brw);
void brw_destroy_state(struct brw_context *brw);
/***********************************************************************
* brw_state_cache.c
*/
dri_bo *brw_cache_data(struct brw_cache *cache,
enum brw_cache_id cache_id,
const void *data,
dri_bo **reloc_bufs,
GLuint nr_reloc_bufs);
dri_bo *brw_cache_data_sz(struct brw_cache *cache,
enum brw_cache_id cache_id,
const void *data,
GLuint data_size,
dri_bo **reloc_bufs,
GLuint nr_reloc_bufs);
dri_bo *brw_upload_cache( struct brw_cache *cache,
enum brw_cache_id cache_id,
const void *key,
GLuint key_sz,
dri_bo **reloc_bufs,
GLuint nr_reloc_bufs,
const void *data,
GLuint data_sz,
const void *aux,
void *aux_return );
dri_bo *brw_search_cache( struct brw_cache *cache,
enum brw_cache_id cache_id,
const void *key,
GLuint key_size,
dri_bo **reloc_bufs,
GLuint nr_reloc_bufs,
void *aux_return);
void brw_state_cache_check_size( struct brw_context *brw );
void brw_init_caches( struct brw_context *brw );
void brw_destroy_caches( struct brw_context *brw );
void brw_state_cache_bo_delete(struct brw_cache *cache, dri_bo *bo);
/***********************************************************************
* brw_state_batch.c
*/
#define BRW_BATCH_STRUCT(brw, s) intel_batchbuffer_data( brw->intel.batch, (s), sizeof(*(s)), IGNORE_CLIPRECTS)
#define BRW_CACHED_BATCH_STRUCT(brw, s) brw_cached_batch_struct( brw, (s), sizeof(*(s)) )
GLboolean brw_cached_batch_struct( struct brw_context *brw,
const void *data,
GLuint sz );
void brw_destroy_batch_cache( struct brw_context *brw );
void brw_clear_batch_cache( struct brw_context *brw );
/* brw_wm_surface_state.c */
dri_bo *
brw_create_constant_surface( struct brw_context *brw,
struct brw_surface_key *key );
#endif

View File

@ -0,0 +1,99 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "brw_state.h"
#include "intel_batchbuffer.h"
#include "main/imports.h"
/* A facility similar to the data caching code in brw_state_cache.c,
* which aims to prevent identical commands being issued repeatedly.
*/
GLboolean brw_cached_batch_struct( struct brw_context *brw,
const void *data,
GLuint sz )
{
struct brw_cached_batch_item *item = brw->cached_batch_items;
struct header *newheader = (struct header *)data;
if (brw->emit_state_always) {
intel_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS);
return GL_TRUE;
}
while (item) {
if (item->header->opcode == newheader->opcode) {
if (item->sz == sz && memcmp(item->header, newheader, sz) == 0)
return GL_FALSE;
if (item->sz != sz) {
_mesa_free(item->header);
item->header = _mesa_malloc(sz);
item->sz = sz;
}
goto emit;
}
item = item->next;
}
assert(!item);
item = CALLOC_STRUCT(brw_cached_batch_item);
item->header = _mesa_malloc(sz);
item->sz = sz;
item->next = brw->cached_batch_items;
brw->cached_batch_items = item;
emit:
memcpy(item->header, newheader, sz);
intel_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS);
return GL_TRUE;
}
void brw_clear_batch_cache( struct brw_context *brw )
{
struct brw_cached_batch_item *item = brw->cached_batch_items;
while (item) {
struct brw_cached_batch_item *next = item->next;
free((void *)item->header);
free(item);
item = next;
}
brw->cached_batch_items = NULL;
}
void brw_destroy_batch_cache( struct brw_context *brw )
{
brw_clear_batch_cache(brw);
}

View File

@ -0,0 +1,597 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
/** @file brw_state_cache.c
*
* This file implements a simple static state cache for 965. Consumers
* query the hash table of state using a cache_id, opaque key data, and a
* list of buffers that will be used in relocations, and receive the
* corresponding state buffer object (plus associated auxiliary data) in
* return.
*
* The inner workings are a simple hash table based on a rotate-and-XOR hash
* of the key data.
* The cache_id and relocation target buffers associated with the state
* buffer are included as auxiliary key data, but are not part of the hash
* value (this should be fixed, but will likely be fixed instead by making
* consumers use structured keys).
*
* Replacement is not implemented. Instead, when the cache gets too big, at
* a safe point (unlock) we throw out all of the cache data and let it
* regenerate for the next rendering operation.
*
* The reloc_buf pointers need to be included as key data, otherwise the
* non-unique values stuffed in the offset in key data through
* brw_cache_data() may result in a successful probe for state buffers
* even when the buffer being referenced doesn't match. The result would be
* that the same state cache entry is used twice for different buffers,
* only one of the two buffers referenced gets put into the offset, and the
* incorrect program is run for the other instance.
*/
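/* Typical consumer pattern (compare upload_sf_unit() in brw_sf_state.c):
* probe the cache first and only build/upload new state on a miss, e.g.
*
*    bo = brw_search_cache(&brw->cache, BRW_SF_UNIT,
*                          &key, sizeof(key), reloc_bufs, 2, NULL);
*    if (bo == NULL)
*       bo = sf_unit_create_from_key(brw, &key, reloc_bufs);
*/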
#include "main/imports.h"
#include "brw_state.h"
#include "intel_batchbuffer.h"
/* XXX: Fixme - have to include these to get the sizes of the prog_key
* structs:
*/
#include "brw_wm.h"
#include "brw_vs.h"
#include "brw_clip.h"
#include "brw_sf.h"
#include "brw_gs.h"
static GLuint
hash_key(const void *key, GLuint key_size,
dri_bo **reloc_bufs, GLuint nr_reloc_bufs)
{
GLuint *ikey = (GLuint *)key;
GLuint hash = 0, i;
assert(key_size % 4 == 0);
/* I'm sure this can be improved on:
*/
for (i = 0; i < key_size/4; i++) {
hash ^= ikey[i];
hash = (hash << 5) | (hash >> 27);
}
/* Include the BO pointers as key data as well */
ikey = (GLuint *)reloc_bufs;
key_size = nr_reloc_bufs * sizeof(dri_bo *);
for (i = 0; i < key_size/4; i++) {
hash ^= ikey[i];
hash = (hash << 5) | (hash >> 27);
}
return hash;
}
/**
* Marks a new buffer as being chosen for the given cache id.
*/
static void
update_cache_last(struct brw_cache *cache, enum brw_cache_id cache_id,
dri_bo *bo)
{
if (bo == cache->last_bo[cache_id])
return; /* no change */
dri_bo_unreference(cache->last_bo[cache_id]);
cache->last_bo[cache_id] = bo;
dri_bo_reference(cache->last_bo[cache_id]);
cache->brw->state.dirty.cache |= 1 << cache_id;
}
static struct brw_cache_item *
search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
GLuint hash, const void *key, GLuint key_size,
dri_bo **reloc_bufs, GLuint nr_reloc_bufs)
{
struct brw_cache_item *c;
#if 0
int bucketcount = 0;
for (c = cache->items[hash % cache->size]; c; c = c->next)
bucketcount++;
fprintf(stderr, "bucket %d/%d = %d/%d items\n", hash % cache->size,
cache->size, bucketcount, cache->n_items);
#endif
for (c = cache->items[hash % cache->size]; c; c = c->next) {
if (c->cache_id == cache_id &&
c->hash == hash &&
c->key_size == key_size &&
memcmp(c->key, key, key_size) == 0 &&
c->nr_reloc_bufs == nr_reloc_bufs &&
memcmp(c->reloc_bufs, reloc_bufs,
nr_reloc_bufs * sizeof(dri_bo *)) == 0)
return c;
}
return NULL;
}
static void
rehash(struct brw_cache *cache)
{
struct brw_cache_item **items;
struct brw_cache_item *c, *next;
GLuint size, i;
size = cache->size * 3;
items = (struct brw_cache_item**) _mesa_calloc(size * sizeof(*items));
for (i = 0; i < cache->size; i++)
for (c = cache->items[i]; c; c = next) {
next = c->next;
c->next = items[c->hash % size];
items[c->hash % size] = c;
}
FREE(cache->items);
cache->items = items;
cache->size = size;
}
/**
* Returns the buffer object matching cache_id and key, or NULL.
*/
dri_bo *
brw_search_cache(struct brw_cache *cache,
enum brw_cache_id cache_id,
const void *key,
GLuint key_size,
dri_bo **reloc_bufs, GLuint nr_reloc_bufs,
void *aux_return)
{
struct brw_cache_item *item;
GLuint hash = hash_key(key, key_size, reloc_bufs, nr_reloc_bufs);
item = search_cache(cache, cache_id, hash, key, key_size,
reloc_bufs, nr_reloc_bufs);
if (item == NULL)
return NULL;
if (aux_return)
*(void **)aux_return = (void *)((char *)item->key + item->key_size);
update_cache_last(cache, cache_id, item->bo);
dri_bo_reference(item->bo);
return item->bo;
}
dri_bo *
brw_upload_cache( struct brw_cache *cache,
enum brw_cache_id cache_id,
const void *key,
GLuint key_size,
dri_bo **reloc_bufs,
GLuint nr_reloc_bufs,
const void *data,
GLuint data_size,
const void *aux,
void *aux_return )
{
struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
GLuint hash = hash_key(key, key_size, reloc_bufs, nr_reloc_bufs);
GLuint relocs_size = nr_reloc_bufs * sizeof(dri_bo *);
GLuint aux_size = cache->aux_size[cache_id];
void *tmp;
dri_bo *bo;
int i;
/* Create the buffer object to contain the data */
bo = dri_bo_alloc(cache->brw->intel.bufmgr,
cache->name[cache_id], data_size, 1 << 6);
/* Set up the memory containing the key, aux_data, and reloc_bufs */
tmp = _mesa_malloc(key_size + aux_size + relocs_size);
memcpy(tmp, key, key_size);
memcpy(tmp + key_size, aux, cache->aux_size[cache_id]);
memcpy(tmp + key_size + aux_size, reloc_bufs, relocs_size);
for (i = 0; i < nr_reloc_bufs; i++) {
if (reloc_bufs[i] != NULL)
dri_bo_reference(reloc_bufs[i]);
}
item->cache_id = cache_id;
item->key = tmp;
item->hash = hash;
item->key_size = key_size;
item->reloc_bufs = tmp + key_size + aux_size;
item->nr_reloc_bufs = nr_reloc_bufs;
item->bo = bo;
dri_bo_reference(bo);
item->data_size = data_size;
if (cache->n_items > cache->size * 1.5)
rehash(cache);
hash %= cache->size;
item->next = cache->items[hash];
cache->items[hash] = item;
cache->n_items++;
if (aux_return) {
assert(cache->aux_size[cache_id]);
*(void **)aux_return = (void *)((char *)item->key + item->key_size);
}
if (INTEL_DEBUG & DEBUG_STATE)
_mesa_printf("upload %s: %d bytes to cache id %d\n",
cache->name[cache_id],
data_size, cache_id);
/* Copy data to the buffer */
dri_bo_subdata(bo, 0, data_size, data);
update_cache_last(cache, cache_id, bo);
return bo;
}
/**
* This doesn't really work with aux data. Use search/upload instead.
*/
dri_bo *
brw_cache_data_sz(struct brw_cache *cache,
enum brw_cache_id cache_id,
const void *data,
GLuint data_size,
dri_bo **reloc_bufs,
GLuint nr_reloc_bufs)
{
dri_bo *bo;
struct brw_cache_item *item;
GLuint hash = hash_key(data, data_size, reloc_bufs, nr_reloc_bufs);
item = search_cache(cache, cache_id, hash, data, data_size,
reloc_bufs, nr_reloc_bufs);
if (item) {
update_cache_last(cache, cache_id, item->bo);
dri_bo_reference(item->bo);
return item->bo;
}
bo = brw_upload_cache(cache, cache_id,
data, data_size,
reloc_bufs, nr_reloc_bufs,
data, data_size,
NULL, NULL);
return bo;
}
/**
* Wrapper around brw_cache_data_sz using the cache_id's canonical key size.
*
* If nr_reloc_bufs is nonzero, brw_search_cache()/brw_upload_cache() would be
* better to use, as the potentially changing offsets in the data-used-as-key
* will result in excessive cache misses.
*/
dri_bo *
brw_cache_data(struct brw_cache *cache,
enum brw_cache_id cache_id,
const void *data,
dri_bo **reloc_bufs,
GLuint nr_reloc_bufs)
{
return brw_cache_data_sz(cache, cache_id, data, cache->key_size[cache_id],
reloc_bufs, nr_reloc_bufs);
}
enum pool_type {
DW_SURFACE_STATE,
DW_GENERAL_STATE
};
static void
brw_init_cache_id(struct brw_cache *cache,
const char *name,
enum brw_cache_id id,
GLuint key_size,
GLuint aux_size)
{
cache->name[id] = strdup(name);
cache->key_size[id] = key_size;
cache->aux_size[id] = aux_size;
}
static void
brw_init_non_surface_cache(struct brw_context *brw)
{
struct brw_cache *cache = &brw->cache;
cache->brw = brw;
cache->size = 7;
cache->n_items = 0;
cache->items = (struct brw_cache_item **)
_mesa_calloc(cache->size * sizeof(struct brw_cache_item));
brw_init_cache_id(cache,
"CC_VP",
BRW_CC_VP,
sizeof(struct brw_cc_viewport),
0);
brw_init_cache_id(cache,
"CC_UNIT",
BRW_CC_UNIT,
sizeof(struct brw_cc_unit_state),
0);
brw_init_cache_id(cache,
"WM_PROG",
BRW_WM_PROG,
sizeof(struct brw_wm_prog_key),
sizeof(struct brw_wm_prog_data));
brw_init_cache_id(cache,
"SAMPLER_DEFAULT_COLOR",
BRW_SAMPLER_DEFAULT_COLOR,
sizeof(struct brw_sampler_default_color),
0);
brw_init_cache_id(cache,
"SAMPLER",
BRW_SAMPLER,
0, /* variable key/data size */
0);
brw_init_cache_id(cache,
"WM_UNIT",
BRW_WM_UNIT,
sizeof(struct brw_wm_unit_state),
0);
brw_init_cache_id(cache,
"SF_PROG",
BRW_SF_PROG,
sizeof(struct brw_sf_prog_key),
sizeof(struct brw_sf_prog_data));
brw_init_cache_id(cache,
"SF_VP",
BRW_SF_VP,
sizeof(struct brw_sf_viewport),
0);
brw_init_cache_id(cache,
"SF_UNIT",
BRW_SF_UNIT,
sizeof(struct brw_sf_unit_state),
0);
brw_init_cache_id(cache,
"VS_UNIT",
BRW_VS_UNIT,
sizeof(struct brw_vs_unit_state),
0);
brw_init_cache_id(cache,
"VS_PROG",
BRW_VS_PROG,
sizeof(struct brw_vs_prog_key),
sizeof(struct brw_vs_prog_data));
brw_init_cache_id(cache,
"CLIP_UNIT",
BRW_CLIP_UNIT,
sizeof(struct brw_clip_unit_state),
0);
brw_init_cache_id(cache,
"CLIP_PROG",
BRW_CLIP_PROG,
sizeof(struct brw_clip_prog_key),
sizeof(struct brw_clip_prog_data));
brw_init_cache_id(cache,
"GS_UNIT",
BRW_GS_UNIT,
sizeof(struct brw_gs_unit_state),
0);
brw_init_cache_id(cache,
"GS_PROG",
BRW_GS_PROG,
sizeof(struct brw_gs_prog_key),
sizeof(struct brw_gs_prog_data));
}
static void
brw_init_surface_cache(struct brw_context *brw)
{
struct brw_cache *cache = &brw->surface_cache;
cache->brw = brw;
cache->size = 7;
cache->n_items = 0;
cache->items = (struct brw_cache_item **)
_mesa_calloc(cache->size * sizeof(struct brw_cache_item));
brw_init_cache_id(cache,
"SS_SURFACE",
BRW_SS_SURFACE,
sizeof(struct brw_surface_state),
0);
brw_init_cache_id(cache,
"SS_SURF_BIND",
BRW_SS_SURF_BIND,
0,
0);
}
void
brw_init_caches(struct brw_context *brw)
{
brw_init_non_surface_cache(brw);
brw_init_surface_cache(brw);
}
static void
brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
{
struct brw_cache_item *c, *next;
GLuint i;
if (INTEL_DEBUG & DEBUG_STATE)
_mesa_printf("%s\n", __FUNCTION__);
for (i = 0; i < cache->size; i++) {
for (c = cache->items[i]; c; c = next) {
int j;
next = c->next;
for (j = 0; j < c->nr_reloc_bufs; j++)
dri_bo_unreference(c->reloc_bufs[j]);
dri_bo_unreference(c->bo);
free((void *)c->key);
free(c);
}
cache->items[i] = NULL;
}
cache->n_items = 0;
if (brw->curbe.last_buf) {
_mesa_free(brw->curbe.last_buf);
brw->curbe.last_buf = NULL;
}
brw->state.dirty.mesa |= ~0;
brw->state.dirty.brw |= ~0;
brw->state.dirty.cache |= ~0;
}
/* Clear all entries from the cache that point to the given bo.
*
* This lets us release memory for reuse earlier for known-dead buffers,
* at the cost of walking the entire hash table.
*/
void
brw_state_cache_bo_delete(struct brw_cache *cache, dri_bo *bo)
{
struct brw_cache_item **prev;
GLuint i;
if (INTEL_DEBUG & DEBUG_STATE)
_mesa_printf("%s\n", __FUNCTION__);
for (i = 0; i < cache->size; i++) {
for (prev = &cache->items[i]; *prev;) {
struct brw_cache_item *c = *prev;
if (drm_intel_bo_references(c->bo, bo)) {
int j;
*prev = c->next;
for (j = 0; j < c->nr_reloc_bufs; j++)
dri_bo_unreference(c->reloc_bufs[j]);
dri_bo_unreference(c->bo);
free((void *)c->key);
free(c);
cache->n_items--;
} else {
prev = &c->next;
}
}
}
}
void
brw_state_cache_check_size(struct brw_context *brw)
{
if (INTEL_DEBUG & DEBUG_STATE)
_mesa_printf("%s (n_items=%d)\n", __FUNCTION__, brw->cache.n_items);
/* un-tuned guess. We've got around 20 state objects for a total of around
* 32k, so 1000 of them is around 1.5MB.
*/
if (brw->cache.n_items > 1000)
brw_clear_cache(brw, &brw->cache);
if (brw->surface_cache.n_items > 1000)
brw_clear_cache(brw, &brw->surface_cache);
}
static void
brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
{
GLuint i;
if (INTEL_DEBUG & DEBUG_STATE)
_mesa_printf("%s\n", __FUNCTION__);
brw_clear_cache(brw, cache);
for (i = 0; i < BRW_MAX_CACHE; i++) {
dri_bo_unreference(cache->last_bo[i]);
free(cache->name[i]);
}
free(cache->items);
cache->items = NULL;
cache->size = 0;
}
void
brw_destroy_caches(struct brw_context *brw)
{
brw_destroy_cache(brw, &brw->cache);
brw_destroy_cache(brw, &brw->surface_cache);
}

View File

@ -0,0 +1,224 @@
/*
* Copyright © 2007 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Authors:
* Eric Anholt <eric@anholt.net>
*
*/
#include "main/mtypes.h"
#include "brw_context.h"
#include "brw_state.h"
#include "brw_defines.h"
/**
* Prints out a header, the contents, and the message associated with
* the hardware state data given.
*
* \param name Name of the state object
* \param data Pointer to the base of the state object
* \param hw_offset Hardware offset of the base of the state data.
* \param index Index of the DWORD being output.
*/
static void
state_out(const char *name, void *data, uint32_t hw_offset, int index,
char *fmt, ...)
{
va_list va;
fprintf(stderr, "%8s: 0x%08x: 0x%08x: ",
name, hw_offset + index * 4, ((uint32_t *)data)[index]);
va_start(va, fmt);
vfprintf(stderr, fmt, va);
va_end(va);
}
/** Generic, undecoded state buffer debug printout */
static void
state_struct_out(const char *name, dri_bo *buffer, unsigned int state_size)
{
int i;
if (buffer == NULL)
return;
dri_bo_map(buffer, GL_FALSE);
for (i = 0; i < state_size / 4; i++) {
state_out(name, buffer->virtual, buffer->offset, i,
"dword %d\n", i);
}
dri_bo_unmap(buffer);
}
static const char *
get_965_surfacetype(unsigned int surfacetype)
{
switch (surfacetype) {
case 0: return "1D";
case 1: return "2D";
case 2: return "3D";
case 3: return "CUBE";
case 4: return "BUFFER";
case 7: return "NULL";
default: return "unknown";
}
}
static const char *
get_965_surface_format(unsigned int surface_format)
{
switch (surface_format) {
case 0x000: return "r32g32b32a32_float";
case 0x0c1: return "b8g8r8a8_unorm";
case 0x100: return "b5g6r5_unorm";
case 0x102: return "b5g5r5a1_unorm";
case 0x104: return "b4g4r4a4_unorm";
default: return "unknown";
}
}
static void dump_wm_surface_state(struct brw_context *brw)
{
int i;
for (i = 0; i < brw->wm.nr_surfaces; i++) {
dri_bo *surf_bo = brw->wm.surf_bo[i];
unsigned int surfoff;
struct brw_surface_state *surf;
char name[20];
if (surf_bo == NULL) {
fprintf(stderr, " WM SS%d: NULL\n", i);
continue;
}
dri_bo_map(surf_bo, GL_FALSE);
surfoff = surf_bo->offset;
surf = (struct brw_surface_state *)(surf_bo->virtual);
sprintf(name, "WM SS%d", i);
state_out(name, surf, surfoff, 0, "%s %s\n",
get_965_surfacetype(surf->ss0.surface_type),
get_965_surface_format(surf->ss0.surface_format));
state_out(name, surf, surfoff, 1, "offset\n");
state_out(name, surf, surfoff, 2, "%dx%d size, %d mips\n",
surf->ss2.width + 1, surf->ss2.height + 1, surf->ss2.mip_count);
state_out(name, surf, surfoff, 3, "pitch %d, %stiled\n",
surf->ss3.pitch + 1, surf->ss3.tiled_surface ? "" : "not ");
state_out(name, surf, surfoff, 4, "mip base %d\n",
surf->ss4.min_lod);
state_out(name, surf, surfoff, 5, "x,y offset: %d,%d\n",
surf->ss5.x_offset, surf->ss5.y_offset);
dri_bo_unmap(surf_bo);
}
}
static void dump_sf_viewport_state(struct brw_context *brw)
{
const char *name = "SF VP";
struct brw_sf_viewport *vp;
uint32_t vp_off;
if (brw->sf.vp_bo == NULL)
return;
dri_bo_map(brw->sf.vp_bo, GL_FALSE);
vp = brw->sf.vp_bo->virtual;
vp_off = brw->sf.vp_bo->offset;
state_out(name, vp, vp_off, 0, "m00 = %f\n", vp->viewport.m00);
state_out(name, vp, vp_off, 1, "m11 = %f\n", vp->viewport.m11);
state_out(name, vp, vp_off, 2, "m22 = %f\n", vp->viewport.m22);
state_out(name, vp, vp_off, 3, "m30 = %f\n", vp->viewport.m30);
state_out(name, vp, vp_off, 4, "m31 = %f\n", vp->viewport.m31);
state_out(name, vp, vp_off, 5, "m32 = %f\n", vp->viewport.m32);
state_out(name, vp, vp_off, 6, "top left = %d,%d\n",
vp->scissor.xmin, vp->scissor.ymin);
state_out(name, vp, vp_off, 7, "bottom right = %d,%d\n",
vp->scissor.xmax, vp->scissor.ymax);
dri_bo_unmap(brw->sf.vp_bo);
}
static void brw_debug_prog(const char *name, dri_bo *prog)
{
unsigned int i;
uint32_t *data;
if (prog == NULL)
return;
dri_bo_map(prog, GL_FALSE);
data = prog->virtual;
for (i = 0; i < prog->size / 4 / 4; i++) {
fprintf(stderr, "%8s: 0x%08x: 0x%08x 0x%08x 0x%08x 0x%08x\n",
name, (unsigned int)prog->offset + i * 4 * 4,
data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3]);
/* Stop at the end of the program. It'd be nice to keep track of the actual
* intended program size instead of guessing like this.
*/
if (data[i * 4 + 0] == 0 &&
data[i * 4 + 1] == 0 &&
data[i * 4 + 2] == 0 &&
data[i * 4 + 3] == 0)
break;
}
dri_bo_unmap(prog);
}
/**
* Print additional debug information associated with the batchbuffer
* when DEBUG_BATCH is set.
*
* For 965, this means mapping the state buffers that would have been referenced
* by the batchbuffer and dumping them.
*
* The buffer offsets printed rely on the buffer containing the last offset
* it was validated at.
*/
void brw_debug_batch(struct intel_context *intel)
{
struct brw_context *brw = brw_context(&intel->ctx);
state_struct_out("WM bind", brw->wm.bind_bo, 4 * brw->wm.nr_surfaces);
dump_wm_surface_state(brw);
state_struct_out("VS", brw->vs.state_bo, sizeof(struct brw_vs_unit_state));
brw_debug_prog("VS prog", brw->vs.prog_bo);
state_struct_out("GS", brw->gs.state_bo, sizeof(struct brw_gs_unit_state));
brw_debug_prog("GS prog", brw->gs.prog_bo);
state_struct_out("SF", brw->sf.state_bo, sizeof(struct brw_sf_unit_state));
dump_sf_viewport_state(brw);
brw_debug_prog("SF prog", brw->sf.prog_bo);
state_struct_out("WM", brw->wm.state_bo, sizeof(struct brw_wm_unit_state));
brw_debug_prog("WM prog", brw->wm.prog_bo);
}

View File

@ -0,0 +1,416 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "brw_context.h"
#include "brw_state.h"
#include "intel_batchbuffer.h"
/* This is used to initialize brw->state.atoms[]. We could use this
* list directly except for a single atom, brw_constant_buffer, whose
* .dirty value changes according to the parameters of the current
* fragment and vertex programs, and so cannot be a static value.
*/
const struct brw_tracked_state *atoms[] =
{
&brw_check_fallback,
&brw_wm_input_sizes,
&brw_vs_prog,
&brw_gs_prog,
&brw_clip_prog,
&brw_sf_prog,
&brw_wm_prog,
/* Once all the programs are done, we know how large urb entry
* sizes need to be and can decide if we need to change the urb
* layout.
*/
&brw_curbe_offsets,
&brw_recalculate_urb_fence,
&brw_cc_vp,
&brw_cc_unit,
&brw_vs_surfaces, /* must do before unit */
&brw_wm_constant_surface, /* must do before wm surfaces/bind bo */
&brw_wm_surfaces, /* must do before samplers and unit */
&brw_wm_samplers,
&brw_wm_unit,
&brw_sf_vp,
&brw_sf_unit,
&brw_vs_unit, /* always required, enabled or not */
&brw_clip_unit,
&brw_gs_unit,
/* Command packets:
*/
&brw_invarient_state,
&brw_state_base_address,
&brw_binding_table_pointers,
&brw_blend_constant_color,
&brw_depthbuffer,
&brw_polygon_stipple,
&brw_polygon_stipple_offset,
&brw_line_stipple,
&brw_aa_line_parameters,
&brw_psp_urb_cbs,
&brw_drawing_rect,
&brw_indices,
&brw_index_buffer,
&brw_vertices,
&brw_constant_buffer
};
void brw_init_state( struct brw_context *brw )
{
brw_init_caches(brw);
}
void brw_destroy_state( struct brw_context *brw )
{
brw_destroy_caches(brw);
brw_destroy_batch_cache(brw);
}
/***********************************************************************
*/
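/* Return GL_TRUE if the two sets of dirty flags intersect in any of the
* mesa, brw or cache groups.
*/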
static GLboolean check_state( const struct brw_state_flags *a,
const struct brw_state_flags *b )
{
return ((a->mesa & b->mesa) ||
(a->brw & b->brw) ||
(a->cache & b->cache));
}
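/* OR the dirty flags of 'b' into 'a'; used by the INTEL_DEBUG sanity
* checks in brw_upload_state() below.
*/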
static void accumulate_state( struct brw_state_flags *a,
const struct brw_state_flags *b )
{
a->mesa |= b->mesa;
a->brw |= b->brw;
a->cache |= b->cache;
}
static void xor_states( struct brw_state_flags *result,
const struct brw_state_flags *a,
const struct brw_state_flags *b )
{
result->mesa = a->mesa ^ b->mesa;
result->brw = a->brw ^ b->brw;
result->cache = a->cache ^ b->cache;
}
static void
brw_clear_validated_bos(struct brw_context *brw)
{
int i;
/* Clear the last round of validated bos */
for (i = 0; i < brw->state.validated_bo_count; i++) {
dri_bo_unreference(brw->state.validated_bos[i]);
brw->state.validated_bos[i] = NULL;
}
brw->state.validated_bo_count = 0;
}
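/* Bookkeeping used when DEBUG_STATE is set: one entry per dirty bit,
* counting how often that bit was set when state was uploaded.
*/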
struct dirty_bit_map {
uint32_t bit;
char *name;
uint32_t count;
};
#define DEFINE_BIT(name) {name, #name, 0}
static struct dirty_bit_map mesa_bits[] = {
DEFINE_BIT(_NEW_MODELVIEW),
DEFINE_BIT(_NEW_PROJECTION),
DEFINE_BIT(_NEW_TEXTURE_MATRIX),
DEFINE_BIT(_NEW_COLOR_MATRIX),
DEFINE_BIT(_NEW_ACCUM),
DEFINE_BIT(_NEW_COLOR),
DEFINE_BIT(_NEW_DEPTH),
DEFINE_BIT(_NEW_EVAL),
DEFINE_BIT(_NEW_FOG),
DEFINE_BIT(_NEW_HINT),
DEFINE_BIT(_NEW_LIGHT),
DEFINE_BIT(_NEW_LINE),
DEFINE_BIT(_NEW_PIXEL),
DEFINE_BIT(_NEW_POINT),
DEFINE_BIT(_NEW_POLYGON),
DEFINE_BIT(_NEW_POLYGONSTIPPLE),
DEFINE_BIT(_NEW_SCISSOR),
DEFINE_BIT(_NEW_STENCIL),
DEFINE_BIT(_NEW_TEXTURE),
DEFINE_BIT(_NEW_TRANSFORM),
DEFINE_BIT(_NEW_VIEWPORT),
DEFINE_BIT(_NEW_PACKUNPACK),
DEFINE_BIT(_NEW_ARRAY),
DEFINE_BIT(_NEW_RENDERMODE),
DEFINE_BIT(_NEW_BUFFERS),
DEFINE_BIT(_NEW_MULTISAMPLE),
DEFINE_BIT(_NEW_TRACK_MATRIX),
DEFINE_BIT(_NEW_PROGRAM),
DEFINE_BIT(_NEW_PROGRAM_CONSTANTS),
{0, 0, 0}
};
static struct dirty_bit_map brw_bits[] = {
DEFINE_BIT(BRW_NEW_URB_FENCE),
DEFINE_BIT(BRW_NEW_FRAGMENT_PROGRAM),
DEFINE_BIT(BRW_NEW_VERTEX_PROGRAM),
DEFINE_BIT(BRW_NEW_INPUT_DIMENSIONS),
DEFINE_BIT(BRW_NEW_CURBE_OFFSETS),
DEFINE_BIT(BRW_NEW_REDUCED_PRIMITIVE),
DEFINE_BIT(BRW_NEW_PRIMITIVE),
DEFINE_BIT(BRW_NEW_CONTEXT),
DEFINE_BIT(BRW_NEW_WM_INPUT_DIMENSIONS),
DEFINE_BIT(BRW_NEW_PSP),
DEFINE_BIT(BRW_NEW_FENCE),
DEFINE_BIT(BRW_NEW_INDICES),
DEFINE_BIT(BRW_NEW_INDEX_BUFFER),
DEFINE_BIT(BRW_NEW_VERTICES),
DEFINE_BIT(BRW_NEW_BATCH),
DEFINE_BIT(BRW_NEW_DEPTH_BUFFER),
{0, 0, 0}
};
static struct dirty_bit_map cache_bits[] = {
DEFINE_BIT(CACHE_NEW_CC_VP),
DEFINE_BIT(CACHE_NEW_CC_UNIT),
DEFINE_BIT(CACHE_NEW_WM_PROG),
DEFINE_BIT(CACHE_NEW_SAMPLER_DEFAULT_COLOR),
DEFINE_BIT(CACHE_NEW_SAMPLER),
DEFINE_BIT(CACHE_NEW_WM_UNIT),
DEFINE_BIT(CACHE_NEW_SF_PROG),
DEFINE_BIT(CACHE_NEW_SF_VP),
DEFINE_BIT(CACHE_NEW_SF_UNIT),
DEFINE_BIT(CACHE_NEW_VS_UNIT),
DEFINE_BIT(CACHE_NEW_VS_PROG),
DEFINE_BIT(CACHE_NEW_GS_UNIT),
DEFINE_BIT(CACHE_NEW_GS_PROG),
DEFINE_BIT(CACHE_NEW_CLIP_VP),
DEFINE_BIT(CACHE_NEW_CLIP_UNIT),
DEFINE_BIT(CACHE_NEW_CLIP_PROG),
DEFINE_BIT(CACHE_NEW_SURFACE),
DEFINE_BIT(CACHE_NEW_SURF_BIND),
{0, 0, 0}
};
static void
brw_update_dirty_count(struct dirty_bit_map *bit_map, int32_t bits)
{
int i;
for (i = 0; i < 32; i++) {
if (bit_map[i].bit == 0)
return;
if (bit_map[i].bit & bits)
bit_map[i].count++;
}
}
static void
brw_print_dirty_count(struct dirty_bit_map *bit_map, int32_t bits)
{
int i;
for (i = 0; i < 32; i++) {
if (bit_map[i].bit == 0)
return;
fprintf(stderr, "0x%08x: %12d (%s)\n",
bit_map[i].bit, bit_map[i].count, bit_map[i].name);
}
}
/***********************************************************************
* Emit all state:
*/
void brw_validate_state( struct brw_context *brw )
{
GLcontext *ctx = &brw->intel.ctx;
struct intel_context *intel = &brw->intel;
struct brw_state_flags *state = &brw->state.dirty;
GLuint i;
brw_clear_validated_bos(brw);
state->mesa |= brw->intel.NewGLState;
brw->intel.NewGLState = 0;
brw_add_validated_bo(brw, intel->batch->buf);
if (brw->emit_state_always) {
state->mesa |= ~0;
state->brw |= ~0;
state->cache |= ~0;
}
if (brw->fragment_program != ctx->FragmentProgram._Current) {
brw->fragment_program = ctx->FragmentProgram._Current;
brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
}
if (brw->vertex_program != ctx->VertexProgram._Current) {
brw->vertex_program = ctx->VertexProgram._Current;
brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
}
if (state->mesa == 0 &&
state->cache == 0 &&
state->brw == 0)
return;
if (brw->state.dirty.brw & BRW_NEW_CONTEXT)
brw_clear_batch_cache(brw);
brw->intel.Fallback = 0;
/* do prepare stage for all atoms */
for (i = 0; i < Elements(atoms); i++) {
const struct brw_tracked_state *atom = atoms[i];
if (brw->intel.Fallback)
break;
if (check_state(state, &atom->dirty)) {
if (atom->prepare) {
atom->prepare(brw);
}
}
}
/* Make sure that the textures which are referenced by the current
* brw fragment program are actually present/valid.
* If this fails, we can experience GPU lock-ups.
*/
{
const struct brw_fragment_program *fp;
fp = brw_fragment_program_const(brw->fragment_program);
if (fp) {
assert((fp->tex_units_used & ctx->Texture._EnabledUnits)
== fp->tex_units_used);
}
}
}
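/* Emit every state atom whose dirty flags intersect the accumulated dirty
* state, then clear the dirty flags unless we have fallen back to software
* rendering.
*/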
void brw_upload_state(struct brw_context *brw)
{
struct brw_state_flags *state = &brw->state.dirty;
int i;
static int dirty_count = 0;
brw_clear_validated_bos(brw);
if (INTEL_DEBUG) {
/* Debug version which enforces various sanity checks on the
* state flags which are generated and checked to help ensure
* state atoms are ordered correctly in the list.
*/
struct brw_state_flags examined, prev;
_mesa_memset(&examined, 0, sizeof(examined));
prev = *state;
for (i = 0; i < Elements(atoms); i++) {
const struct brw_tracked_state *atom = atoms[i];
struct brw_state_flags generated;
assert(atom->dirty.mesa ||
atom->dirty.brw ||
atom->dirty.cache);
if (brw->intel.Fallback)
break;
if (check_state(state, &atom->dirty)) {
if (atom->emit) {
atom->emit( brw );
}
}
accumulate_state(&examined, &atom->dirty);
/* generated = (prev ^ state)
* if (examined & generated)
* fail;
*/
xor_states(&generated, &prev, state);
assert(!check_state(&examined, &generated));
prev = *state;
}
}
else {
for (i = 0; i < Elements(atoms); i++) {
const struct brw_tracked_state *atom = atoms[i];
if (brw->intel.Fallback)
break;
if (check_state(state, &atom->dirty)) {
if (atom->emit) {
atom->emit( brw );
}
}
}
}
if (INTEL_DEBUG & DEBUG_STATE) {
brw_update_dirty_count(mesa_bits, state->mesa);
brw_update_dirty_count(brw_bits, state->brw);
brw_update_dirty_count(cache_bits, state->cache);
if (dirty_count++ % 1000 == 0) {
brw_print_dirty_count(mesa_bits, state->mesa);
brw_print_dirty_count(brw_bits, state->brw);
brw_print_dirty_count(cache_bits, state->cache);
fprintf(stderr, "\n");
}
}
if (!brw->intel.Fallback)
memset(state, 0, sizeof(*state));
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,59 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "main/glheader.h"
#include "main/mtypes.h"
#include "main/teximage.h"
#include "intel_context.h"
#include "intel_regions.h"
#include "intel_tex.h"
#include "brw_context.h"
/**
* Finalizes all textures, completing any rendering that needs to be done
* to prepare them.
*/
void brw_validate_textures( struct brw_context *brw )
{
GLcontext *ctx = &brw->intel.ctx;
struct intel_context *intel = &brw->intel;
int i;
for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i];
if (texUnit->_ReallyEnabled) {
intel_finalize_mipmap_tree(intel, i);
}
}
}

View File

@@ -0,0 +1,222 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
/* Code to layout images in a mipmap tree for i965.
*/
#include "intel_mipmap_tree.h"
#include "intel_tex_layout.h"
#include "intel_context.h"
#include "main/macros.h"
#include "intel_chipset.h"
#define FILE_DEBUG_FLAG DEBUG_MIPTREE
GLboolean brw_miptree_layout(struct intel_context *intel,
struct intel_mipmap_tree *mt,
uint32_t tiling)
{
/* XXX: these vary depending on image format: */
/* GLint align_w = 4; */
switch (mt->target) {
case GL_TEXTURE_CUBE_MAP:
if (IS_IGDNG(intel->intelScreen->deviceID)) {
GLuint align_h = 2, align_w = 4;
GLuint level;
GLuint x = 0;
GLuint y = 0;
GLuint width = mt->width0;
GLuint height = mt->height0;
GLuint qpitch = 0;
GLuint y_pitch = 0;
mt->pitch = mt->width0;
intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h);
y_pitch = ALIGN(height, align_h);
if (mt->compressed) {
mt->pitch = ALIGN(mt->width0, align_w);
}
if (mt->first_level != mt->last_level) {
GLuint mip1_width;
if (mt->compressed) {
mip1_width = ALIGN(minify(mt->width0), align_w)
+ ALIGN(minify(minify(mt->width0)), align_w);
} else {
mip1_width = ALIGN(minify(mt->width0), align_w)
+ minify(minify(mt->width0));
}
if (mip1_width > mt->pitch) {
mt->pitch = mip1_width;
}
}
mt->pitch = intel_miptree_pitch_align(intel, mt, tiling, mt->pitch);
if (mt->compressed) {
qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * mt->pitch * mt->cpp;
mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * 6;
} else {
qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * mt->pitch * mt->cpp;
mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * 6;
}
for (level = mt->first_level; level <= mt->last_level; level++) {
GLuint img_height;
GLuint nr_images = 6;
GLuint q = 0;
intel_miptree_set_level_info(mt, level, nr_images, x, y, width,
height, 1);
for (q = 0; q < nr_images; q++)
intel_miptree_set_image_offset_ex(mt, level, q, x, y, q * qpitch);
if (mt->compressed)
img_height = MAX2(1, height/4);
else
img_height = ALIGN(height, align_h);
if (level == mt->first_level + 1) {
x += ALIGN(width, align_w);
}
else {
y += img_height;
}
width = minify(width);
height = minify(height);
}
break;
}
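/* Non-IGDNG cube maps fall through and are laid out like 3D textures
* with six images per mipmap level.
*/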
case GL_TEXTURE_3D: {
GLuint width = mt->width0;
GLuint height = mt->height0;
GLuint depth = mt->depth0;
GLuint pack_x_pitch, pack_x_nr;
GLuint pack_y_pitch;
GLuint level;
GLuint align_h = 2;
GLuint align_w = 4;
mt->total_height = 0;
intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h);
if (mt->compressed) {
mt->pitch = ALIGN(width, align_w);
pack_y_pitch = (height + 3) / 4;
} else {
mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0);
pack_y_pitch = ALIGN(mt->height0, align_h);
}
pack_x_pitch = width;
pack_x_nr = 1;
for (level = mt->first_level ; level <= mt->last_level ; level++) {
GLuint nr_images = mt->target == GL_TEXTURE_3D ? depth : 6;
GLint x = 0;
GLint y = 0;
GLint q, j;
intel_miptree_set_level_info(mt, level, nr_images,
0, mt->total_height,
width, height, depth);
for (q = 0; q < nr_images;) {
for (j = 0; j < pack_x_nr && q < nr_images; j++, q++) {
intel_miptree_set_image_offset(mt, level, q, x, y);
x += pack_x_pitch;
}
x = 0;
y += pack_y_pitch;
}
mt->total_height += y;
width = minify(width);
height = minify(height);
depth = minify(depth);
if (mt->compressed) {
pack_y_pitch = (height + 3) / 4;
if (pack_x_pitch > ALIGN(width, align_w)) {
pack_x_pitch = ALIGN(width, align_w);
pack_x_nr <<= 1;
}
} else {
if (pack_x_pitch > 4) {
pack_x_pitch >>= 1;
pack_x_nr <<= 1;
assert(pack_x_pitch * pack_x_nr <= mt->pitch);
}
if (pack_y_pitch > 2) {
pack_y_pitch >>= 1;
pack_y_pitch = ALIGN(pack_y_pitch, align_h);
}
}
}
/* The 965's sampler lays cachelines out according to how accesses
* in the texture surfaces run, so they may be "vertical" through
* memory. As a result, the docs (Surface Padding Requirements:
* Sampling Engine Surfaces) say that two extra rows of padding are required.
* We don't know of similar requirements for pre-965, but given that
* those docs are silent on padding requirements in general, let's play
* it safe.
*/
if (mt->target == GL_TEXTURE_CUBE_MAP)
mt->total_height += 2;
break;
}
default:
i945_miptree_layout_2d(intel, mt, tiling);
break;
}
DBG("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__,
mt->pitch,
mt->total_height,
mt->cpp,
mt->pitch * mt->total_height * mt->cpp );
return GL_TRUE;
}

View File

@@ -0,0 +1,250 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "intel_batchbuffer.h"
#include "brw_context.h"
#include "brw_state.h"
#include "brw_defines.h"
#define VS 0
#define GS 1
#define CLP 2
#define SF 3
#define CS 4
/** @file brw_urb.c
*
* Manages the division of the URB space between the various fixed-function
* units.
*
* See the Thread Initiation Management section of the GEN4 B-Spec, and
* the individual *_STATE structures for restrictions on numbers of
* entries and threads.
*/
/*
* Generally, a unit requires a min_nr_entries based on how many entries
* it produces before the downstream unit gets unblocked and can use and
* dereference some of its handles.
*
* The SF unit preallocates a PUE at the start of thread dispatch, and only
* uses that one. So it requires one entry per thread.
*
* For CLIP, the SF unit will hold the previous primitive while the
* next is getting assembled, meaning that linestrips require 3 CLIP VUEs
* (vertices) to ensure continued processing, trifans require 4, and tristrips
* require 5. There can be 1 or 2 threads, and each has the same requirement.
*
* GS has the same requirement as CLIP, but it never handles tristrips,
* so we can lower the minimum to 4 for the POLYGONs (trifans) it produces.
* We only run it single-threaded.
*
* For VS, the number of entries may be 8, 12, 16, or 32 (or 64 on G4X).
* Each thread processes 2 preallocated VUEs (vertices) at a time, and they
* get streamed down as soon as threads processing earlier vertices get
* theirs accepted.
*
* Each unit will take the number of URB entries we give it (based on the
* entry size calculated in brw_vs_emit.c for VUEs, brw_sf_emit.c for PUEs,
* and brw_curbe.c for the CURBEs) and decide the maximum number of
* threads it can support based on that, in brw_*_state.c.
*
* XXX: Are the min_entry_size numbers useful?
* XXX: Verify min_nr_entries, esp for VS.
* XXX: Verify SF min_entry_size.
*/
static const struct {
GLuint min_nr_entries;
GLuint preferred_nr_entries;
GLuint min_entry_size;
GLuint max_entry_size;
} limits[CS+1] = {
{ 16, 32, 1, 5 }, /* vs */
{ 4, 8, 1, 5 }, /* gs */
{ 5, 10, 1, 5 }, /* clp */
{ 1, 8, 1, 12 }, /* sf */
{ 1, 4, 1, 32 } /* cs */
};
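/* Lay the units out in the URB in the fixed order VS, GS, CLIP, SF, CS
* and return whether the resulting allocation fits in the total URB size.
*/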
static GLboolean check_urb_layout( struct brw_context *brw )
{
brw->urb.vs_start = 0;
brw->urb.gs_start = brw->urb.nr_vs_entries * brw->urb.vsize;
brw->urb.clip_start = brw->urb.gs_start + brw->urb.nr_gs_entries * brw->urb.vsize;
brw->urb.sf_start = brw->urb.clip_start + brw->urb.nr_clip_entries * brw->urb.vsize;
brw->urb.cs_start = brw->urb.sf_start + brw->urb.nr_sf_entries * brw->urb.sfsize;
return brw->urb.cs_start + brw->urb.nr_cs_entries * brw->urb.csize <= URB_SIZES(brw);
}
/* Most minimal update, forces re-emit of URB fence packet after GS
* unit turned on/off.
*/
static void recalculate_urb_fence( struct brw_context *brw )
{
GLuint csize = brw->curbe.total_size;
GLuint vsize = brw->vs.prog_data->urb_entry_size;
GLuint sfsize = brw->sf.prog_data->urb_entry_size;
if (csize < limits[CS].min_entry_size)
csize = limits[CS].min_entry_size;
if (vsize < limits[VS].min_entry_size)
vsize = limits[VS].min_entry_size;
if (sfsize < limits[SF].min_entry_size)
sfsize = limits[SF].min_entry_size;
if (brw->urb.vsize < vsize ||
brw->urb.sfsize < sfsize ||
brw->urb.csize < csize ||
(brw->urb.constrained && (brw->urb.vsize > vsize ||
brw->urb.sfsize > sfsize ||
brw->urb.csize > csize))) {
brw->urb.csize = csize;
brw->urb.sfsize = sfsize;
brw->urb.vsize = vsize;
brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;
brw->urb.nr_gs_entries = limits[GS].preferred_nr_entries;
brw->urb.nr_clip_entries = limits[CLP].preferred_nr_entries;
brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries;
brw->urb.nr_cs_entries = limits[CS].preferred_nr_entries;
brw->urb.constrained = 0;
if (BRW_IS_IGDNG(brw)) {
brw->urb.nr_vs_entries = 128;
brw->urb.nr_sf_entries = 48;
if (check_urb_layout(brw)) {
goto done;
} else {
brw->urb.constrained = 1;
brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;
brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries;
}
} else if (BRW_IS_G4X(brw)) {
brw->urb.nr_vs_entries = 64;
if (check_urb_layout(brw)) {
goto done;
} else {
brw->urb.constrained = 1;
brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;
}
}
if (!check_urb_layout(brw)) {
brw->urb.nr_vs_entries = limits[VS].min_nr_entries;
brw->urb.nr_gs_entries = limits[GS].min_nr_entries;
brw->urb.nr_clip_entries = limits[CLP].min_nr_entries;
brw->urb.nr_sf_entries = limits[SF].min_nr_entries;
brw->urb.nr_cs_entries = limits[CS].min_nr_entries;
/* Mark us as operating with constrained nr_entries, so that next
* time we recalculate we'll resize the fences in the hope of
* escaping constrained mode and getting back to normal performance.
*/
brw->urb.constrained = 1;
if (!check_urb_layout(brw)) {
/* This is impossible, given the maximal sizes of urb
* entries and the values for minimum nr of entries
* provided above.
*/
_mesa_printf("couldn't calculate URB layout!\n");
exit(1);
}
if (INTEL_DEBUG & (DEBUG_URB|DEBUG_FALLBACKS))
_mesa_printf("URB CONSTRAINED\n");
}
done:
if (INTEL_DEBUG & DEBUG_URB)
_mesa_printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
brw->urb.vs_start,
brw->urb.gs_start,
brw->urb.clip_start,
brw->urb.sf_start,
brw->urb.cs_start,
URB_SIZES(brw));
brw->state.dirty.brw |= BRW_NEW_URB_FENCE;
}
}
const struct brw_tracked_state brw_recalculate_urb_fence = {
.dirty = {
.mesa = 0,
.brw = BRW_NEW_CURBE_OFFSETS,
.cache = (CACHE_NEW_VS_PROG |
CACHE_NEW_SF_PROG)
},
.prepare = recalculate_urb_fence
};
void brw_upload_urb_fence(struct brw_context *brw)
{
struct brw_urb_fence uf;
memset(&uf, 0, sizeof(uf));
uf.header.opcode = CMD_URB_FENCE;
uf.header.length = sizeof(uf)/4-2;
uf.header.vs_realloc = 1;
uf.header.gs_realloc = 1;
uf.header.clp_realloc = 1;
uf.header.sf_realloc = 1;
uf.header.vfe_realloc = 1;
uf.header.cs_realloc = 1;
/* The ordering below is correct, not the layout in the
* instruction.
*
* There are 256/384 urb reg pairs in total.
*/
uf.bits0.vs_fence = brw->urb.gs_start;
uf.bits0.gs_fence = brw->urb.clip_start;
uf.bits0.clp_fence = brw->urb.sf_start;
uf.bits1.sf_fence = brw->urb.cs_start;
uf.bits1.cs_fence = URB_SIZES(brw);
BRW_BATCH_STRUCT(brw, &uf);
}

View File

@@ -0,0 +1,104 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "main/mtypes.h"
#include "shader/prog_parameter.h"
#include "brw_util.h"
#include "brw_defines.h"
GLuint brw_count_bits( GLuint val )
{
GLuint i;
for (i = 0; val ; val >>= 1)
if (val & 1)
i++;
return i;
}
GLuint brw_translate_blend_equation( GLenum mode )
{
switch (mode) {
case GL_FUNC_ADD:
return BRW_BLENDFUNCTION_ADD;
case GL_MIN:
return BRW_BLENDFUNCTION_MIN;
case GL_MAX:
return BRW_BLENDFUNCTION_MAX;
case GL_FUNC_SUBTRACT:
return BRW_BLENDFUNCTION_SUBTRACT;
case GL_FUNC_REVERSE_SUBTRACT:
return BRW_BLENDFUNCTION_REVERSE_SUBTRACT;
default:
assert(0);
return BRW_BLENDFUNCTION_ADD;
}
}
GLuint brw_translate_blend_factor( GLenum factor )
{
switch(factor) {
case GL_ZERO:
return BRW_BLENDFACTOR_ZERO;
case GL_SRC_ALPHA:
return BRW_BLENDFACTOR_SRC_ALPHA;
case GL_ONE:
return BRW_BLENDFACTOR_ONE;
case GL_SRC_COLOR:
return BRW_BLENDFACTOR_SRC_COLOR;
case GL_ONE_MINUS_SRC_COLOR:
return BRW_BLENDFACTOR_INV_SRC_COLOR;
case GL_DST_COLOR:
return BRW_BLENDFACTOR_DST_COLOR;
case GL_ONE_MINUS_DST_COLOR:
return BRW_BLENDFACTOR_INV_DST_COLOR;
case GL_ONE_MINUS_SRC_ALPHA:
return BRW_BLENDFACTOR_INV_SRC_ALPHA;
case GL_DST_ALPHA:
return BRW_BLENDFACTOR_DST_ALPHA;
case GL_ONE_MINUS_DST_ALPHA:
return BRW_BLENDFACTOR_INV_DST_ALPHA;
case GL_SRC_ALPHA_SATURATE:
return BRW_BLENDFACTOR_SRC_ALPHA_SATURATE;
case GL_CONSTANT_COLOR:
return BRW_BLENDFACTOR_CONST_COLOR;
case GL_ONE_MINUS_CONSTANT_COLOR:
return BRW_BLENDFACTOR_INV_CONST_COLOR;
case GL_CONSTANT_ALPHA:
return BRW_BLENDFACTOR_CONST_ALPHA;
case GL_ONE_MINUS_CONSTANT_ALPHA:
return BRW_BLENDFACTOR_INV_CONST_ALPHA;
default:
assert(0);
return BRW_BLENDFACTOR_ZERO;
}
}

View File

@@ -0,0 +1,45 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#ifndef BRW_UTIL_H
#define BRW_UTIL_H
#include "main/mtypes.h"
extern GLuint brw_count_bits( GLuint val );
extern GLuint brw_parameter_list_state_flags(struct gl_program_parameter_list *paramList);
extern GLuint brw_translate_blend_factor( GLenum factor );
extern GLuint brw_translate_blend_equation( GLenum mode );
#endif

View File

@@ -0,0 +1,124 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "brw_context.h"
#include "brw_vs.h"
#include "brw_util.h"
#include "brw_state.h"
#include "shader/prog_print.h"
static void do_vs_prog( struct brw_context *brw,
struct brw_vertex_program *vp,
struct brw_vs_prog_key *key )
{
GLuint program_size;
const GLuint *program;
struct brw_vs_compile c;
memset(&c, 0, sizeof(c));
memcpy(&c.key, key, sizeof(*key));
brw_init_compile(brw, &c.func);
c.vp = vp;
c.prog_data.outputs_written = vp->program.Base.OutputsWritten;
c.prog_data.inputs_read = vp->program.Base.InputsRead;
if (c.key.copy_edgeflag) {
c.prog_data.outputs_written |= 1<<VERT_RESULT_EDGE;
c.prog_data.inputs_read |= 1<<VERT_ATTRIB_EDGEFLAG;
}
if (0)
_mesa_print_program(&c.vp->program.Base);
/* Emit GEN4 code.
*/
brw_vs_emit(&c);
/* get the program
*/
program = brw_get_program(&c.func, &program_size);
dri_bo_unreference(brw->vs.prog_bo);
brw->vs.prog_bo = brw_upload_cache( &brw->cache, BRW_VS_PROG,
&c.key, sizeof(c.key),
NULL, 0,
program, program_size,
&c.prog_data,
&brw->vs.prog_data );
}
static void brw_upload_vs_prog(struct brw_context *brw)
{
GLcontext *ctx = &brw->intel.ctx;
struct brw_vs_prog_key key;
struct brw_vertex_program *vp =
(struct brw_vertex_program *)brw->vertex_program;
memset(&key, 0, sizeof(key));
/* Just upload the program verbatim for now. Always send it all
* the inputs it asks for, whether they are varying or not.
*/
key.program_string_id = vp->id;
key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled);
key.copy_edgeflag = (ctx->Polygon.FrontMode != GL_FILL ||
ctx->Polygon.BackMode != GL_FILL);
/* Make an early check for the key.
*/
dri_bo_unreference(brw->vs.prog_bo);
brw->vs.prog_bo = brw_search_cache(&brw->cache, BRW_VS_PROG,
&key, sizeof(key),
NULL, 0,
&brw->vs.prog_data);
if (brw->vs.prog_bo == NULL)
do_vs_prog(brw, vp, &key);
}
/* See brw_vs.c:
*/
const struct brw_tracked_state brw_vs_prog = {
.dirty = {
.mesa = _NEW_TRANSFORM | _NEW_POLYGON,
.brw = BRW_NEW_VERTEX_PROGRAM,
.cache = 0
},
.prepare = brw_upload_vs_prog
};

View File

@@ -0,0 +1,88 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#ifndef BRW_VS_H
#define BRW_VS_H
#include "brw_context.h"
#include "brw_eu.h"
#include "shader/program.h"
struct brw_vs_prog_key {
GLuint program_string_id;
GLuint nr_userclip:4;
GLuint copy_edgeflag:1;
GLuint pad:26;
};
struct brw_vs_compile {
struct brw_compile func;
struct brw_vs_prog_key key;
struct brw_vs_prog_data prog_data;
struct brw_vertex_program *vp;
GLuint nr_inputs;
GLuint first_output;
GLuint nr_outputs;
GLuint first_overflow_output; /**< VERT_ATTRIB_x */
GLuint first_tmp;
GLuint last_tmp;
struct brw_reg r0;
struct brw_reg r1;
struct brw_reg regs[PROGRAM_ADDRESS+1][128];
struct brw_reg tmp;
struct brw_reg stack;
struct {
GLboolean used_in_src;
struct brw_reg reg;
} output_regs[128];
struct brw_reg userplane[6];
/** we may need up to 3 constants per instruction (if use_const_buffer) */
struct {
GLint index;
struct brw_reg reg;
} current_const[3];
};
void brw_vs_emit( struct brw_vs_compile *c );
#endif

File diff suppressed because it is too large

View File

@@ -0,0 +1,185 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "brw_context.h"
#include "brw_state.h"
#include "brw_defines.h"
#include "main/macros.h"
struct brw_vs_unit_key {
unsigned int total_grf;
unsigned int urb_entry_read_length;
unsigned int curb_entry_read_length;
unsigned int curbe_offset;
unsigned int nr_urb_entries, urb_size;
unsigned int nr_surfaces;
};
static void
vs_unit_populate_key(struct brw_context *brw, struct brw_vs_unit_key *key)
{
GLcontext *ctx = &brw->intel.ctx;
memset(key, 0, sizeof(*key));
/* CACHE_NEW_VS_PROG */
key->total_grf = brw->vs.prog_data->total_grf;
key->urb_entry_read_length = brw->vs.prog_data->urb_read_length;
key->curb_entry_read_length = brw->vs.prog_data->curb_read_length;
/* BRW_NEW_URB_FENCE */
key->nr_urb_entries = brw->urb.nr_vs_entries;
key->urb_size = brw->urb.vsize;
/* BRW_NEW_NR_VS_SURFACES */
key->nr_surfaces = brw->vs.nr_surfaces;
/* BRW_NEW_CURBE_OFFSETS, _NEW_TRANSFORM */
if (ctx->Transform.ClipPlanesEnabled) {
/* Note that we read in the userclip planes as well, hence
* clip_start:
*/
key->curbe_offset = brw->curbe.clip_start;
}
else {
key->curbe_offset = brw->curbe.vs_start;
}
}
static dri_bo *
vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
{
struct brw_vs_unit_state vs;
dri_bo *bo;
int chipset_max_threads;
memset(&vs, 0, sizeof(vs));
vs.thread0.kernel_start_pointer = brw->vs.prog_bo->offset >> 6; /* reloc */
vs.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1;
vs.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
/* Choosing multiple program flow means that we may get 2-vertex threads,
* which will have the channel mask for dwords 4-7 enabled in the thread,
* and those dwords will be written to the second URB handle when we
* brw_urb_WRITE() the results.
*/
vs.thread1.single_program_flow = 0;
if (BRW_IS_IGDNG(brw))
vs.thread1.binding_table_entry_count = 0; /* hardware requirement */
else
vs.thread1.binding_table_entry_count = key->nr_surfaces;
vs.thread3.urb_entry_read_length = key->urb_entry_read_length;
vs.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
vs.thread3.dispatch_grf_start_reg = 1;
vs.thread3.urb_entry_read_offset = 0;
vs.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
if (BRW_IS_IGDNG(brw))
vs.thread4.nr_urb_entries = key->nr_urb_entries >> 2;
else
vs.thread4.nr_urb_entries = key->nr_urb_entries;
vs.thread4.urb_entry_allocation_size = key->urb_size - 1;
if (BRW_IS_IGDNG(brw))
chipset_max_threads = 72;
else if (BRW_IS_G4X(brw))
chipset_max_threads = 32;
else
chipset_max_threads = 16;
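/* Each VS thread works on two URB entries (vertices) at a time, so the
* useful thread count is bounded by half the URB entry allocation.
*/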
vs.thread4.max_threads = CLAMP(key->nr_urb_entries / 2,
1, chipset_max_threads) - 1;
if (INTEL_DEBUG & DEBUG_SINGLE_THREAD)
vs.thread4.max_threads = 0;
/* No samplers for ARB_vp programs:
*/
/* It has to be set to 0 for IGDNG
*/
vs.vs5.sampler_count = 0;
if (INTEL_DEBUG & DEBUG_STATS)
vs.thread4.stats_enable = 1;
/* Vertex program always enabled:
*/
vs.vs6.vs_enable = 1;
bo = brw_upload_cache(&brw->cache, BRW_VS_UNIT,
key, sizeof(*key),
&brw->vs.prog_bo, 1,
&vs, sizeof(vs),
NULL, NULL);
/* Emit VS program relocation */
dri_bo_emit_reloc(bo,
I915_GEM_DOMAIN_INSTRUCTION, 0,
vs.thread0.grf_reg_count << 1,
offsetof(struct brw_vs_unit_state, thread0),
brw->vs.prog_bo);
return bo;
}
static void prepare_vs_unit(struct brw_context *brw)
{
struct brw_vs_unit_key key;
vs_unit_populate_key(brw, &key);
dri_bo_unreference(brw->vs.state_bo);
brw->vs.state_bo = brw_search_cache(&brw->cache, BRW_VS_UNIT,
&key, sizeof(key),
&brw->vs.prog_bo, 1,
NULL);
if (brw->vs.state_bo == NULL) {
brw->vs.state_bo = vs_unit_create_from_key(brw, &key);
}
}
const struct brw_tracked_state brw_vs_unit = {
.dirty = {
.mesa = _NEW_TRANSFORM,
.brw = (BRW_NEW_CURBE_OFFSETS |
BRW_NEW_NR_VS_SURFACES |
BRW_NEW_URB_FENCE),
.cache = CACHE_NEW_VS_PROG
},
.prepare = prepare_vs_unit,
};

View File

@@ -0,0 +1,226 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "main/mtypes.h"
#include "main/texformat.h"
#include "main/texstore.h"
#include "shader/prog_parameter.h"
#include "brw_context.h"
#include "brw_state.h"
#include "brw_defines.h"
/* Creates a new VS constant buffer reflecting the current VS program's
* constants, if needed by the VS program.
*
* Otherwise, constants go through the CURBEs using the brw_constant_buffer
* state atom.
*/
static drm_intel_bo *
brw_vs_update_constant_buffer(struct brw_context *brw)
{
struct intel_context *intel = &brw->intel;
struct brw_vertex_program *vp =
(struct brw_vertex_program *) brw->vertex_program;
const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
const int size = params->NumParameters * 4 * sizeof(GLfloat);
drm_intel_bo *const_buffer;
/* BRW_NEW_VERTEX_PROGRAM */
if (!vp->use_const_buffer)
return NULL;
const_buffer = drm_intel_bo_alloc(intel->bufmgr, "vp_const_buffer",
size, 64);
/* _NEW_PROGRAM_CONSTANTS */
dri_bo_subdata(const_buffer, 0, size, params->ParameterValues);
return const_buffer;
}
/**
* Update the surface state for a VS constant buffer.
*
* Sets brw->vs.surf_bo[surf] and brw->vp->const_buffer.
*/
static void
brw_update_vs_constant_surface( GLcontext *ctx,
GLuint surf)
{
struct brw_context *brw = brw_context(ctx);
struct brw_surface_key key;
struct brw_vertex_program *vp =
(struct brw_vertex_program *) brw->vertex_program;
const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
assert(surf == 0);
/* If we're in this state update atom, we need to update VS constants, so
* free the old buffer and create a new one for the new contents.
*/
dri_bo_unreference(vp->const_buffer);
vp->const_buffer = brw_vs_update_constant_buffer(brw);
/* If there's no constant buffer, then no surface BO is needed to point at
* it.
*/
if (vp->const_buffer == 0) {
drm_intel_bo_unreference(brw->vs.surf_bo[surf]);
brw->vs.surf_bo[surf] = NULL;
return;
}
memset(&key, 0, sizeof(key));
key.format = MESA_FORMAT_RGBA_FLOAT32;
key.internal_format = GL_RGBA;
key.bo = vp->const_buffer;
key.depthmode = GL_NONE;
key.pitch = params->NumParameters;
key.width = params->NumParameters;
key.height = 1;
key.depth = 1;
key.cpp = 16;
/*
printf("%s:\n", __FUNCTION__);
printf(" width %d height %d depth %d cpp %d pitch %d\n",
key.width, key.height, key.depth, key.cpp, key.pitch);
*/
drm_intel_bo_unreference(brw->vs.surf_bo[surf]);
brw->vs.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
BRW_SS_SURFACE,
&key, sizeof(key),
&key.bo, key.bo ? 1 : 0,
NULL);
if (brw->vs.surf_bo[surf] == NULL) {
brw->vs.surf_bo[surf] = brw_create_constant_surface(brw, &key);
}
}
/**
* Constructs the binding table for the VS surface state.
*/
static dri_bo *
brw_vs_get_binding_table(struct brw_context *brw)
{
dri_bo *bind_bo;
bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
NULL, 0,
brw->vs.surf_bo, BRW_VS_MAX_SURF,
NULL);
if (bind_bo == NULL) {
GLuint data_size = BRW_VS_MAX_SURF * sizeof(GLuint);
uint32_t *data = malloc(data_size);
int i;
for (i = 0; i < BRW_VS_MAX_SURF; i++)
if (brw->vs.surf_bo[i])
data[i] = brw->vs.surf_bo[i]->offset;
else
data[i] = 0;
bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
NULL, 0,
brw->vs.surf_bo, BRW_VS_MAX_SURF,
data, data_size,
NULL, NULL);
/* Emit binding table relocations to surface state */
for (i = 0; i < BRW_VS_MAX_SURF; i++) {
if (brw->vs.surf_bo[i] != NULL) {
/* The presumed offsets were set in the data values for
* brw_upload_cache.
*/
drm_intel_bo_emit_reloc(bind_bo, i * 4,
brw->vs.surf_bo[i], 0,
I915_GEM_DOMAIN_INSTRUCTION, 0);
}
}
free(data);
}
return bind_bo;
}
/**
* Vertex shader surfaces (constant buffer).
*
* This consumes the state updates for the constant buffer needing
* to be updated, and produces BRW_NEW_NR_VS_SURFACES for the VS unit and
* CACHE_NEW_SURF_BIND for the binding table upload.
*/
static void prepare_vs_surfaces(struct brw_context *brw )
{
GLcontext *ctx = &brw->intel.ctx;
int i;
int nr_surfaces = 0;
brw_update_vs_constant_surface(ctx, SURF_INDEX_VERT_CONST_BUFFER);
for (i = 0; i < BRW_VS_MAX_SURF; i++) {
if (brw->vs.surf_bo[i] != NULL) {
nr_surfaces = i + 1;
}
}
if (brw->vs.nr_surfaces != nr_surfaces) {
brw->state.dirty.brw |= BRW_NEW_NR_VS_SURFACES;
brw->vs.nr_surfaces = nr_surfaces;
}
/* Note that we don't end up updating the bind_bo if we don't have a
* surface to be pointing at. This should be relatively harmless, as it
* just slightly increases our working set size.
*/
if (brw->vs.nr_surfaces != 0) {
dri_bo_unreference(brw->vs.bind_bo);
brw->vs.bind_bo = brw_vs_get_binding_table(brw);
}
}
const struct brw_tracked_state brw_vs_surfaces = {
.dirty = {
.mesa = (_NEW_PROGRAM_CONSTANTS),
.brw = (BRW_NEW_VERTEX_PROGRAM),
.cache = 0
},
.prepare = prepare_vs_surfaces,
};

View File

@@ -0,0 +1,375 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "main/texformat.h"
#include "brw_context.h"
#include "brw_util.h"
#include "brw_wm.h"
#include "brw_state.h"
/** Return number of src args for given instruction */
GLuint brw_wm_nr_args( GLuint opcode )
{
switch (opcode) {
case WM_FRONTFACING:
case WM_PIXELXY:
return 0;
case WM_CINTERP:
case WM_WPOSXY:
case WM_DELTAXY:
return 1;
case WM_LINTERP:
case WM_PIXELW:
return 2;
case WM_FB_WRITE:
case WM_PINTERP:
return 3;
default:
assert(opcode < MAX_OPCODE);
return _mesa_num_inst_src_regs(opcode);
}
}
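/* Return 1 if the opcode produces a result the WM backend treats as
* scalar.
*/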
GLuint brw_wm_is_scalar_result( GLuint opcode )
{
switch (opcode) {
case OPCODE_COS:
case OPCODE_EX2:
case OPCODE_LG2:
case OPCODE_POW:
case OPCODE_RCP:
case OPCODE_RSQ:
case OPCODE_SIN:
case OPCODE_DP3:
case OPCODE_DP4:
case OPCODE_DPH:
case OPCODE_DST:
return 1;
default:
return 0;
}
}
/**
* Do GPU code generation for a non-GLSL shader. Non-GLSL shaders have
* no flow control instructions, so we can more readily do SSA-style
* optimizations.
*/
static void
brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
{
/* Augment fragment program. Add instructions for pre- and
* post-fragment-program tasks such as interpolation and fogging.
*/
brw_wm_pass_fp(c);
/* Translate to intermediate representation. Build register usage
* chains.
*/
brw_wm_pass0(c);
/* Dead code removal.
*/
brw_wm_pass1(c);
/* Register allocation.
* Divide by two because we operate on 16 pixels at a time and require
* two GRF entries for each logical shader register.
*/
c->grf_limit = BRW_WM_MAX_GRF / 2;
brw_wm_pass2(c);
/* how many general-purpose registers are used */
c->prog_data.total_grf = c->max_wm_grf;
/* Scratch space is used for register spilling */
if (c->last_scratch) {
c->prog_data.total_scratch = c->last_scratch + 0x40;
}
else {
c->prog_data.total_scratch = 0;
}
/* Emit GEN4 code.
*/
brw_wm_emit(c);
}
/**
* All Mesa program -> GPU code generation goes through this function.
* Depending on the instructions used (i.e. flow control instructions)
* we'll use one of two code generators.
*/
static void do_wm_prog( struct brw_context *brw,
struct brw_fragment_program *fp,
struct brw_wm_prog_key *key)
{
struct brw_wm_compile *c;
const GLuint *program;
GLuint program_size;
c = brw->wm.compile_data;
if (c == NULL) {
brw->wm.compile_data = calloc(1, sizeof(*brw->wm.compile_data));
c = brw->wm.compile_data;
if (c == NULL) {
/* Ouch - big out of memory problem. Can't continue
* without triggering a segfault, no way to signal,
* so just return.
*/
return;
}
} else {
memset(c, 0, sizeof(*brw->wm.compile_data));
}
memcpy(&c->key, key, sizeof(*key));
c->fp = fp;
c->env_param = brw->intel.ctx.FragmentProgram.Parameters;
brw_init_compile(brw, &c->func);
/* temporary sanity check assertion */
ASSERT(fp->isGLSL == brw_wm_is_glsl(&c->fp->program));
/*
* Shaders which use GLSL features such as flow control are handled
* differently from "simple" shaders.
*/
if (fp->isGLSL) {
c->dispatch_width = 8;
brw_wm_glsl_emit(brw, c);
}
else {
c->dispatch_width = 16;
brw_wm_non_glsl_emit(brw, c);
}
if (INTEL_DEBUG & DEBUG_WM)
fprintf(stderr, "\n");
/* get the program
*/
program = brw_get_program(&c->func, &program_size);
dri_bo_unreference(brw->wm.prog_bo);
brw->wm.prog_bo = brw_upload_cache( &brw->cache, BRW_WM_PROG,
&c->key, sizeof(c->key),
NULL, 0,
program, program_size,
&c->prog_data,
&brw->wm.prog_data );
}
static void brw_wm_populate_key( struct brw_context *brw,
struct brw_wm_prog_key *key )
{
GLcontext *ctx = &brw->intel.ctx;
/* BRW_NEW_FRAGMENT_PROGRAM */
const struct brw_fragment_program *fp =
(struct brw_fragment_program *)brw->fragment_program;
GLboolean uses_depth = (fp->program.Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
GLuint lookup = 0;
GLuint line_aa;
GLuint i;
memset(key, 0, sizeof(*key));
/* Build the index for table lookup
*/
/* _NEW_COLOR */
if (fp->program.UsesKill ||
ctx->Color.AlphaEnabled)
lookup |= IZ_PS_KILL_ALPHATEST_BIT;
if (fp->program.Base.OutputsWritten & (1<<FRAG_RESULT_DEPTH))
lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
/* _NEW_DEPTH */
if (ctx->Depth.Test)
lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
if (ctx->Depth.Test &&
ctx->Depth.Mask) /* ?? */
lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
/* _NEW_STENCIL */
if (ctx->Stencil._Enabled) {
lookup |= IZ_STENCIL_TEST_ENABLE_BIT;
if (ctx->Stencil.WriteMask[0] ||
ctx->Stencil.WriteMask[ctx->Stencil._BackFace])
lookup |= IZ_STENCIL_WRITE_ENABLE_BIT;
}
line_aa = AA_NEVER;
/* _NEW_LINE, _NEW_POLYGON, BRW_NEW_REDUCED_PRIMITIVE */
if (ctx->Line.SmoothFlag) {
if (brw->intel.reduced_primitive == GL_LINES) {
line_aa = AA_ALWAYS;
}
else if (brw->intel.reduced_primitive == GL_TRIANGLES) {
if (ctx->Polygon.FrontMode == GL_LINE) {
line_aa = AA_SOMETIMES;
if (ctx->Polygon.BackMode == GL_LINE ||
(ctx->Polygon.CullFlag &&
ctx->Polygon.CullFaceMode == GL_BACK))
line_aa = AA_ALWAYS;
}
else if (ctx->Polygon.BackMode == GL_LINE) {
line_aa = AA_SOMETIMES;
if ((ctx->Polygon.CullFlag &&
ctx->Polygon.CullFaceMode == GL_FRONT))
line_aa = AA_ALWAYS;
}
}
}
brw_wm_lookup_iz(line_aa,
lookup,
uses_depth,
key);
/* BRW_NEW_WM_INPUT_DIMENSIONS */
key->proj_attrib_mask = brw->wm.input_size_masks[4-1];
/* _NEW_LIGHT */
key->flat_shade = (ctx->Light.ShadeModel == GL_FLAT);
/* _NEW_HINT */
key->linear_color = (ctx->Hint.PerspectiveCorrection == GL_FASTEST);
/* _NEW_TEXTURE */
for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
const struct gl_texture_unit *unit = &ctx->Texture.Unit[i];
if (unit->_ReallyEnabled) {
const struct gl_texture_object *t = unit->_Current;
const struct gl_texture_image *img = t->Image[0][t->BaseLevel];
if (img->InternalFormat == GL_YCBCR_MESA) {
key->yuvtex_mask |= 1 << i;
if (img->TexFormat->MesaFormat == MESA_FORMAT_YCBCR)
key->yuvtex_swap_mask |= 1 << i;
}
key->tex_swizzles[i] = t->_Swizzle;
}
else {
key->tex_swizzles[i] = SWIZZLE_NOOP;
}
}
/* Shadow */
key->shadowtex_mask = fp->program.Base.ShadowSamplers;
/* _NEW_BUFFERS */
/*
* Include the draw buffer origin and height so that we can calculate
* fragment position values relative to the bottom left of the drawable,
* from the incoming screen origin relative position we get as part of our
* payload.
*
* We could avoid recompiling by including this as a constant referenced by
* our program, but if we were to do that it would also be nice to handle
* getting that constant updated at batchbuffer submit time (when we
* hold the lock and know where the buffer really is) rather than at emit
* time when we don't hold the lock and are just guessing. We could also
* just avoid using this as key data if the program doesn't use
* fragment.position.
*
* This pretty much becomes moot with DRI2 and redirected buffers anyway,
* as our origins will always be zero then.
*/
if (brw->intel.driDrawable != NULL) {
key->origin_x = brw->intel.driDrawable->x;
key->origin_y = brw->intel.driDrawable->y;
key->drawable_height = brw->intel.driDrawable->h;
}
/* CACHE_NEW_VS_PROG */
key->vp_outputs_written = brw->vs.prog_data->outputs_written & DO_SETUP_BITS;
/* The unique fragment program ID */
key->program_string_id = fp->id;
}
static void brw_prepare_wm_prog(struct brw_context *brw)
{
struct brw_wm_prog_key key;
struct brw_fragment_program *fp = (struct brw_fragment_program *)
brw->fragment_program;
brw_wm_populate_key(brw, &key);
/* Make an early check for the key.
*/
dri_bo_unreference(brw->wm.prog_bo);
brw->wm.prog_bo = brw_search_cache(&brw->cache, BRW_WM_PROG,
&key, sizeof(key),
NULL, 0,
&brw->wm.prog_data);
if (brw->wm.prog_bo == NULL)
do_wm_prog(brw, fp, &key);
}
const struct brw_tracked_state brw_wm_prog = {
.dirty = {
.mesa = (_NEW_COLOR |
_NEW_DEPTH |
_NEW_HINT |
_NEW_STENCIL |
_NEW_POLYGON |
_NEW_LINE |
_NEW_LIGHT |
_NEW_BUFFERS |
_NEW_TEXTURE),
.brw = (BRW_NEW_FRAGMENT_PROGRAM |
BRW_NEW_WM_INPUT_DIMENSIONS |
BRW_NEW_REDUCED_PRIMITIVE),
.cache = CACHE_NEW_VS_PROG,
},
.prepare = brw_prepare_wm_prog
};

View File

@@ -0,0 +1,309 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#ifndef BRW_WM_H
#define BRW_WM_H
#include "shader/prog_instruction.h"
#include "brw_context.h"
#include "brw_eu.h"
#define SATURATE (1<<5)
/* A big lookup table is used to figure out which and how many
* additional regs will inserted before the main payload in the WM
* program execution. These mainly relate to depth and stencil
* processing and the early-depth-test optimization.
*/
#define IZ_PS_KILL_ALPHATEST_BIT 0x1
#define IZ_PS_COMPUTES_DEPTH_BIT 0x2
#define IZ_DEPTH_WRITE_ENABLE_BIT 0x4
#define IZ_DEPTH_TEST_ENABLE_BIT 0x8
#define IZ_STENCIL_WRITE_ENABLE_BIT 0x10
#define IZ_STENCIL_TEST_ENABLE_BIT 0x20
#define IZ_BIT_MAX 0x40
#define AA_NEVER 0
#define AA_SOMETIMES 1
#define AA_ALWAYS 2
struct brw_wm_prog_key {
GLuint source_depth_reg:3;
GLuint aa_dest_stencil_reg:3;
GLuint dest_depth_reg:3;
GLuint nr_depth_regs:3;
GLuint computes_depth:1; /* could be derived from program string */
GLuint source_depth_to_render_target:1;
GLuint flat_shade:1;
GLuint linear_color:1; /**< linear interpolation vs perspective interp */
GLuint runtime_check_aads_emit:1;
GLbitfield proj_attrib_mask; /**< one bit per fragment program attribute */
GLuint shadowtex_mask:16;
GLuint yuvtex_mask:16;
GLuint yuvtex_swap_mask:16; /* UV swapped */
GLuint tex_swizzles[BRW_MAX_TEX_UNIT];
GLuint program_string_id:32;
GLuint origin_x, origin_y;
GLuint drawable_height;
GLuint vp_outputs_written;
};
/* A bit of a glossary:
*
* brw_wm_value: A computed value or program input. Values are
* constant, they are created once and are never modified. When a
* fragment program register is written or overwritten, new values are
* created fresh, preserving the rule that values are constant.
*
* brw_wm_ref: A reference to a value. Wherever a value is used by an
* instruction or as a program output, that is tracked with an
* instance of this struct. All references to a value occur after it
* is created. After the last reference, a value is dead and can be
* discarded.
*
* brw_wm_grf: Represents a physical hardware register. May be either
* empty or hold a value. Register allocation is the process of
* assigning values to grf registers. This occurs in pass2 and the
* brw_wm_grf struct is not used before that.
*
* Fragment program registers: These are time-varying constructs that
* are hard to reason about and which we translate away in pass0. A
* single fragment program register element (e.g. temp[0].x) will be
* translated to one or more brw_wm_value structs, one for each time
* that temp[0].x is written to during the program.
*/
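/* Illustrative sketch of the value/ref relationship (not driver code):
 *
 *    MOV temp[0].x, a       ->  creates value v1 for temp[0].x
 *    ADD r.x, temp[0].x, b  ->  adds a brw_wm_ref to v1 (chained on lastuse)
 *    MOV temp[0].x, c       ->  creates a fresh value v2; v1 is dead once
 *                               its last ref has been passed
 */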
/* Used in pass2 to track register allocation.
*/
struct brw_wm_grf {
struct brw_wm_value *value;
GLuint nextuse;
};
struct brw_wm_value {
struct brw_reg hw_reg; /* emitted to this reg, may not always be there */
struct brw_wm_ref *lastuse;
struct brw_wm_grf *resident;
GLuint contributes_to_output:1;
GLuint spill_slot:16; /* if non-zero, spill immediately after calculation */
};
struct brw_wm_ref {
struct brw_reg hw_reg; /* nr filled in in pass2; everything else in pass0 */
struct brw_wm_value *value;
struct brw_wm_ref *prevuse;
GLuint unspill_reg:7; /* unspill to reg */
GLuint emitted:1;
GLuint insn:24;
};
struct brw_wm_constref {
const struct brw_wm_ref *ref;
GLfloat constval;
};
struct brw_wm_instruction {
struct brw_wm_value *dst[4];
struct brw_wm_ref *src[3][4];
GLuint opcode:8;
GLuint saturate:1;
GLuint writemask:4;
GLuint tex_unit:4; /* texture unit for TEX, TXD, TXP instructions */
GLuint tex_idx:3; /* TEXTURE_1D,2D,3D,CUBE,RECT_INDEX source target */
GLuint tex_shadow:1; /* do shadow comparison? */
GLuint eot:1; /* End of thread indicator for FB_WRITE*/
GLuint target:10; /* target binding table index for FB_WRITE*/
};
#define BRW_WM_MAX_INSN (MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS*3 + FRAG_ATTRIB_MAX + 3)
#define BRW_WM_MAX_GRF 128 /* hardware limit */
#define BRW_WM_MAX_VREG (BRW_WM_MAX_INSN * 4)
#define BRW_WM_MAX_REF (BRW_WM_MAX_INSN * 12)
#define BRW_WM_MAX_PARAM 256
#define BRW_WM_MAX_CONST 256
#define BRW_WM_MAX_KILLS MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS
#define BRW_WM_MAX_SUBROUTINE 16
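/* The MAX_VREG and MAX_REF limits above presumably follow from the worst
 * case per instruction: up to 4 destination values, and 3 source args x 4
 * components = 12 refs, for each of BRW_WM_MAX_INSN instructions.
 */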
/* New opcodes to track internal operations required for WM unit.
* These are added early so that the registers used can be tracked,
* freed and reused like those of other instructions.
*/
#define WM_PIXELXY (MAX_OPCODE)
#define WM_DELTAXY (MAX_OPCODE + 1)
#define WM_PIXELW (MAX_OPCODE + 2)
#define WM_LINTERP (MAX_OPCODE + 3)
#define WM_PINTERP (MAX_OPCODE + 4)
#define WM_CINTERP (MAX_OPCODE + 5)
#define WM_WPOSXY (MAX_OPCODE + 6)
#define WM_FB_WRITE (MAX_OPCODE + 7)
#define WM_FRONTFACING (MAX_OPCODE + 8)
#define MAX_WM_OPCODE (MAX_OPCODE + 9)
#define PROGRAM_PAYLOAD (PROGRAM_FILE_MAX)
#define PAYLOAD_DEPTH (FRAG_ATTRIB_MAX)
struct brw_wm_compile {
struct brw_compile func;
struct brw_wm_prog_key key;
struct brw_wm_prog_data prog_data;
struct brw_fragment_program *fp;
GLfloat (*env_param)[4];
enum {
START,
PASS2_DONE
} state;
/* Initial pass - translate the fragment program's instructions into our own list,
* simplifying and adding instructions for interpolation and
* framebuffer writes.
*/
struct prog_instruction prog_instructions[BRW_WM_MAX_INSN];
GLuint nr_fp_insns;
GLuint fp_temp;
GLuint fp_interp_emitted;
GLuint fp_fragcolor_emitted;
struct prog_src_register pixel_xy;
struct prog_src_register delta_xy;
struct prog_src_register pixel_w;
struct brw_wm_value vreg[BRW_WM_MAX_VREG];
GLuint nr_vreg;
struct brw_wm_value creg[BRW_WM_MAX_PARAM];
GLuint nr_creg;
struct {
struct brw_wm_value depth[4]; /* includes r0/r1 */
struct brw_wm_value input_interp[FRAG_ATTRIB_MAX];
} payload;
const struct brw_wm_ref *pass0_fp_reg[PROGRAM_PAYLOAD+1][256][4];
struct brw_wm_ref undef_ref;
struct brw_wm_value undef_value;
struct brw_wm_ref refs[BRW_WM_MAX_REF];
GLuint nr_refs;
struct brw_wm_instruction instruction[BRW_WM_MAX_INSN];
GLuint nr_insns;
struct brw_wm_constref constref[BRW_WM_MAX_CONST];
GLuint nr_constrefs;
struct brw_wm_grf pass2_grf[BRW_WM_MAX_GRF/2];
GLuint grf_limit;
GLuint max_wm_grf;
GLuint last_scratch;
GLuint cur_inst; /**< index of current instruction */
GLboolean out_of_regs; /**< ran out of GRF registers? */
/** Mapping from Mesa registers to hardware registers */
struct {
GLboolean inited;
struct brw_reg reg;
} wm_regs[PROGRAM_PAYLOAD+1][256][4];
GLboolean used_grf[BRW_WM_MAX_GRF];
GLuint first_free_grf;
struct brw_reg stack;
struct brw_reg emit_mask_reg;
GLuint tmp_regs[BRW_WM_MAX_GRF];
GLuint tmp_index;
GLuint tmp_max;
GLuint subroutines[BRW_WM_MAX_SUBROUTINE];
GLuint dispatch_width;
/** we may need up to 3 constants per instruction (if use_const_buffer) */
struct {
GLint index;
struct brw_reg reg;
} current_const[3];
};
GLuint brw_wm_nr_args( GLuint opcode );
GLuint brw_wm_is_scalar_result( GLuint opcode );
void brw_wm_pass_fp( struct brw_wm_compile *c );
void brw_wm_pass0( struct brw_wm_compile *c );
void brw_wm_pass1( struct brw_wm_compile *c );
void brw_wm_pass2( struct brw_wm_compile *c );
void brw_wm_emit( struct brw_wm_compile *c );
void brw_wm_print_value( struct brw_wm_compile *c,
struct brw_wm_value *value );
void brw_wm_print_ref( struct brw_wm_compile *c,
struct brw_wm_ref *ref );
void brw_wm_print_insn( struct brw_wm_compile *c,
struct brw_wm_instruction *inst );
void brw_wm_print_program( struct brw_wm_compile *c,
const char *stage );
void brw_wm_lookup_iz( GLuint line_aa,
GLuint lookup,
GLboolean ps_uses_depth,
struct brw_wm_prog_key *key );
GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp);
void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c);
void emit_ddxy(struct brw_compile *p,
const struct brw_reg *dst,
GLuint mask,
GLboolean is_ddx,
const struct brw_reg *arg0);
#endif

View File

@ -0,0 +1,174 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "brw_context.h"
#include "brw_wm.h"
void brw_wm_print_value( struct brw_wm_compile *c,
struct brw_wm_value *value )
{
assert(value);
if (c->state >= PASS2_DONE)
brw_print_reg(value->hw_reg);
else if( value == &c->undef_value )
_mesa_printf("undef");
else if( value - c->vreg >= 0 &&
value - c->vreg < BRW_WM_MAX_VREG)
_mesa_printf("r%d", value - c->vreg);
else if (value - c->creg >= 0 &&
value - c->creg < BRW_WM_MAX_PARAM)
_mesa_printf("c%d", value - c->creg);
else if (value - c->payload.input_interp >= 0 &&
value - c->payload.input_interp < FRAG_ATTRIB_MAX)
_mesa_printf("i%d", value - c->payload.input_interp);
else if (value - c->payload.depth >= 0 &&
value - c->payload.depth < FRAG_ATTRIB_MAX)
_mesa_printf("d%d", value - c->payload.depth);
else
_mesa_printf("?");
}
void brw_wm_print_ref( struct brw_wm_compile *c,
struct brw_wm_ref *ref )
{
struct brw_reg hw_reg = ref->hw_reg;
if (ref->unspill_reg)
_mesa_printf("UNSPILL(%x)/", ref->value->spill_slot);
if (c->state >= PASS2_DONE)
brw_print_reg(ref->hw_reg);
else {
_mesa_printf("%s", hw_reg.negate ? "-" : "");
_mesa_printf("%s", hw_reg.abs ? "abs/" : "");
brw_wm_print_value(c, ref->value);
if ((hw_reg.nr&1) || hw_reg.subnr) {
_mesa_printf("->%d.%d", (hw_reg.nr&1), hw_reg.subnr);
}
}
}
void brw_wm_print_insn( struct brw_wm_compile *c,
struct brw_wm_instruction *inst )
{
GLuint i, arg;
GLuint nr_args = brw_wm_nr_args(inst->opcode);
_mesa_printf("[");
for (i = 0; i < 4; i++) {
if (inst->dst[i]) {
brw_wm_print_value(c, inst->dst[i]);
if (inst->dst[i]->spill_slot)
_mesa_printf("/SPILL(%x)",inst->dst[i]->spill_slot);
}
else
_mesa_printf("#");
if (i < 3)
_mesa_printf(",");
}
_mesa_printf("]");
if (inst->writemask != WRITEMASK_XYZW)
_mesa_printf(".%s%s%s%s",
GET_BIT(inst->writemask, 0) ? "x" : "",
GET_BIT(inst->writemask, 1) ? "y" : "",
GET_BIT(inst->writemask, 2) ? "z" : "",
GET_BIT(inst->writemask, 3) ? "w" : "");
switch (inst->opcode) {
case WM_PIXELXY:
_mesa_printf(" = PIXELXY");
break;
case WM_DELTAXY:
_mesa_printf(" = DELTAXY");
break;
case WM_PIXELW:
_mesa_printf(" = PIXELW");
break;
case WM_WPOSXY:
_mesa_printf(" = WPOSXY");
break;
case WM_PINTERP:
_mesa_printf(" = PINTERP");
break;
case WM_LINTERP:
_mesa_printf(" = LINTERP");
break;
case WM_CINTERP:
_mesa_printf(" = CINTERP");
break;
case WM_FB_WRITE:
_mesa_printf(" = FB_WRITE");
break;
case WM_FRONTFACING:
_mesa_printf(" = FRONTFACING");
break;
default:
_mesa_printf(" = %s", _mesa_opcode_string(inst->opcode));
break;
}
if (inst->saturate)
_mesa_printf("_SAT");
for (arg = 0; arg < nr_args; arg++) {
_mesa_printf(" [");
for (i = 0; i < 4; i++) {
if (inst->src[arg][i]) {
brw_wm_print_ref(c, inst->src[arg][i]);
}
else
_mesa_printf("%%");
if (i < 3)
_mesa_printf(",");
else
_mesa_printf("]");
}
}
_mesa_printf("\n");
}
void brw_wm_print_program( struct brw_wm_compile *c,
const char *stage )
{
GLuint insn;
_mesa_printf("%s:\n", stage);
for (insn = 0; insn < c->nr_insns; insn++)
brw_wm_print_insn(c, &c->instruction[insn]);
_mesa_printf("\n");
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,157 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "main/mtypes.h"
#include "brw_wm.h"
#undef P /* promoted depth */
#undef C /* computed */
#undef N /* non-promoted? */
#define P 0
#define C 1
#define N 2
const struct {
GLuint mode:2;
GLuint sd_present:1;
GLuint sd_to_rt:1;
GLuint dd_present:1;
GLuint ds_present:1;
} wm_iz_table[IZ_BIT_MAX] =
{
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 0 },
{ N, 0, 1, 0, 0 },
{ N, 0, 1, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ C, 0, 1, 1, 0 },
{ C, 0, 1, 1, 0 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 0 },
{ C, 0, 1, 1, 0 },
{ C, 0, 1, 1, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 0 },
{ N, 0, 1, 0, 0 },
{ N, 0, 1, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ C, 0, 1, 1, 0 },
{ C, 0, 1, 1, 0 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 0 },
{ C, 0, 1, 1, 0 },
{ C, 0, 1, 1, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 1 },
{ N, 0, 1, 0, 1 },
{ N, 0, 1, 0, 1 },
{ P, 0, 0, 0, 0 },
{ P, 0, 0, 0, 0 },
{ C, 0, 1, 1, 1 },
{ C, 0, 1, 1, 1 },
{ P, 0, 0, 0, 0 },
{ N, 1, 1, 0, 1 },
{ C, 0, 1, 1, 1 },
{ C, 0, 1, 1, 1 },
{ P, 0, 0, 0, 0 },
{ C, 0, 0, 0, 1 },
{ P, 0, 0, 0, 0 },
{ C, 0, 1, 0, 1 },
{ P, 0, 0, 0, 0 },
{ C, 1, 1, 0, 1 },
{ C, 0, 1, 0, 1 },
{ C, 0, 1, 0, 1 },
{ P, 0, 0, 0, 0 },
{ C, 1, 1, 1, 1 },
{ C, 0, 1, 1, 1 },
{ C, 0, 1, 1, 1 },
{ P, 0, 0, 0, 0 },
{ C, 1, 1, 1, 1 },
{ C, 0, 1, 1, 1 },
{ C, 0, 1, 1, 1 }
};
/**
* \param line_aa AA_NEVER, AA_ALWAYS or AA_SOMETIMES
* \param lookup bitmask of IZ_* flags
*/
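/* Worked example (illustrative only): with
 * lookup = IZ_DEPTH_TEST_ENABLE_BIT | IZ_DEPTH_WRITE_ENABLE_BIT (entry 0xc,
 * a "P" row) and ps_uses_depth set, source depth takes regs 2-3, nothing
 * else is added, so reg ends at 4 and key->nr_depth_regs = (4+1)/2 = 2.
 */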
void brw_wm_lookup_iz( GLuint line_aa,
GLuint lookup,
GLboolean ps_uses_depth,
struct brw_wm_prog_key *key )
{
GLuint reg = 2;
assert (lookup < IZ_BIT_MAX);
if (lookup & IZ_PS_COMPUTES_DEPTH_BIT)
key->computes_depth = 1;
if (wm_iz_table[lookup].sd_present || ps_uses_depth) {
key->source_depth_reg = reg;
reg += 2;
}
if (wm_iz_table[lookup].sd_to_rt)
key->source_depth_to_render_target = 1;
if (wm_iz_table[lookup].ds_present || line_aa != AA_NEVER) {
key->aa_dest_stencil_reg = reg;
key->runtime_check_aads_emit = (!wm_iz_table[lookup].ds_present &&
line_aa == AA_SOMETIMES);
reg++;
}
if (wm_iz_table[lookup].dd_present) {
key->dest_depth_reg = reg;
reg+=2;
}
key->nr_depth_regs = (reg+1)/2;
}

View File

@ -0,0 +1,442 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "brw_context.h"
#include "brw_wm.h"
#include "shader/prog_parameter.h"
/***********************************************************************
*/
static struct brw_wm_ref *get_ref( struct brw_wm_compile *c )
{
assert(c->nr_refs < BRW_WM_MAX_REF);
return &c->refs[c->nr_refs++];
}
static struct brw_wm_value *get_value( struct brw_wm_compile *c)
{
assert(c->nr_vreg < BRW_WM_MAX_VREG);
return &c->vreg[c->nr_vreg++];
}
/** return pointer to a newly allocated instruction */
static struct brw_wm_instruction *get_instruction( struct brw_wm_compile *c )
{
assert(c->nr_insns < BRW_WM_MAX_INSN);
return &c->instruction[c->nr_insns++];
}
/***********************************************************************
*/
/** Init the "undef" register */
static void pass0_init_undef( struct brw_wm_compile *c)
{
struct brw_wm_ref *ref = &c->undef_ref;
ref->value = &c->undef_value;
ref->hw_reg = brw_vec8_grf(0, 0);
ref->insn = 0;
ref->prevuse = NULL;
}
/** Set a FP register to a value */
static void pass0_set_fpreg_value( struct brw_wm_compile *c,
GLuint file,
GLuint idx,
GLuint component,
struct brw_wm_value *value )
{
struct brw_wm_ref *ref = get_ref(c);
ref->value = value;
ref->hw_reg = brw_vec8_grf(0, 0);
ref->insn = 0;
ref->prevuse = NULL;
c->pass0_fp_reg[file][idx][component] = ref;
}
/** Set a FP register to a ref */
static void pass0_set_fpreg_ref( struct brw_wm_compile *c,
GLuint file,
GLuint idx,
GLuint component,
const struct brw_wm_ref *src_ref )
{
c->pass0_fp_reg[file][idx][component] = src_ref;
}
static const struct brw_wm_ref *get_param_ref( struct brw_wm_compile *c,
const GLfloat *param_ptr )
{
GLuint i = c->prog_data.nr_params++;
if (i >= BRW_WM_MAX_PARAM) {
_mesa_printf("%s: out of params\n", __FUNCTION__);
c->prog_data.error = 1;
return NULL;
}
else {
struct brw_wm_ref *ref = get_ref(c);
c->prog_data.param[i] = param_ptr;
c->nr_creg = (i+16)/16;
/* Push the offsets into hw_reg. These will be added to the
* real register numbers once one is allocated in pass2.
*/
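/* Illustrative example: param i = 17 lands in creg[17/16] = creg[1]; its
 * offset within that pair of GRFs is grf 0, subreg 1 (17 & 8 == 0,
 * 17 % 8 == 1), which pass2 later adds to the allocated base register.
 */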
ref->hw_reg = brw_vec1_grf((i&8)?1:0, i%8);
ref->value = &c->creg[i/16];
ref->insn = 0;
ref->prevuse = NULL;
return ref;
}
}
/** Return a ref to a constant/literal value */
static const struct brw_wm_ref *get_const_ref( struct brw_wm_compile *c,
const GLfloat *constval )
{
GLuint i;
/* Search for an existing const value matching the request:
*/
for (i = 0; i < c->nr_constrefs; i++) {
if (c->constref[i].constval == *constval)
return c->constref[i].ref;
}
/* Else try to add a new one:
*/
if (c->nr_constrefs < BRW_WM_MAX_CONST) {
GLuint i = c->nr_constrefs++;
/* A constant is a special type of parameter:
*/
c->constref[i].constval = *constval;
c->constref[i].ref = get_param_ref(c, constval);
return c->constref[i].ref;
}
else {
_mesa_printf("%s: out of constrefs\n", __FUNCTION__);
c->prog_data.error = 1;
return NULL;
}
}
/* Lookup our internal registers
*/
static const struct brw_wm_ref *pass0_get_reg( struct brw_wm_compile *c,
GLuint file,
GLuint idx,
GLuint component )
{
const struct brw_wm_ref *ref = c->pass0_fp_reg[file][idx][component];
if (!ref) {
switch (file) {
case PROGRAM_INPUT:
case PROGRAM_PAYLOAD:
case PROGRAM_TEMPORARY:
case PROGRAM_OUTPUT:
case PROGRAM_VARYING:
break;
case PROGRAM_LOCAL_PARAM:
ref = get_param_ref(c, &c->fp->program.Base.LocalParams[idx][component]);
break;
case PROGRAM_ENV_PARAM:
ref = get_param_ref(c, &c->env_param[idx][component]);
break;
case PROGRAM_STATE_VAR:
case PROGRAM_UNIFORM:
case PROGRAM_CONSTANT:
case PROGRAM_NAMED_PARAM: {
struct gl_program_parameter_list *plist = c->fp->program.Base.Parameters;
/* There's something really hokey about parameters parsed in
* arb programs - they all end up in here, whether they be
* state values, parameters or constants. This duplicates the
* structure above & also seems to subvert the limits set for
* each type of constant/param.
*/
switch (plist->Parameters[idx].Type) {
case PROGRAM_NAMED_PARAM:
case PROGRAM_CONSTANT:
/* These are invariant:
*/
ref = get_const_ref(c, &plist->ParameterValues[idx][component]);
break;
case PROGRAM_STATE_VAR:
case PROGRAM_UNIFORM:
/* These may change from run to run:
*/
ref = get_param_ref(c, &plist->ParameterValues[idx][component] );
break;
default:
assert(0);
break;
}
break;
}
default:
assert(0);
break;
}
c->pass0_fp_reg[file][idx][component] = ref;
}
if (!ref)
ref = &c->undef_ref;
return ref;
}
/***********************************************************************
* Straight translation to internal instruction format
*/
static void pass0_set_dst( struct brw_wm_compile *c,
struct brw_wm_instruction *out,
const struct prog_instruction *inst,
GLuint writemask )
{
const struct prog_dst_register *dst = &inst->DstReg;
GLuint i;
for (i = 0; i < 4; i++) {
if (writemask & (1<<i)) {
out->dst[i] = get_value(c);
pass0_set_fpreg_value(c, dst->File, dst->Index, i, out->dst[i]);
}
}
out->writemask = writemask;
}
static const struct brw_wm_ref *get_fp_src_reg_ref( struct brw_wm_compile *c,
struct prog_src_register src,
GLuint i )
{
GLuint component = GET_SWZ(src.Swizzle,i);
const struct brw_wm_ref *src_ref;
static const GLfloat const_zero = 0.0;
static const GLfloat const_one = 1.0;
if (component == SWIZZLE_ZERO)
src_ref = get_const_ref(c, &const_zero);
else if (component == SWIZZLE_ONE)
src_ref = get_const_ref(c, &const_one);
else
src_ref = pass0_get_reg(c, src.File, src.Index, component);
return src_ref;
}
static struct brw_wm_ref *get_new_ref( struct brw_wm_compile *c,
struct prog_src_register src,
GLuint i,
struct brw_wm_instruction *insn)
{
const struct brw_wm_ref *ref = get_fp_src_reg_ref(c, src, i);
struct brw_wm_ref *newref = get_ref(c);
newref->value = ref->value;
newref->hw_reg = ref->hw_reg;
if (insn) {
newref->insn = insn - c->instruction;
newref->prevuse = newref->value->lastuse;
newref->value->lastuse = newref;
}
if (src.Negate & (1 << i))
newref->hw_reg.negate ^= 1;
if (src.Abs) {
newref->hw_reg.negate = 0;
newref->hw_reg.abs = 1;
}
return newref;
}
static void
translate_insn(struct brw_wm_compile *c,
const struct prog_instruction *inst)
{
struct brw_wm_instruction *out = get_instruction(c);
GLuint writemask = inst->DstReg.WriteMask;
GLuint nr_args = brw_wm_nr_args(inst->Opcode);
GLuint i, j;
/* Copy some data out of the instruction
*/
out->opcode = inst->Opcode;
out->saturate = (inst->SaturateMode != SATURATE_OFF);
out->tex_unit = inst->TexSrcUnit;
out->tex_idx = inst->TexSrcTarget;
out->tex_shadow = inst->TexShadow;
out->eot = inst->Aux & 1;
out->target = inst->Aux >> 1;
/* Args:
*/
for (i = 0; i < nr_args; i++) {
for (j = 0; j < 4; j++) {
out->src[i][j] = get_new_ref(c, inst->SrcReg[i], j, out);
}
}
/* Dst:
*/
pass0_set_dst(c, out, inst, writemask);
}
/***********************************************************************
* Optimize moves and swizzles away:
*/
static void pass0_precalc_mov( struct brw_wm_compile *c,
const struct prog_instruction *inst )
{
const struct prog_dst_register *dst = &inst->DstReg;
GLuint writemask = inst->DstReg.WriteMask;
struct brw_wm_ref *refs[4];
GLuint i;
/* Get the effect of a MOV by manipulating our register table:
* First get all refs, then assign refs. This ensures that "in-place"
* swizzles such as:
* MOV t, t.xxyx
* are handled correctly. Previously, these two steps were done in
* one loop and the above case was incorrectly handled.
*/
for (i = 0; i < 4; i++) {
refs[i] = get_new_ref(c, inst->SrcReg[0], i, NULL);
}
for (i = 0; i < 4; i++) {
if (writemask & (1 << i)) {
pass0_set_fpreg_ref( c, dst->File, dst->Index, i, refs[i]);
}
}
}
/* Initialize payload "registers".
*/
static void pass0_init_payload( struct brw_wm_compile *c )
{
GLuint i;
for (i = 0; i < 4; i++) {
GLuint j = i >= c->key.nr_depth_regs ? 0 : i;
pass0_set_fpreg_value( c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i,
&c->payload.depth[j] );
}
#if 0
/* This seems to be an alternative to the INTERP_WPOS stuff I do
* elsewhere:
*/
if (c->key.source_depth_reg)
pass0_set_fpreg_value(c, PROGRAM_INPUT, FRAG_ATTRIB_WPOS, 2,
&c->payload.depth[c->key.source_depth_reg/2]);
#endif
for (i = 0; i < FRAG_ATTRIB_MAX; i++)
pass0_set_fpreg_value( c, PROGRAM_PAYLOAD, i, 0,
&c->payload.input_interp[i] );
}
/***********************************************************************
* PASS 0
*
* Work forwards to give each calculated value a unique number. Where
* an instruction produces duplicate values (eg DP3), all are given
* the same number.
*
* Translate away swizzling and eliminate non-saturating moves.
*/
void brw_wm_pass0( struct brw_wm_compile *c )
{
GLuint insn;
c->nr_vreg = 0;
c->nr_insns = 0;
pass0_init_undef(c);
pass0_init_payload(c);
for (insn = 0; insn < c->nr_fp_insns; insn++) {
const struct prog_instruction *inst = &c->prog_instructions[insn];
/* Optimize away moves, otherwise emit translated instruction:
*/
switch (inst->Opcode) {
case OPCODE_MOV:
case OPCODE_SWZ:
if (!inst->SaturateMode) {
pass0_precalc_mov(c, inst);
}
else {
translate_insn(c, inst);
}
break;
default:
translate_insn(c, inst);
break;
}
}
if (INTEL_DEBUG & DEBUG_WM) {
brw_wm_print_program(c, "pass0");
}
}

View File

@ -0,0 +1,291 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "brw_context.h"
#include "brw_wm.h"
static GLuint get_tracked_mask(struct brw_wm_compile *c,
struct brw_wm_instruction *inst)
{
GLuint i;
for (i = 0; i < 4; i++) {
if (inst->writemask & (1<<i)) {
if (!inst->dst[i]->contributes_to_output) {
inst->writemask &= ~(1<<i);
inst->dst[i] = 0;
}
}
}
return inst->writemask;
}
/* Remove a reference from a value's usage chain.
*/
static void unlink_ref(struct brw_wm_ref *ref)
{
struct brw_wm_value *value = ref->value;
if (ref == value->lastuse) {
value->lastuse = ref->prevuse;
}
else {
struct brw_wm_ref *i = value->lastuse;
while (i->prevuse != ref) i = i->prevuse;
i->prevuse = ref->prevuse;
}
}
static void track_arg(struct brw_wm_compile *c,
struct brw_wm_instruction *inst,
GLuint arg,
GLuint readmask)
{
GLuint i;
for (i = 0; i < 4; i++) {
struct brw_wm_ref *ref = inst->src[arg][i];
if (ref) {
if (readmask & (1<<i)) {
ref->value->contributes_to_output = 1;
}
else {
unlink_ref(ref);
inst->src[arg][i] = NULL;
}
}
}
}
static GLuint get_texcoord_mask( GLuint tex_idx )
{
switch (tex_idx) {
case TEXTURE_1D_INDEX:
return WRITEMASK_X;
case TEXTURE_2D_INDEX:
return WRITEMASK_XY;
case TEXTURE_3D_INDEX:
return WRITEMASK_XYZ;
case TEXTURE_CUBE_INDEX:
return WRITEMASK_XYZ;
case TEXTURE_RECT_INDEX:
return WRITEMASK_XY;
default: return 0;
}
}
/* Step two: Basically this is dead code elimination.
*
* Iterate backwards over instructions, noting which values
* contribute to the final result. Adjust writemasks to only
* calculate these values.
*/
void brw_wm_pass1( struct brw_wm_compile *c )
{
GLint insn;
for (insn = c->nr_insns-1; insn >= 0; insn--) {
struct brw_wm_instruction *inst = &c->instruction[insn];
GLuint writemask;
GLuint read0, read1, read2;
if (inst->opcode == OPCODE_KIL) {
track_arg(c, inst, 0, WRITEMASK_XYZW); /* All args contribute to final */
continue;
}
if (inst->opcode == WM_FB_WRITE) {
track_arg(c, inst, 0, WRITEMASK_XYZW);
track_arg(c, inst, 1, WRITEMASK_XYZW);
if (c->key.source_depth_to_render_target &&
c->key.computes_depth)
track_arg(c, inst, 2, WRITEMASK_Z);
else
track_arg(c, inst, 2, 0);
continue;
}
/* Lookup all the registers which were written by this
* instruction and get a mask of those that contribute to the output:
*/
writemask = get_tracked_mask(c, inst);
if (!writemask) {
GLuint arg;
for (arg = 0; arg < 3; arg++)
track_arg(c, inst, arg, 0);
continue;
}
read0 = 0;
read1 = 0;
read2 = 0;
/* Mark all inputs which contribute to the marked outputs:
*/
switch (inst->opcode) {
case OPCODE_ABS:
case OPCODE_FLR:
case OPCODE_FRC:
case OPCODE_MOV:
case OPCODE_SWZ:
case OPCODE_TRUNC:
read0 = writemask;
break;
case OPCODE_SUB:
case OPCODE_SLT:
case OPCODE_SLE:
case OPCODE_SGE:
case OPCODE_SGT:
case OPCODE_SEQ:
case OPCODE_SNE:
case OPCODE_ADD:
case OPCODE_MAX:
case OPCODE_MIN:
case OPCODE_MUL:
read0 = writemask;
read1 = writemask;
break;
case OPCODE_DDX:
case OPCODE_DDY:
read0 = writemask;
break;
case OPCODE_MAD:
case OPCODE_CMP:
case OPCODE_LRP:
read0 = writemask;
read1 = writemask;
read2 = writemask;
break;
case OPCODE_XPD:
if (writemask & WRITEMASK_X) read0 |= WRITEMASK_YZ;
if (writemask & WRITEMASK_Y) read0 |= WRITEMASK_XZ;
if (writemask & WRITEMASK_Z) read0 |= WRITEMASK_XY;
read1 = read0;
break;
case OPCODE_COS:
case OPCODE_EX2:
case OPCODE_LG2:
case OPCODE_RCP:
case OPCODE_RSQ:
case OPCODE_SIN:
case OPCODE_SCS:
case WM_CINTERP:
case WM_PIXELXY:
read0 = WRITEMASK_X;
break;
case OPCODE_POW:
read0 = WRITEMASK_X;
read1 = WRITEMASK_X;
break;
case OPCODE_TEX:
case OPCODE_TXP:
read0 = get_texcoord_mask(inst->tex_idx);
if (inst->tex_shadow)
read0 |= WRITEMASK_Z;
break;
case OPCODE_TXB:
/* Shadow ignored for txb.
*/
read0 = get_texcoord_mask(inst->tex_idx) | WRITEMASK_W;
break;
case WM_WPOSXY:
read0 = writemask & WRITEMASK_XY;
break;
case WM_DELTAXY:
read0 = writemask & WRITEMASK_XY;
read1 = WRITEMASK_X;
break;
case WM_PIXELW:
read0 = WRITEMASK_X;
read1 = WRITEMASK_XY;
break;
case WM_LINTERP:
read0 = WRITEMASK_X;
read1 = WRITEMASK_XY;
break;
case WM_PINTERP:
read0 = WRITEMASK_X; /* interpolant */
read1 = WRITEMASK_XY; /* deltas */
read2 = WRITEMASK_W; /* pixel w */
break;
case OPCODE_DP3:
read0 = WRITEMASK_XYZ;
read1 = WRITEMASK_XYZ;
break;
case OPCODE_DPH:
read0 = WRITEMASK_XYZ;
read1 = WRITEMASK_XYZW;
break;
case OPCODE_DP4:
read0 = WRITEMASK_XYZW;
read1 = WRITEMASK_XYZW;
break;
case OPCODE_LIT:
read0 = WRITEMASK_XYW;
break;
case OPCODE_DST:
case WM_FRONTFACING:
case OPCODE_KIL_NV:
default:
break;
}
track_arg(c, inst, 0, read0);
track_arg(c, inst, 1, read1);
track_arg(c, inst, 2, read2);
}
if (INTEL_DEBUG & DEBUG_WM) {
brw_wm_print_program(c, "pass1");
}
}

View File

@ -0,0 +1,343 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "brw_context.h"
#include "brw_wm.h"
/* Use these to force spilling so that this functionality can be
* tested with known-good examples rather than having to construct new
* tests.
*/
#define TEST_PAYLOAD_SPILLS 0
#define TEST_DST_SPILLS 0
static void spill_value(struct brw_wm_compile *c,
struct brw_wm_value *value);
static void prealloc_reg(struct brw_wm_compile *c,
struct brw_wm_value *value,
GLuint reg)
{
if (value->lastuse) {
/* Set nextuse to zero, it will be corrected by
* update_register_usage().
*/
c->pass2_grf[reg].value = value;
c->pass2_grf[reg].nextuse = 0;
value->resident = &c->pass2_grf[reg];
value->hw_reg = brw_vec8_grf(reg*2, 0);
if (TEST_PAYLOAD_SPILLS)
spill_value(c, value);
}
}
/* Initialize all the register values. Do the initial setup
* calculations for interpolants.
*/
static void init_registers( struct brw_wm_compile *c )
{
GLuint nr_interp_regs = 0;
GLuint i = 0;
GLuint j;
for (j = 0; j < c->grf_limit; j++)
c->pass2_grf[j].nextuse = BRW_WM_MAX_INSN;
for (j = 0; j < c->key.nr_depth_regs; j++)
prealloc_reg(c, &c->payload.depth[j], i++);
for (j = 0; j < c->nr_creg; j++)
prealloc_reg(c, &c->creg[j], i++);
for (j = 0; j < FRAG_ATTRIB_MAX; j++) {
if (c->key.vp_outputs_written & (1<<j)) {
int fp_index;
if (j >= VERT_RESULT_VAR0)
fp_index = j - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
else if (j <= VERT_RESULT_TEX7)
fp_index = j;
else
fp_index = -1;
nr_interp_regs++;
if (fp_index >= 0)
prealloc_reg(c, &c->payload.input_interp[fp_index], i++);
}
}
assert(nr_interp_regs >= 1);
c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
c->prog_data.urb_read_length = nr_interp_regs * 2;
c->prog_data.curb_read_length = c->nr_creg * 2;
c->max_wm_grf = i * 2;
}
/* Update the nextuse value for each register in our file.
*/
static void update_register_usage(struct brw_wm_compile *c,
GLuint thisinsn)
{
GLuint i;
for (i = 1; i < c->grf_limit; i++) {
struct brw_wm_grf *grf = &c->pass2_grf[i];
/* Only search those which can change:
*/
if (grf->nextuse < thisinsn) {
const struct brw_wm_ref *ref = grf->value->lastuse;
/* Has last use of value been passed?
*/
if (ref->insn < thisinsn) {
grf->value->resident = 0;
grf->value = 0;
grf->nextuse = BRW_WM_MAX_INSN;
}
else {
/* Else loop through chain to update:
*/
while (ref->prevuse && ref->prevuse->insn >= thisinsn)
ref = ref->prevuse;
grf->nextuse = ref->insn;
}
}
}
}
static void spill_value(struct brw_wm_compile *c,
struct brw_wm_value *value)
{
/* Allocate a spill slot. Note that allocations start from 0x40 -
* the first slot is reserved to mean "undef" in brw_wm_emit.c
*/
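/* Illustrative: the first value spilled gets spill_slot 0x40, the next
 * 0x80, and so on; slot 0 is never handed out, so spill_slot == 0 always
 * means "not spilled".
 */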
if (!value->spill_slot) {
c->last_scratch += 0x40;
value->spill_slot = c->last_scratch;
}
/* The spill will be done in brw_wm_emit.c immediately after the
* value is calculated, so we can just take this reg without any
* further work.
*/
value->resident->value = NULL;
value->resident->nextuse = BRW_WM_MAX_INSN;
value->resident = NULL;
}
/* Search for contiguous region with the most distant nearest
* member. Free regs count as very distant.
*
* TODO: implement spill-to-reg so that we can rearrange discontiguous
* free regs and then spill the oldest non-free regs in sequence.
* This would mean inserting instructions in this pass.
*/
static GLuint search_contiguous_regs(struct brw_wm_compile *c,
GLuint nr,
GLuint thisinsn)
{
struct brw_wm_grf *grf = c->pass2_grf;
GLuint furthest = 0;
GLuint reg = 0;
GLuint i, j;
/* Start search at 1: r0 is special and can't be used or spilled.
*/
for (i = 1; i < c->grf_limit && furthest < BRW_WM_MAX_INSN; i++) {
GLuint group_nextuse = BRW_WM_MAX_INSN;
for (j = 0; j < nr; j++) {
if (grf[i+j].nextuse < group_nextuse)
group_nextuse = grf[i+j].nextuse;
}
if (group_nextuse > furthest) {
furthest = group_nextuse;
reg = i;
}
}
assert(furthest != thisinsn);
/* Any non-empty regs will need to be spilled:
*/
for (j = 0; j < nr; j++)
if (grf[reg+j].value)
spill_value(c, grf[reg+j].value);
return reg;
}
static void alloc_contiguous_dest(struct brw_wm_compile *c,
struct brw_wm_value *dst[],
GLuint nr,
GLuint thisinsn)
{
GLuint reg = search_contiguous_regs(c, nr, thisinsn);
GLuint i;
for (i = 0; i < nr; i++) {
if (!dst[i]) {
/* Need to grab a dummy value in TEX case. Don't introduce
* it into the tracking scheme.
*/
dst[i] = &c->vreg[c->nr_vreg++];
}
else {
assert(!dst[i]->resident);
assert(c->pass2_grf[reg+i].nextuse != thisinsn);
c->pass2_grf[reg+i].value = dst[i];
c->pass2_grf[reg+i].nextuse = thisinsn;
dst[i]->resident = &c->pass2_grf[reg+i];
}
dst[i]->hw_reg = brw_vec8_grf((reg+i)*2, 0);
}
if ((reg+nr)*2 > c->max_wm_grf)
c->max_wm_grf = (reg+nr) * 2;
}
static void load_args(struct brw_wm_compile *c,
struct brw_wm_instruction *inst)
{
GLuint thisinsn = inst - c->instruction;
GLuint i,j;
for (i = 0; i < 3; i++) {
for (j = 0; j < 4; j++) {
struct brw_wm_ref *ref = inst->src[i][j];
if (ref) {
if (!ref->value->resident) {
/* Need to bring the value in from scratch space. The code for
* this will be done in brw_wm_emit.c, here we just do the
* register allocation and mark the ref as requiring a fill.
*/
GLuint reg = search_contiguous_regs(c, 1, thisinsn);
c->pass2_grf[reg].value = ref->value;
c->pass2_grf[reg].nextuse = thisinsn;
ref->value->resident = &c->pass2_grf[reg];
/* Note that a fill is required:
*/
ref->unspill_reg = reg*2;
}
/* Adjust the hw_reg to point at the value's current location:
*/
assert(ref->value == ref->value->resident->value);
ref->hw_reg.nr += (ref->value->resident - c->pass2_grf) * 2;
}
}
}
}
/* Step 3: Work forwards once again. Perform register allocations,
* taking into account instructions like TEX which require contiguous
* result registers. Where necessary spill registers to scratch space
* and reload later.
*/
void brw_wm_pass2( struct brw_wm_compile *c )
{
GLuint insn;
GLuint i;
init_registers(c);
for (insn = 0; insn < c->nr_insns; insn++) {
struct brw_wm_instruction *inst = &c->instruction[insn];
/* Update registers' nextuse values:
*/
update_register_usage(c, insn);
/* May need to unspill some args.
*/
load_args(c, inst);
/* Allocate registers to hold results:
*/
switch (inst->opcode) {
case OPCODE_TEX:
case OPCODE_TXB:
case OPCODE_TXP:
alloc_contiguous_dest(c, inst->dst, 4, insn);
break;
default:
for (i = 0; i < 4; i++) {
if (inst->writemask & (1<<i)) {
assert(inst->dst[i]);
alloc_contiguous_dest(c, &inst->dst[i], 1, insn);
}
}
break;
}
if (TEST_DST_SPILLS && inst->opcode != WM_PIXELXY) {
for (i = 0; i < 4; i++)
if (inst->dst[i])
spill_value(c, inst->dst[i]);
}
}
if (INTEL_DEBUG & DEBUG_WM) {
brw_wm_print_program(c, "pass2");
}
c->state = PASS2_DONE;
if (INTEL_DEBUG & DEBUG_WM) {
brw_wm_print_program(c, "pass2/done");
}
}

View File

@ -0,0 +1,369 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "brw_context.h"
#include "brw_state.h"
#include "brw_defines.h"
#include "main/macros.h"
/* Samplers aren't strictly wm state from the hardware's perspective,
* but that is the only situation in which we use them in this driver.
*/
/* The brw (and related graphics cores) do not support GL_CLAMP. The
* Intel drivers for "other operating systems" implement GL_CLAMP as
* GL_CLAMP_TO_EDGE, so the same is done here.
*/
static GLuint translate_wrap_mode( GLenum wrap )
{
switch( wrap ) {
case GL_REPEAT:
return BRW_TEXCOORDMODE_WRAP;
case GL_CLAMP:
return BRW_TEXCOORDMODE_CLAMP;
case GL_CLAMP_TO_EDGE:
return BRW_TEXCOORDMODE_CLAMP; /* conform likes it this way */
case GL_CLAMP_TO_BORDER:
return BRW_TEXCOORDMODE_CLAMP_BORDER;
case GL_MIRRORED_REPEAT:
return BRW_TEXCOORDMODE_MIRROR;
default:
return BRW_TEXCOORDMODE_WRAP;
}
}
static GLuint U_FIXED(GLfloat value, GLuint frac_bits)
{
value *= (1<<frac_bits);
return value < 0 ? 0 : value;
}
static GLint S_FIXED(GLfloat value, GLuint frac_bits)
{
return value * (1<<frac_bits);
}
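/* Worked examples (illustrative): S_FIXED(2.5f, 6) == 160, i.e. 2.5 in
 * signed .6 fixed point; U_FIXED(13.0f, 6) == 832, and negative inputs to
 * U_FIXED clamp to 0. These are used below for LOD bias and min/max LOD.
 */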
static dri_bo *upload_default_color( struct brw_context *brw,
const GLfloat *color )
{
struct brw_sampler_default_color sdc;
COPY_4V(sdc.color, color);
return brw_cache_data( &brw->cache, BRW_SAMPLER_DEFAULT_COLOR, &sdc,
NULL, 0 );
}
struct wm_sampler_key {
int sampler_count;
struct wm_sampler_entry {
GLenum tex_target;
GLenum wrap_r, wrap_s, wrap_t;
float maxlod, minlod;
float lod_bias;
float max_aniso;
GLenum minfilter, magfilter;
GLenum comparemode, comparefunc;
dri_bo *sdc_bo;
/** If the target is a cubemap, take the context's seamless setting.
*/
GLboolean seamless_cube_map;
} sampler[BRW_MAX_TEX_UNIT];
};
/**
* Sets the sampler state for a single unit based off of the sampler key
* entry.
*/
static void brw_update_sampler_state(struct wm_sampler_entry *key,
dri_bo *sdc_bo,
struct brw_sampler_state *sampler)
{
_mesa_memset(sampler, 0, sizeof(*sampler));
switch (key->minfilter) {
case GL_NEAREST:
sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST;
sampler->ss0.mip_filter = BRW_MIPFILTER_NONE;
break;
case GL_LINEAR:
sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR;
sampler->ss0.mip_filter = BRW_MIPFILTER_NONE;
break;
case GL_NEAREST_MIPMAP_NEAREST:
sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST;
sampler->ss0.mip_filter = BRW_MIPFILTER_NEAREST;
break;
case GL_LINEAR_MIPMAP_NEAREST:
sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR;
sampler->ss0.mip_filter = BRW_MIPFILTER_NEAREST;
break;
case GL_NEAREST_MIPMAP_LINEAR:
sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST;
sampler->ss0.mip_filter = BRW_MIPFILTER_LINEAR;
break;
case GL_LINEAR_MIPMAP_LINEAR:
sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR;
sampler->ss0.mip_filter = BRW_MIPFILTER_LINEAR;
break;
default:
break;
}
/* Set Anisotropy:
*/
if (key->max_aniso > 1.0) {
sampler->ss0.min_filter = BRW_MAPFILTER_ANISOTROPIC;
sampler->ss0.mag_filter = BRW_MAPFILTER_ANISOTROPIC;
if (key->max_aniso > 2.0) {
sampler->ss3.max_aniso = MIN2((key->max_aniso - 2) / 2,
BRW_ANISORATIO_16);
}
}
else {
switch (key->magfilter) {
case GL_NEAREST:
sampler->ss0.mag_filter = BRW_MAPFILTER_NEAREST;
break;
case GL_LINEAR:
sampler->ss0.mag_filter = BRW_MAPFILTER_LINEAR;
break;
default:
break;
}
}
sampler->ss1.r_wrap_mode = translate_wrap_mode(key->wrap_r);
sampler->ss1.s_wrap_mode = translate_wrap_mode(key->wrap_s);
sampler->ss1.t_wrap_mode = translate_wrap_mode(key->wrap_t);
/* Cube-maps on 965 and later must use the same wrap mode for all 3
* coordinate dimensions. Further, only CUBE and CLAMP are valid.
*/
if (key->tex_target == GL_TEXTURE_CUBE_MAP) {
if (key->seamless_cube_map &&
(key->minfilter != GL_NEAREST || key->magfilter != GL_NEAREST)) {
sampler->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CUBE;
sampler->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CUBE;
sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CUBE;
} else {
sampler->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
sampler->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
}
} else if (key->tex_target == GL_TEXTURE_1D) {
/* There's a bug in 1D texture sampling - it actually pays
* attention to the wrap_t value, though it should not.
* Override the wrap_t value here to GL_REPEAT to keep
* any nonexistent border pixels from floating in.
*/
sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_WRAP;
}
/* Set shadow function:
*/
if (key->comparemode == GL_COMPARE_R_TO_TEXTURE_ARB) {
/* Shadowing is "enabled" by emitting a particular sampler
* message (sample_c). So we need to recompile the WM program whenever
* shadow comparison is enabled on any texture unit.
*/
sampler->ss0.shadow_function =
intel_translate_shadow_compare_func(key->comparefunc);
}
/* Set LOD bias:
*/
sampler->ss0.lod_bias = S_FIXED(CLAMP(key->lod_bias, -16, 15), 6);
sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
/* Set BaseMipLevel, MaxLOD, MinLOD:
*
* XXX: I don't think that using firstLevel, lastLevel works,
* because we always set up the surface state as if firstLevel ==
* level zero. Probably have to subtract firstLevel from each of
* these:
*/
sampler->ss0.base_level = U_FIXED(0, 1);
sampler->ss1.max_lod = U_FIXED(MIN2(MAX2(key->maxlod, 0), 13), 6);
sampler->ss1.min_lod = U_FIXED(MIN2(MAX2(key->minlod, 0), 13), 6);
sampler->ss2.default_color_pointer = sdc_bo->offset >> 5; /* reloc */
}
/** Sets up the cache key for sampler state for all texture units */
static void
brw_wm_sampler_populate_key(struct brw_context *brw,
struct wm_sampler_key *key)
{
GLcontext *ctx = &brw->intel.ctx;
int unit;
memset(key, 0, sizeof(*key));
for (unit = 0; unit < BRW_MAX_TEX_UNIT; unit++) {
if (ctx->Texture.Unit[unit]._ReallyEnabled) {
struct wm_sampler_entry *entry = &key->sampler[unit];
struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
struct gl_texture_object *texObj = texUnit->_Current;
struct intel_texture_object *intelObj = intel_texture_object(texObj);
struct gl_texture_image *firstImage =
texObj->Image[0][intelObj->firstLevel];
entry->tex_target = texObj->Target;
entry->seamless_cube_map = (texObj->Target == GL_TEXTURE_CUBE_MAP)
? ctx->Texture.CubeMapSeamless : GL_FALSE;
entry->wrap_r = texObj->WrapR;
entry->wrap_s = texObj->WrapS;
entry->wrap_t = texObj->WrapT;
entry->maxlod = texObj->MaxLod;
entry->minlod = texObj->MinLod;
entry->lod_bias = texUnit->LodBias + texObj->LodBias;
entry->max_aniso = texObj->MaxAnisotropy;
entry->minfilter = texObj->MinFilter;
entry->magfilter = texObj->MagFilter;
entry->comparemode = texObj->CompareMode;
entry->comparefunc = texObj->CompareFunc;
dri_bo_unreference(brw->wm.sdc_bo[unit]);
if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) {
float bordercolor[4] = {
texObj->BorderColor[0],
texObj->BorderColor[0],
texObj->BorderColor[0],
texObj->BorderColor[0]
};
/* GL specs that border color for depth textures is taken from the
* R channel, while the hardware uses A. Spam R into all the
* channels for safety.
*/
brw->wm.sdc_bo[unit] = upload_default_color(brw, bordercolor);
} else {
brw->wm.sdc_bo[unit] = upload_default_color(brw,
texObj->BorderColor);
}
key->sampler_count = unit + 1;
}
}
}
/* All samplers must be uploaded in a single contiguous array, which
* complicates various things. However, this is still too confusing -
* FIXME: simplify all the different new texture state flags.
*/
static void upload_wm_samplers( struct brw_context *brw )
{
GLcontext *ctx = &brw->intel.ctx;
struct wm_sampler_key key;
int i;
brw_wm_sampler_populate_key(brw, &key);
if (brw->wm.sampler_count != key.sampler_count) {
brw->wm.sampler_count = key.sampler_count;
brw->state.dirty.cache |= CACHE_NEW_SAMPLER;
}
dri_bo_unreference(brw->wm.sampler_bo);
brw->wm.sampler_bo = NULL;
if (brw->wm.sampler_count == 0)
return;
brw->wm.sampler_bo = brw_search_cache(&brw->cache, BRW_SAMPLER,
&key, sizeof(key),
brw->wm.sdc_bo, key.sampler_count,
NULL);
/* If we didn't find it in the cache, compute the state and put it in the
* cache.
*/
if (brw->wm.sampler_bo == NULL) {
struct brw_sampler_state sampler[BRW_MAX_TEX_UNIT];
memset(sampler, 0, sizeof(sampler));
for (i = 0; i < key.sampler_count; i++) {
if (brw->wm.sdc_bo[i] == NULL)
continue;
brw_update_sampler_state(&key.sampler[i], brw->wm.sdc_bo[i],
&sampler[i]);
}
brw->wm.sampler_bo = brw_upload_cache(&brw->cache, BRW_SAMPLER,
&key, sizeof(key),
brw->wm.sdc_bo, key.sampler_count,
&sampler, sizeof(sampler),
NULL, NULL);
/* Emit SDC relocations */
for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
if (!ctx->Texture.Unit[i]._ReallyEnabled)
continue;
dri_bo_emit_reloc(brw->wm.sampler_bo,
I915_GEM_DOMAIN_SAMPLER, 0,
0,
i * sizeof(struct brw_sampler_state) +
offsetof(struct brw_sampler_state, ss2),
brw->wm.sdc_bo[i]);
}
}
}
const struct brw_tracked_state brw_wm_samplers = {
.dirty = {
.mesa = _NEW_TEXTURE,
.brw = 0,
.cache = 0
},
.prepare = upload_wm_samplers,
};

View File

@ -0,0 +1,317 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "brw_context.h"
#include "brw_state.h"
#include "brw_defines.h"
#include "brw_wm.h"
/***********************************************************************
* WM unit - fragment programs and rasterization
*/
struct brw_wm_unit_key {
unsigned int total_grf, total_scratch;
unsigned int urb_entry_read_length;
unsigned int curb_entry_read_length;
unsigned int dispatch_grf_start_reg;
unsigned int curbe_offset;
unsigned int urb_size;
unsigned int max_threads;
unsigned int nr_surfaces, sampler_count;
GLboolean uses_depth, computes_depth, uses_kill, is_glsl;
GLboolean polygon_stipple, stats_wm, line_stipple, offset_enable;
GLfloat offset_units, offset_factor;
};
static void
wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
{
GLcontext *ctx = &brw->intel.ctx;
const struct gl_fragment_program *fp = brw->fragment_program;
const struct brw_fragment_program *bfp = (struct brw_fragment_program *) fp;
struct intel_context *intel = &brw->intel;
memset(key, 0, sizeof(*key));
if (INTEL_DEBUG & DEBUG_SINGLE_THREAD)
key->max_threads = 1;
else {
/* WM maximum threads is the number of EUs times the number of threads per EU. */
if (BRW_IS_IGDNG(brw))
key->max_threads = 12 * 6;
else if (BRW_IS_G4X(brw))
key->max_threads = 10 * 5;
else
key->max_threads = 8 * 4;
}
/* CACHE_NEW_WM_PROG */
key->total_grf = brw->wm.prog_data->total_grf;
key->urb_entry_read_length = brw->wm.prog_data->urb_read_length;
key->curb_entry_read_length = brw->wm.prog_data->curb_read_length;
key->dispatch_grf_start_reg = brw->wm.prog_data->first_curbe_grf;
key->total_scratch = ALIGN(brw->wm.prog_data->total_scratch, 1024);
/* BRW_NEW_URB_FENCE */
key->urb_size = brw->urb.vsize;
/* BRW_NEW_CURBE_OFFSETS */
key->curbe_offset = brw->curbe.wm_start;
/* BRW_NEW_NR_SURFACEs */
key->nr_surfaces = brw->wm.nr_surfaces;
/* CACHE_NEW_SAMPLER */
key->sampler_count = brw->wm.sampler_count;
/* _NEW_POLYGONSTIPPLE */
key->polygon_stipple = ctx->Polygon.StippleFlag;
/* BRW_NEW_FRAGMENT_PROGRAM */
key->uses_depth = (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
/* as far as we can tell */
key->computes_depth =
(fp->Base.OutputsWritten & (1 << FRAG_RESULT_DEPTH)) != 0;
/* BRW_NEW_DEPTH_BUFFER
* Override for NULL depthbuffer case, required by the Pixel Shader Computed
* Depth field.
*/
if (brw->state.depth_region == NULL)
key->computes_depth = 0;
/* _NEW_COLOR */
key->uses_kill = fp->UsesKill || ctx->Color.AlphaEnabled;
key->is_glsl = bfp->isGLSL;
/* temporary sanity check assertion */
ASSERT(bfp->isGLSL == brw_wm_is_glsl(fp));
/* _NEW_DEPTH */
key->stats_wm = intel->stats_wm;
/* _NEW_LINE */
key->line_stipple = ctx->Line.StippleFlag;
/* _NEW_POLYGON */
key->offset_enable = ctx->Polygon.OffsetFill;
key->offset_units = ctx->Polygon.OffsetUnits;
key->offset_factor = ctx->Polygon.OffsetFactor;
}
/**
* Setup wm hardware state. See page 225 of Volume 2
*/
static dri_bo *
wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
dri_bo **reloc_bufs)
{
struct brw_wm_unit_state wm;
dri_bo *bo;
memset(&wm, 0, sizeof(wm));
wm.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1;
wm.thread0.kernel_start_pointer = brw->wm.prog_bo->offset >> 6; /* reloc */
wm.thread1.depth_coef_urb_read_offset = 1;
wm.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
if (BRW_IS_IGDNG(brw))
wm.thread1.binding_table_entry_count = 0; /* hardware requirement */
else
wm.thread1.binding_table_entry_count = key->nr_surfaces;
if (key->total_scratch != 0) {
wm.thread2.scratch_space_base_pointer =
brw->wm.scratch_bo->offset >> 10; /* reloc */
wm.thread2.per_thread_scratch_space = key->total_scratch / 1024 - 1;
} else {
wm.thread2.scratch_space_base_pointer = 0;
wm.thread2.per_thread_scratch_space = 0;
}
wm.thread3.dispatch_grf_start_reg = key->dispatch_grf_start_reg;
wm.thread3.urb_entry_read_length = key->urb_entry_read_length;
wm.thread3.urb_entry_read_offset = 0;
wm.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
wm.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
if (BRW_IS_IGDNG(brw))
wm.wm4.sampler_count = 0; /* hardware requirement */
else
wm.wm4.sampler_count = (key->sampler_count + 1) / 4;
if (brw->wm.sampler_bo != NULL) {
/* reloc */
wm.wm4.sampler_state_pointer = brw->wm.sampler_bo->offset >> 5;
} else {
wm.wm4.sampler_state_pointer = 0;
}
wm.wm5.program_uses_depth = key->uses_depth;
wm.wm5.program_computes_depth = key->computes_depth;
wm.wm5.program_uses_killpixel = key->uses_kill;
if (key->is_glsl)
wm.wm5.enable_8_pix = 1;
else
wm.wm5.enable_16_pix = 1;
wm.wm5.max_threads = key->max_threads - 1;
wm.wm5.thread_dispatch_enable = 1; /* AKA: color_write */
wm.wm5.legacy_line_rast = 0;
wm.wm5.legacy_global_depth_bias = 0;
wm.wm5.early_depth_test = 1; /* never need to disable */
wm.wm5.line_aa_region_width = 0;
wm.wm5.line_endcap_aa_region_width = 1;
wm.wm5.polygon_stipple = key->polygon_stipple;
if (key->offset_enable) {
wm.wm5.depth_offset = 1;
/* Something weird is going on with legacy_global_depth_bias,
* offset_constant, scaling and MRD. This value passes glean
* but gives some odd results elsewhere (e.g. the
* quad-offset-units test).
*/
wm.global_depth_offset_constant = key->offset_units * 2;
/* This is the only value that passes glean:
*/
wm.global_depth_offset_scale = key->offset_factor;
}
wm.wm5.line_stipple = key->line_stipple;
if (INTEL_DEBUG & DEBUG_STATS || key->stats_wm)
wm.wm4.stats_enable = 1;
bo = brw_upload_cache(&brw->cache, BRW_WM_UNIT,
key, sizeof(*key),
reloc_bufs, 3,
&wm, sizeof(wm),
NULL, NULL);
/* Emit WM program relocation */
dri_bo_emit_reloc(bo,
I915_GEM_DOMAIN_INSTRUCTION, 0,
wm.thread0.grf_reg_count << 1,
offsetof(struct brw_wm_unit_state, thread0),
brw->wm.prog_bo);
/* Emit scratch space relocation */
if (key->total_scratch != 0) {
dri_bo_emit_reloc(bo,
0, 0,
wm.thread2.per_thread_scratch_space,
offsetof(struct brw_wm_unit_state, thread2),
brw->wm.scratch_bo);
}
/* Emit sampler state relocation */
if (key->sampler_count != 0) {
dri_bo_emit_reloc(bo,
I915_GEM_DOMAIN_INSTRUCTION, 0,
wm.wm4.stats_enable | (wm.wm4.sampler_count << 2),
offsetof(struct brw_wm_unit_state, wm4),
brw->wm.sampler_bo);
}
return bo;
}
static void upload_wm_unit( struct brw_context *brw )
{
struct intel_context *intel = &brw->intel;
struct brw_wm_unit_key key;
dri_bo *reloc_bufs[3];
wm_unit_populate_key(brw, &key);
/* Allocate the necessary scratch space if we haven't already. Don't
* bother reducing the allocation later, since we use scratch so
* rarely.
*/
assert(key.total_scratch <= 12 * 1024);
if (key.total_scratch) {
GLuint total = key.total_scratch * key.max_threads;
if (brw->wm.scratch_bo && total > brw->wm.scratch_bo->size) {
dri_bo_unreference(brw->wm.scratch_bo);
brw->wm.scratch_bo = NULL;
}
if (brw->wm.scratch_bo == NULL) {
brw->wm.scratch_bo = dri_bo_alloc(intel->bufmgr,
"wm scratch",
total,
4096);
}
}
reloc_bufs[0] = brw->wm.prog_bo;
reloc_bufs[1] = brw->wm.scratch_bo;
reloc_bufs[2] = brw->wm.sampler_bo;
dri_bo_unreference(brw->wm.state_bo);
brw->wm.state_bo = brw_search_cache(&brw->cache, BRW_WM_UNIT,
&key, sizeof(key),
reloc_bufs, 3,
NULL);
if (brw->wm.state_bo == NULL) {
brw->wm.state_bo = wm_unit_create_from_key(brw, &key, reloc_bufs);
}
}
const struct brw_tracked_state brw_wm_unit = {
.dirty = {
.mesa = (_NEW_POLYGON |
_NEW_POLYGONSTIPPLE |
_NEW_LINE |
_NEW_COLOR |
_NEW_DEPTH),
.brw = (BRW_NEW_FRAGMENT_PROGRAM |
BRW_NEW_CURBE_OFFSETS |
BRW_NEW_DEPTH_BUFFER |
BRW_NEW_NR_WM_SURFACES),
.cache = (CACHE_NEW_WM_PROG |
CACHE_NEW_SAMPLER)
},
.prepare = upload_wm_unit,
};

View File

@ -0,0 +1,752 @@
/*
Copyright (C) Intel Corp. 2006. All Rights Reserved.
Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
develop this 3D driver.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**********************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/
#include "main/mtypes.h"
#include "main/texformat.h"
#include "main/texstore.h"
#include "shader/prog_parameter.h"
#include "intel_mipmap_tree.h"
#include "intel_batchbuffer.h"
#include "intel_tex.h"
#include "intel_fbo.h"
#include "brw_context.h"
#include "brw_state.h"
#include "brw_defines.h"
static GLuint translate_tex_target( GLenum target )
{
switch (target) {
case GL_TEXTURE_1D:
return BRW_SURFACE_1D;
case GL_TEXTURE_RECTANGLE_NV:
return BRW_SURFACE_2D;
case GL_TEXTURE_2D:
return BRW_SURFACE_2D;
case GL_TEXTURE_3D:
return BRW_SURFACE_3D;
case GL_TEXTURE_CUBE_MAP:
return BRW_SURFACE_CUBE;
default:
assert(0);
return 0;
}
}
static GLuint translate_tex_format( GLuint mesa_format, GLenum internal_format,
GLenum depth_mode )
{
switch( mesa_format ) {
case MESA_FORMAT_L8:
return BRW_SURFACEFORMAT_L8_UNORM;
case MESA_FORMAT_I8:
return BRW_SURFACEFORMAT_I8_UNORM;
case MESA_FORMAT_A8:
return BRW_SURFACEFORMAT_A8_UNORM;
case MESA_FORMAT_AL88:
return BRW_SURFACEFORMAT_L8A8_UNORM;
case MESA_FORMAT_RGB888:
assert(0); /* not supported for sampling */
return BRW_SURFACEFORMAT_R8G8B8_UNORM;
case MESA_FORMAT_ARGB8888:
if (internal_format == GL_RGB)
return BRW_SURFACEFORMAT_B8G8R8X8_UNORM;
else
return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
case MESA_FORMAT_RGBA8888_REV:
if (internal_format == GL_RGB)
return BRW_SURFACEFORMAT_R8G8B8X8_UNORM;
else
return BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
case MESA_FORMAT_RGB565:
return BRW_SURFACEFORMAT_B5G6R5_UNORM;
case MESA_FORMAT_ARGB1555:
return BRW_SURFACEFORMAT_B5G5R5A1_UNORM;
case MESA_FORMAT_ARGB4444:
return BRW_SURFACEFORMAT_B4G4R4A4_UNORM;
case MESA_FORMAT_YCBCR_REV:
return BRW_SURFACEFORMAT_YCRCB_NORMAL;
case MESA_FORMAT_YCBCR:
return BRW_SURFACEFORMAT_YCRCB_SWAPUVY;
case MESA_FORMAT_RGB_FXT1:
case MESA_FORMAT_RGBA_FXT1:
return BRW_SURFACEFORMAT_FXT1;
case MESA_FORMAT_Z16:
if (depth_mode == GL_INTENSITY)
return BRW_SURFACEFORMAT_I16_UNORM;
else if (depth_mode == GL_ALPHA)
return BRW_SURFACEFORMAT_A16_UNORM;
else
return BRW_SURFACEFORMAT_L16_UNORM;
case MESA_FORMAT_RGB_DXT1:
return BRW_SURFACEFORMAT_DXT1_RGB;
case MESA_FORMAT_RGBA_DXT1:
return BRW_SURFACEFORMAT_BC1_UNORM;
case MESA_FORMAT_RGBA_DXT3:
return BRW_SURFACEFORMAT_BC2_UNORM;
case MESA_FORMAT_RGBA_DXT5:
return BRW_SURFACEFORMAT_BC3_UNORM;
case MESA_FORMAT_SARGB8:
return BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB;
case MESA_FORMAT_SLA8:
return BRW_SURFACEFORMAT_L8A8_UNORM_SRGB;
case MESA_FORMAT_SL8:
return BRW_SURFACEFORMAT_L8_UNORM_SRGB;
case MESA_FORMAT_SRGB_DXT1:
return BRW_SURFACEFORMAT_BC1_UNORM_SRGB;
case MESA_FORMAT_S8_Z24:
/* XXX: these different surface formats don't seem to
* make any difference for shadow sampler/compares.
*/
if (depth_mode == GL_INTENSITY)
return BRW_SURFACEFORMAT_I24X8_UNORM;
else if (depth_mode == GL_ALPHA)
return BRW_SURFACEFORMAT_A24X8_UNORM;
else
return BRW_SURFACEFORMAT_L24X8_UNORM;
case MESA_FORMAT_DUDV8:
return BRW_SURFACEFORMAT_R8G8_SNORM;
case MESA_FORMAT_SIGNED_RGBA8888_REV:
return BRW_SURFACEFORMAT_R8G8B8A8_SNORM;
default:
assert(0);
return 0;
}
}
static void
brw_set_surface_tiling(struct brw_surface_state *surf, uint32_t tiling)
{
switch (tiling) {
case I915_TILING_NONE:
surf->ss3.tiled_surface = 0;
surf->ss3.tile_walk = 0;
break;
case I915_TILING_X:
surf->ss3.tiled_surface = 1;
surf->ss3.tile_walk = BRW_TILEWALK_XMAJOR;
break;
case I915_TILING_Y:
surf->ss3.tiled_surface = 1;
surf->ss3.tile_walk = BRW_TILEWALK_YMAJOR;
break;
}
}
static dri_bo *
brw_create_texture_surface( struct brw_context *brw,
struct brw_surface_key *key )
{
struct brw_surface_state surf;
dri_bo *bo;
memset(&surf, 0, sizeof(surf));
surf.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
surf.ss0.surface_type = translate_tex_target(key->target);
if (key->bo) {
surf.ss0.surface_format = translate_tex_format(key->format,
key->internal_format,
key->depthmode);
}
else {
switch (key->depth) {
case 32:
surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
break;
default:
case 24:
surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8X8_UNORM;
break;
case 16:
surf.ss0.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
break;
}
}
/* This is ok for all textures with channel width 8bit or less:
*/
/* surf.ss0.data_return_format = BRW_SURFACERETURNFORMAT_S1; */
if (key->bo)
surf.ss1.base_addr = key->bo->offset; /* reloc */
else
surf.ss1.base_addr = key->offset;
surf.ss2.mip_count = key->last_level - key->first_level;
surf.ss2.width = key->width - 1;
surf.ss2.height = key->height - 1;
brw_set_surface_tiling(&surf, key->tiling);
surf.ss3.pitch = (key->pitch * key->cpp) - 1;
surf.ss3.depth = key->depth - 1;
surf.ss4.min_lod = 0;
if (key->target == GL_TEXTURE_CUBE_MAP) {
surf.ss0.cube_pos_x = 1;
surf.ss0.cube_pos_y = 1;
surf.ss0.cube_pos_z = 1;
surf.ss0.cube_neg_x = 1;
surf.ss0.cube_neg_y = 1;
surf.ss0.cube_neg_z = 1;
}
bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
key, sizeof(*key),
&key->bo, key->bo ? 1 : 0,
&surf, sizeof(surf),
NULL, NULL);
if (key->bo) {
/* Emit relocation to surface contents */
dri_bo_emit_reloc(bo,
I915_GEM_DOMAIN_SAMPLER, 0,
0,
offsetof(struct brw_surface_state, ss1),
key->bo);
}
return bo;
}
static void
brw_update_texture_surface( GLcontext *ctx, GLuint unit )
{
struct brw_context *brw = brw_context(ctx);
struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
struct intel_texture_object *intelObj = intel_texture_object(tObj);
struct gl_texture_image *firstImage = tObj->Image[0][intelObj->firstLevel];
struct brw_surface_key key;
const GLuint surf = SURF_INDEX_TEXTURE(unit);
memset(&key, 0, sizeof(key));
if (intelObj->imageOverride) {
key.pitch = intelObj->pitchOverride / intelObj->mt->cpp;
key.depth = intelObj->depthOverride;
key.bo = NULL;
key.offset = intelObj->textureOffset;
} else {
key.format = firstImage->TexFormat->MesaFormat;
key.internal_format = firstImage->InternalFormat;
key.pitch = intelObj->mt->pitch;
key.depth = firstImage->Depth;
key.bo = intelObj->mt->region->buffer;
key.offset = 0;
}
key.target = tObj->Target;
key.depthmode = tObj->DepthMode;
key.first_level = intelObj->firstLevel;
key.last_level = intelObj->lastLevel;
key.width = firstImage->Width;
key.height = firstImage->Height;
key.cpp = intelObj->mt->cpp;
key.tiling = intelObj->mt->region->tiling;
dri_bo_unreference(brw->wm.surf_bo[surf]);
brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
BRW_SS_SURFACE,
&key, sizeof(key),
&key.bo, key.bo ? 1 : 0,
NULL);
if (brw->wm.surf_bo[surf] == NULL) {
brw->wm.surf_bo[surf] = brw_create_texture_surface(brw, &key);
}
}
/**
* Create the constant buffer surface. Vertex/fragment shader constants will be
* read from this buffer with Data Port Read instructions/messages.
*/
dri_bo *
brw_create_constant_surface( struct brw_context *brw,
struct brw_surface_key *key )
{
const GLint w = key->width - 1;
struct brw_surface_state surf;
dri_bo *bo;
memset(&surf, 0, sizeof(surf));
surf.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
surf.ss0.surface_type = BRW_SURFACE_BUFFER;
surf.ss0.surface_format = BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
assert(key->bo);
if (key->bo)
surf.ss1.base_addr = key->bo->offset; /* reloc */
else
surf.ss1.base_addr = key->offset;
surf.ss2.width = w & 0x7f; /* bits 6:0 of size or width */
surf.ss2.height = (w >> 7) & 0x1fff; /* bits 19:7 of size or width */
surf.ss3.depth = (w >> 20) & 0x7f; /* bits 26:20 of size or width */
surf.ss3.pitch = (key->pitch * key->cpp) - 1; /* ignored?? */
brw_set_surface_tiling(&surf, key->tiling); /* tiling now allowed */
bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
key, sizeof(*key),
&key->bo, key->bo ? 1 : 0,
&surf, sizeof(surf),
NULL, NULL);
if (key->bo) {
/* Emit relocation to surface contents */
dri_bo_emit_reloc(bo,
I915_GEM_DOMAIN_SAMPLER, 0,
0,
offsetof(struct brw_surface_state, ss1),
key->bo);
}
return bo;
}
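/* Worked example (hypothetical size, for illustration only): a constant
* buffer holding 256 vec4 elements gives key->width == 256, so w == 255.
* The packing above splits that value across the surface state fields as
* ss2.width = 255 & 0x7f = 127 (bits 6:0),
* ss2.height = (255 >> 7) & 0x1fff = 1 (bits 19:7),
* ss3.depth = (255 >> 20) & 0x7f = 0 (bits 26:20),
* i.e. the element count minus one, as BRW_SURFACE_BUFFER expects.
*/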
/* Creates a new WM constant buffer reflecting the current fragment program's
* constants, if needed by the fragment program.
*
* Otherwise, constants go through the CURBEs using the brw_constant_buffer
* state atom.
*/
static drm_intel_bo *
brw_wm_update_constant_buffer(struct brw_context *brw)
{
struct intel_context *intel = &brw->intel;
struct brw_fragment_program *fp =
(struct brw_fragment_program *) brw->fragment_program;
const struct gl_program_parameter_list *params = fp->program.Base.Parameters;
const int size = params->NumParameters * 4 * sizeof(GLfloat);
drm_intel_bo *const_buffer;
/* BRW_NEW_FRAGMENT_PROGRAM */
if (!fp->use_const_buffer)
return NULL;
const_buffer = drm_intel_bo_alloc(intel->bufmgr, "fp_const_buffer",
size, 64);
/* _NEW_PROGRAM_CONSTANTS */
dri_bo_subdata(const_buffer, 0, size, params->ParameterValues);
return const_buffer;
}
/**
* Update the surface state for a WM constant buffer.
* The constant buffer will be (re)allocated here if needed.
*/
static void
brw_update_wm_constant_surface( GLcontext *ctx,
GLuint surf)
{
struct brw_context *brw = brw_context(ctx);
struct brw_surface_key key;
struct brw_fragment_program *fp =
(struct brw_fragment_program *) brw->fragment_program;
const struct gl_program_parameter_list *params =
fp->program.Base.Parameters;
/* If we're in this state update atom, we need to update WM constants, so
* free the old buffer and create a new one for the new contents.
*/
dri_bo_unreference(fp->const_buffer);
fp->const_buffer = brw_wm_update_constant_buffer(brw);
/* If there's no constant buffer, then no surface BO is needed to point at
* it.
*/
if (fp->const_buffer == 0) {
drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
brw->wm.surf_bo[surf] = NULL;
return;
}
memset(&key, 0, sizeof(key));
key.format = MESA_FORMAT_RGBA_FLOAT32;
key.internal_format = GL_RGBA;
key.bo = fp->const_buffer;
key.depthmode = GL_NONE;
key.pitch = params->NumParameters;
key.width = params->NumParameters;
key.height = 1;
key.depth = 1;
key.cpp = 16;
/*
printf("%s:\n", __FUNCTION__);
printf(" width %d height %d depth %d cpp %d pitch %d\n",
key.width, key.height, key.depth, key.cpp, key.pitch);
*/
dri_bo_unreference(brw->wm.surf_bo[surf]);
brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
BRW_SS_SURFACE,
&key, sizeof(key),
&key.bo, key.bo ? 1 : 0,
NULL);
if (brw->wm.surf_bo[surf] == NULL) {
brw->wm.surf_bo[surf] = brw_create_constant_surface(brw, &key);
}
brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
}
/**
* Updates surface / buffer for fragment shader constant buffer, if
* one is required.
*
* This consumes the state updates for the constant buffer, and produces
* BRW_NEW_WM_SURFACES to get picked up by brw_prepare_wm_surfaces for
* inclusion in the binding table.
*/
static void prepare_wm_constant_surface(struct brw_context *brw )
{
GLcontext *ctx = &brw->intel.ctx;
struct brw_fragment_program *fp =
(struct brw_fragment_program *) brw->fragment_program;
GLuint surf = SURF_INDEX_FRAG_CONST_BUFFER;
drm_intel_bo_unreference(fp->const_buffer);
fp->const_buffer = brw_wm_update_constant_buffer(brw);
/* If there's no constant buffer, then no surface BO is needed to point at
* it.
*/
if (fp->const_buffer == 0) {
if (brw->wm.surf_bo[surf] != NULL) {
drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
brw->wm.surf_bo[surf] = NULL;
brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
}
return;
}
brw_update_wm_constant_surface(ctx, surf);
}
const struct brw_tracked_state brw_wm_constant_surface = {
.dirty = {
.mesa = (_NEW_PROGRAM_CONSTANTS),
.brw = (BRW_NEW_FRAGMENT_PROGRAM),
.cache = 0
},
.prepare = prepare_wm_constant_surface,
};
/**
* Sets up a surface state structure to point at the given region.
* While it is only used for the front/back buffer currently, it should be
* usable for further buffers when doing ARB_draw_buffers support.
*/
static void
brw_update_renderbuffer_surface(struct brw_context *brw,
struct gl_renderbuffer *rb,
unsigned int unit)
{
GLcontext *ctx = &brw->intel.ctx;
dri_bo *region_bo = NULL;
struct intel_renderbuffer *irb = intel_renderbuffer(rb);
struct intel_region *region = irb ? irb->region : NULL;
struct {
unsigned int surface_type;
unsigned int surface_format;
unsigned int width, height, pitch, cpp;
GLubyte color_mask[4];
GLboolean color_blend;
uint32_t tiling;
uint32_t draw_offset;
} key;
memset(&key, 0, sizeof(key));
if (region != NULL) {
region_bo = region->buffer;
key.surface_type = BRW_SURFACE_2D;
switch (irb->texformat->MesaFormat) {
case MESA_FORMAT_ARGB8888:
key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
break;
case MESA_FORMAT_RGB565:
key.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
break;
case MESA_FORMAT_ARGB1555:
key.surface_format = BRW_SURFACEFORMAT_B5G5R5A1_UNORM;
break;
case MESA_FORMAT_ARGB4444:
key.surface_format = BRW_SURFACEFORMAT_B4G4R4A4_UNORM;
break;
default:
_mesa_problem(ctx, "Bad renderbuffer format: %d\n",
irb->texformat->MesaFormat);
}
key.tiling = region->tiling;
if (brw->intel.intelScreen->driScrnPriv->dri2.enabled) {
key.width = rb->Width;
key.height = rb->Height;
} else {
key.width = region->width;
key.height = region->height;
}
key.pitch = region->pitch;
key.cpp = region->cpp;
key.draw_offset = region->draw_offset; /* cur 3d or cube face offset */
} else {
key.surface_type = BRW_SURFACE_NULL;
key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
key.tiling = I915_TILING_X;
key.width = 1;
key.height = 1;
key.cpp = 4;
key.draw_offset = 0;
}
memcpy(key.color_mask, ctx->Color.ColorMask,
sizeof(key.color_mask));
key.color_blend = (!ctx->Color._LogicOpEnabled &&
ctx->Color.BlendEnabled);
dri_bo_unreference(brw->wm.surf_bo[unit]);
brw->wm.surf_bo[unit] = brw_search_cache(&brw->surface_cache,
BRW_SS_SURFACE,
&key, sizeof(key),
&region_bo, 1,
NULL);
if (brw->wm.surf_bo[unit] == NULL) {
struct brw_surface_state surf;
memset(&surf, 0, sizeof(surf));
surf.ss0.surface_format = key.surface_format;
surf.ss0.surface_type = key.surface_type;
if (key.tiling == I915_TILING_NONE) {
surf.ss1.base_addr = key.draw_offset;
} else {
uint32_t tile_offset = key.draw_offset % 4096;
surf.ss1.base_addr = key.draw_offset - tile_offset;
assert(BRW_IS_G4X(brw) || tile_offset == 0);
if (BRW_IS_G4X(brw)) {
if (key.tiling == I915_TILING_X) {
/* Note that the low bits of these fields are missing, so
* there's the possibility of getting in trouble.
*/
surf.ss5.x_offset = (tile_offset % 512) / key.cpp / 4;
surf.ss5.y_offset = tile_offset / 512 / 2;
} else {
surf.ss5.x_offset = (tile_offset % 128) / key.cpp / 4;
surf.ss5.y_offset = tile_offset / 128 / 2;
}
}
}
if (region_bo != NULL)
surf.ss1.base_addr += region_bo->offset; /* reloc */
surf.ss2.width = key.width - 1;
surf.ss2.height = key.height - 1;
brw_set_surface_tiling(&surf, key.tiling);
surf.ss3.pitch = (key.pitch * key.cpp) - 1;
/* _NEW_COLOR */
surf.ss0.color_blend = key.color_blend;
surf.ss0.writedisable_red = !key.color_mask[0];
surf.ss0.writedisable_green = !key.color_mask[1];
surf.ss0.writedisable_blue = !key.color_mask[2];
surf.ss0.writedisable_alpha = !key.color_mask[3];
/* The key size here will never match the key size used for textures, so we're safe. */
brw->wm.surf_bo[unit] = brw_upload_cache(&brw->surface_cache,
BRW_SS_SURFACE,
&key, sizeof(key),
&region_bo, 1,
&surf, sizeof(surf),
NULL, NULL);
if (region_bo != NULL) {
/* We might sample from it, and we might render to it, so flag
* them both. We might be able to figure out from other state
* a more restrictive relocation to emit.
*/
drm_intel_bo_emit_reloc(brw->wm.surf_bo[unit],
offsetof(struct brw_surface_state, ss1),
region_bo,
surf.ss1.base_addr - region_bo->offset,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER);
}
}
}
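/* Worked example (hypothetical offsets, for illustration only): with a
* 32bpp X-tiled region (key.cpp == 4) and key.draw_offset == 5152, the
* code above computes tile_offset = 5152 % 4096 = 1056 and bases ss1 at
* draw_offset - tile_offset = 4096. On G4X the remainder is expressed
* through ss5: x_offset = (1056 % 512) / 4 / 4 = 2 and
* y_offset = 1056 / 512 / 2 = 1, i.e. 8 pixels right and 2 rows down
* within the tile, since the fields are in units of 4 pixels and 2 rows.
*/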
/**
* Constructs the binding table for the WM surface state, which maps unit
* numbers to surface state objects.
*/
static dri_bo *
brw_wm_get_binding_table(struct brw_context *brw)
{
dri_bo *bind_bo;
assert(brw->wm.nr_surfaces <= BRW_WM_MAX_SURF);
bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
NULL, 0,
brw->wm.surf_bo, brw->wm.nr_surfaces,
NULL);
if (bind_bo == NULL) {
GLuint data_size = brw->wm.nr_surfaces * sizeof(GLuint);
uint32_t data[BRW_WM_MAX_SURF];
int i;
for (i = 0; i < brw->wm.nr_surfaces; i++)
if (brw->wm.surf_bo[i])
data[i] = brw->wm.surf_bo[i]->offset;
else
data[i] = 0;
bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
NULL, 0,
brw->wm.surf_bo, brw->wm.nr_surfaces,
data, data_size,
NULL, NULL);
/* Emit binding table relocations to surface state */
for (i = 0; i < BRW_WM_MAX_SURF; i++) {
if (brw->wm.surf_bo[i] != NULL) {
dri_bo_emit_reloc(bind_bo,
I915_GEM_DOMAIN_INSTRUCTION, 0,
0,
i * sizeof(GLuint),
brw->wm.surf_bo[i]);
}
}
}
return bind_bo;
}
static void prepare_wm_surfaces(struct brw_context *brw )
{
GLcontext *ctx = &brw->intel.ctx;
GLuint i;
int old_nr_surfaces;
/* _NEW_BUFFERS */
/* Update surfaces for drawing buffers */
if (ctx->DrawBuffer->_NumColorDrawBuffers >= 1) {
for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
brw_update_renderbuffer_surface(brw,
ctx->DrawBuffer->_ColorDrawBuffers[i],
i);
}
} else {
brw_update_renderbuffer_surface(brw, NULL, 0);
}
old_nr_surfaces = brw->wm.nr_surfaces;
brw->wm.nr_surfaces = MAX_DRAW_BUFFERS;
if (brw->wm.surf_bo[SURF_INDEX_FRAG_CONST_BUFFER] != NULL)
brw->wm.nr_surfaces = SURF_INDEX_FRAG_CONST_BUFFER + 1;
/* Update surfaces for textures */
for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i];
const GLuint surf = SURF_INDEX_TEXTURE(i);
/* _NEW_TEXTURE, BRW_NEW_TEXDATA */
if (texUnit->_ReallyEnabled) {
brw_update_texture_surface(ctx, i);
brw->wm.nr_surfaces = surf + 1;
} else {
dri_bo_unreference(brw->wm.surf_bo[surf]);
brw->wm.surf_bo[surf] = NULL;
}
}
dri_bo_unreference(brw->wm.bind_bo);
brw->wm.bind_bo = brw_wm_get_binding_table(brw);
if (brw->wm.nr_surfaces != old_nr_surfaces)
brw->state.dirty.brw |= BRW_NEW_NR_WM_SURFACES;
}
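/* Binding table layout implied by the code above (indices are symbolic;
* the concrete values of SURF_INDEX_FRAG_CONST_BUFFER and
* SURF_INDEX_TEXTURE() are defined elsewhere in the driver):
*
* [0 .. MAX_DRAW_BUFFERS-1] color/renderbuffer surfaces
* [SURF_INDEX_FRAG_CONST_BUFFER] fragment constant buffer, if present
* [SURF_INDEX_TEXTURE(0) ..] one surface per enabled texture unit
*
* nr_surfaces ends up one past the highest index in use, and
* BRW_NEW_NR_WM_SURFACES is flagged whenever that count changes so that
* dependent state such as the WM unit gets re-uploaded.
*/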
const struct brw_tracked_state brw_wm_surfaces = {
.dirty = {
.mesa = (_NEW_COLOR |
_NEW_TEXTURE |
_NEW_BUFFERS),
.brw = (BRW_NEW_CONTEXT |
BRW_NEW_WM_SURFACES),
.cache = 0
},
.prepare = prepare_wm_surfaces,
};

View File

@ -0,0 +1,184 @@
#ifndef INTEL_BATCHBUFFER_H
#define INTEL_BATCHBUFFER_H
#include "main/mtypes.h"
#include "intel_context.h"
#include "intel_bufmgr.h"
#include "intel_reg.h"
#define BATCH_SZ 16384
#define BATCH_RESERVED 16
enum cliprect_mode {
/**
* Batchbuffer contents may be looped over per cliprect, but do not
* require it.
*/
IGNORE_CLIPRECTS,
/**
* Batchbuffer contents require looping over per cliprect at batch submit
* time.
*
* This will be upgraded to NO_LOOP_CLIPRECTS when there's a single
* constant cliprect, as in DRI2 or FBO rendering.
*/
LOOP_CLIPRECTS,
/**
* Batchbuffer contents contain drawing that should not be executed multiple
* times.
*/
NO_LOOP_CLIPRECTS,
/**
* Batchbuffer contents contain drawing that already handles cliprects, such
* as 2D drawing to front/back/depth that doesn't respect DRAWING_RECTANGLE.
*
* Equivalent behavior to NO_LOOP_CLIPRECTS, but may not persist in batch
* outside of LOCK/UNLOCK. This is upgraded to just NO_LOOP_CLIPRECTS when
* there's a constant cliprect, as in DRI2 or FBO rendering.
*/
REFERENCES_CLIPRECTS
};
struct intel_batchbuffer
{
struct intel_context *intel;
dri_bo *buf;
GLubyte *buffer;
GLubyte *map;
GLubyte *ptr;
enum cliprect_mode cliprect_mode;
GLuint size;
/** Tracking of BEGIN_BATCH()/OUT_BATCH()/ADVANCE_BATCH() debugging */
struct {
GLuint total;
GLubyte *start_ptr;
} emit;
GLuint dirty_state;
};
struct intel_batchbuffer *intel_batchbuffer_alloc(struct intel_context
*intel);
void intel_batchbuffer_free(struct intel_batchbuffer *batch);
void _intel_batchbuffer_flush(struct intel_batchbuffer *batch,
const char *file, int line);
#define intel_batchbuffer_flush(batch) \
_intel_batchbuffer_flush(batch, __FILE__, __LINE__)
void intel_batchbuffer_reset(struct intel_batchbuffer *batch);
/* Unlike bmBufferData, this currently requires the buffer be mapped.
* Consider it a convenience function wrapping multiple
* intel_batchbuffer_emit_dword() calls.
*/
void intel_batchbuffer_data(struct intel_batchbuffer *batch,
const void *data, GLuint bytes,
enum cliprect_mode cliprect_mode);
void intel_batchbuffer_release_space(struct intel_batchbuffer *batch,
GLuint bytes);
GLboolean intel_batchbuffer_emit_reloc(struct intel_batchbuffer *batch,
dri_bo *buffer,
uint32_t read_domains,
uint32_t write_domain,
uint32_t offset);
/* Inline functions - might actually be better off with these
* non-inlined. Certainly better off switching all command packets to
* be passed as structs rather than dwords, but that's a little bit of
* work...
*/
static INLINE GLint
intel_batchbuffer_space(struct intel_batchbuffer *batch)
{
return (batch->size - BATCH_RESERVED) - (batch->ptr - batch->map);
}
static INLINE void
intel_batchbuffer_emit_dword(struct intel_batchbuffer *batch, GLuint dword)
{
assert(batch->map);
assert(intel_batchbuffer_space(batch) >= 4);
*(GLuint *) (batch->ptr) = dword;
batch->ptr += 4;
}
static INLINE void
intel_batchbuffer_require_space(struct intel_batchbuffer *batch,
GLuint sz,
enum cliprect_mode cliprect_mode)
{
assert(sz < batch->size - 8);
if (intel_batchbuffer_space(batch) < sz)
intel_batchbuffer_flush(batch);
if ((cliprect_mode == LOOP_CLIPRECTS ||
cliprect_mode == REFERENCES_CLIPRECTS) &&
batch->intel->constant_cliprect)
cliprect_mode = NO_LOOP_CLIPRECTS;
if (cliprect_mode != IGNORE_CLIPRECTS) {
if (batch->cliprect_mode == IGNORE_CLIPRECTS) {
batch->cliprect_mode = cliprect_mode;
} else {
if (batch->cliprect_mode != cliprect_mode) {
intel_batchbuffer_flush(batch);
batch->cliprect_mode = cliprect_mode;
}
}
}
}
/* Here are the crusty old macros, to be removed:
*/
#define BATCH_LOCALS
#define BEGIN_BATCH(n, cliprect_mode) do { \
intel_batchbuffer_require_space(intel->batch, (n)*4, cliprect_mode); \
assert(intel->batch->emit.start_ptr == NULL); \
intel->batch->emit.total = (n) * 4; \
intel->batch->emit.start_ptr = intel->batch->ptr; \
} while (0)
#define OUT_BATCH(d) intel_batchbuffer_emit_dword(intel->batch, d)
#define OUT_RELOC(buf, read_domains, write_domain, delta) do { \
assert((unsigned) (delta) < buf->size); \
intel_batchbuffer_emit_reloc(intel->batch, buf, \
read_domains, write_domain, delta); \
} while (0)
#define ADVANCE_BATCH() do { \
unsigned int _n = intel->batch->ptr - intel->batch->emit.start_ptr; \
assert(intel->batch->emit.start_ptr != NULL); \
if (_n != intel->batch->emit.total) { \
fprintf(stderr, "ADVANCE_BATCH: %d of %d dwords emitted\n", \
_n, intel->batch->emit.total); \
abort(); \
} \
intel->batch->emit.start_ptr = NULL; \
} while(0)
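/* Usage sketch (hypothetical packet, for illustration only; assumes a
* "struct intel_context *intel" and a "dri_bo *bo" in scope, as the
* macros require):
*
* BEGIN_BATCH(3, IGNORE_CLIPRECTS);
* OUT_BATCH(MADE_UP_OPCODE_DWORD);
* OUT_BATCH(0);
* OUT_RELOC(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
* ADVANCE_BATCH();
*
* BEGIN_BATCH reserves space (flushing the batch first if necessary) and
* records the expected dword count; each OUT_* call emits one dword, and
* ADVANCE_BATCH aborts if a different number was emitted, which is how
* malformed packets get caught during development.
*/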
static INLINE void
intel_batchbuffer_emit_mi_flush(struct intel_batchbuffer *batch)
{
intel_batchbuffer_require_space(batch, 4, IGNORE_CLIPRECTS);
intel_batchbuffer_emit_dword(batch, MI_FLUSH);
}
#endif

View File

@ -0,0 +1,118 @@
/*
* Copyright © 2007 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Authors:
* Eric Anholt <eric@anholt.net>
*
*/
#define PCI_CHIP_I810 0x7121
#define PCI_CHIP_I810_DC100 0x7123
#define PCI_CHIP_I810_E 0x7125
#define PCI_CHIP_I815 0x1132
#define PCI_CHIP_I830_M 0x3577
#define PCI_CHIP_845_G 0x2562
#define PCI_CHIP_I855_GM 0x3582
#define PCI_CHIP_I865_G 0x2572
#define PCI_CHIP_I915_G 0x2582
#define PCI_CHIP_E7221_G 0x258A
#define PCI_CHIP_I915_GM 0x2592
#define PCI_CHIP_I945_G 0x2772
#define PCI_CHIP_I945_GM 0x27A2
#define PCI_CHIP_I945_GME 0x27AE
#define PCI_CHIP_Q35_G 0x29B2
#define PCI_CHIP_G33_G 0x29C2
#define PCI_CHIP_Q33_G 0x29D2
#define PCI_CHIP_IGD_GM 0xA011
#define PCI_CHIP_IGD_G 0xA001
#define IS_IGDGM(devid) (devid == PCI_CHIP_IGD_GM)
#define IS_IGDG(devid) (devid == PCI_CHIP_IGD_G)
#define IS_IGD(devid) (IS_IGDG(devid) || IS_IGDGM(devid))
#define PCI_CHIP_I965_G 0x29A2
#define PCI_CHIP_I965_Q 0x2992
#define PCI_CHIP_I965_G_1 0x2982
#define PCI_CHIP_I946_GZ 0x2972
#define PCI_CHIP_I965_GM 0x2A02
#define PCI_CHIP_I965_GME 0x2A12
#define PCI_CHIP_GM45_GM 0x2A42
#define PCI_CHIP_IGD_E_G 0x2E02
#define PCI_CHIP_Q45_G 0x2E12
#define PCI_CHIP_G45_G 0x2E22
#define PCI_CHIP_G41_G 0x2E32
#define PCI_CHIP_B43_G 0x2E42
#define PCI_CHIP_ILD_G 0x0042
#define PCI_CHIP_ILM_G 0x0046
#define IS_MOBILE(devid) (devid == PCI_CHIP_I855_GM || \
devid == PCI_CHIP_I915_GM || \
devid == PCI_CHIP_I945_GM || \
devid == PCI_CHIP_I945_GME || \
devid == PCI_CHIP_I965_GM || \
devid == PCI_CHIP_I965_GME || \
devid == PCI_CHIP_GM45_GM || \
IS_IGD(devid) || \
devid == PCI_CHIP_ILM_G)
#define IS_G45(devid) (devid == PCI_CHIP_IGD_E_G || \
devid == PCI_CHIP_Q45_G || \
devid == PCI_CHIP_G45_G || \
devid == PCI_CHIP_G41_G || \
devid == PCI_CHIP_B43_G)
#define IS_GM45(devid) (devid == PCI_CHIP_GM45_GM)
#define IS_G4X(devid) (IS_G45(devid) || IS_GM45(devid))
#define IS_ILD(devid) (devid == PCI_CHIP_ILD_G)
#define IS_ILM(devid) (devid == PCI_CHIP_ILM_G)
#define IS_IGDNG(devid) (IS_ILD(devid) || IS_ILM(devid))
#define IS_915(devid) (devid == PCI_CHIP_I915_G || \
devid == PCI_CHIP_E7221_G || \
devid == PCI_CHIP_I915_GM)
#define IS_945(devid) (devid == PCI_CHIP_I945_G || \
devid == PCI_CHIP_I945_GM || \
devid == PCI_CHIP_I945_GME || \
devid == PCI_CHIP_G33_G || \
devid == PCI_CHIP_Q33_G || \
devid == PCI_CHIP_Q35_G || IS_IGD(devid))
#define IS_965(devid) (devid == PCI_CHIP_I965_G || \
devid == PCI_CHIP_I965_Q || \
devid == PCI_CHIP_I965_G_1 || \
devid == PCI_CHIP_I965_GM || \
devid == PCI_CHIP_I965_GME || \
devid == PCI_CHIP_I946_GZ || \
IS_G4X(devid) || \
IS_IGDNG(devid))
#define IS_9XX(devid) (IS_915(devid) || \
IS_945(devid) || \
IS_965(devid))

View File

@ -0,0 +1,132 @@
#ifndef INTEL_STRUCTS_H
#define INTEL_STRUCTS_H
struct br0 {
GLuint length:8;
GLuint pad0:3;
GLuint dst_tiled:1;
GLuint pad1:8;
GLuint write_rgb:1;
GLuint write_alpha:1;
GLuint opcode:7;
GLuint client:3;
};
struct br13 {
GLint dest_pitch:16;
GLuint rop:8;
GLuint color_depth:2;
GLuint pad1:3;
GLuint mono_source_transparency:1;
GLuint clipping_enable:1;
GLuint pad0:1;
};
/* This is an attempt to move some of the 2D interaction in this
* driver to using structs for packets rather than a bunch of #defines
* and dwords.
*/
struct xy_color_blit {
struct br0 br0;
struct br13 br13;
struct {
GLuint dest_x1:16;
GLuint dest_y1:16;
} dw2;
struct {
GLuint dest_x2:16;
GLuint dest_y2:16;
} dw3;
GLuint dest_base_addr;
GLuint color;
};
struct xy_src_copy_blit {
struct br0 br0;
struct br13 br13;
struct {
GLuint dest_x1:16;
GLuint dest_y1:16;
} dw2;
struct {
GLuint dest_x2:16;
GLuint dest_y2:16;
} dw3;
GLuint dest_base_addr;
struct {
GLuint src_x1:16;
GLuint src_y1:16;
} dw5;
struct {
GLint src_pitch:16;
GLuint pad:16;
} dw6;
GLuint src_base_addr;
};
struct xy_setup_blit {
struct br0 br0;
struct br13 br13;
struct {
GLuint clip_x1:16;
GLuint clip_y1:16;
} dw2;
struct {
GLuint clip_x2:16;
GLuint clip_y2:16;
} dw3;
GLuint dest_base_addr;
GLuint background_color;
GLuint foreground_color;
GLuint pattern_base_addr;
};
struct xy_text_immediate_blit {
struct {
GLuint length:8;
GLuint pad2:3;
GLuint dst_tiled:1;
GLuint pad1:4;
GLuint byte_packed:1;
GLuint pad0:5;
GLuint opcode:7;
GLuint client:3;
} dw0;
struct {
GLuint dest_x1:16;
GLuint dest_y1:16;
} dw1;
struct {
GLuint dest_x2:16;
GLuint dest_y2:16;
} dw2;
/* Src bitmap data follows as inline dwords.
*/
};
#define CLIENT_2D 0x2
#define OPCODE_XY_SETUP_BLT 0x1
#define OPCODE_XY_COLOR_BLT 0x50
#define OPCODE_XY_TEXT_IMMEDIATE_BLT 0x31
#endif

View File

@ -0,0 +1,225 @@
#include "intel_context.h"
#include "intel_tex.h"
#include "intel_chipset.h"
#include "main/texformat.h"
#include "main/enums.h"
/**
* Choose hardware texture format given the user's glTexImage parameters.
*
* It works out that this function is fine for all the supported
* hardware. However, there is still a need to map the formats onto
* hardware descriptors.
*
* Note that the i915 can actually support many more formats than
* these if we take the step of simply swizzling the colors
* immediately after sampling...
*/
const struct gl_texture_format *
intelChooseTextureFormat(GLcontext * ctx, GLint internalFormat,
GLenum format, GLenum type)
{
struct intel_context *intel = intel_context(ctx);
const GLboolean do32bpt = (intel->ctx.Visual.rgbBits >= 24);
#if 0
printf("%s intFmt=0x%x format=0x%x type=0x%x\n",
__FUNCTION__, internalFormat, format, type);
#endif
switch (internalFormat) {
case 4:
case GL_RGBA:
case GL_COMPRESSED_RGBA:
if (format == GL_BGRA) {
if (type == GL_UNSIGNED_BYTE || type == GL_UNSIGNED_INT_8_8_8_8_REV) {
return &_mesa_texformat_argb8888;
}
else if (type == GL_UNSIGNED_SHORT_4_4_4_4_REV) {
return &_mesa_texformat_argb4444;
}
else if (type == GL_UNSIGNED_SHORT_1_5_5_5_REV) {
return &_mesa_texformat_argb1555;
}
}
return do32bpt ? &_mesa_texformat_argb8888 : &_mesa_texformat_argb4444;
case 3:
case GL_RGB:
case GL_COMPRESSED_RGB:
if (format == GL_RGB && type == GL_UNSIGNED_SHORT_5_6_5) {
return &_mesa_texformat_rgb565;
}
return do32bpt ? &_mesa_texformat_argb8888 : &_mesa_texformat_rgb565;
case GL_RGBA8:
case GL_RGB10_A2:
case GL_RGBA12:
case GL_RGBA16:
return do32bpt ? &_mesa_texformat_argb8888 : &_mesa_texformat_argb4444;
case GL_RGBA4:
case GL_RGBA2:
return &_mesa_texformat_argb4444;
case GL_RGB5_A1:
return &_mesa_texformat_argb1555;
case GL_RGB8:
case GL_RGB10:
case GL_RGB12:
case GL_RGB16:
return &_mesa_texformat_argb8888;
case GL_RGB5:
case GL_RGB4:
case GL_R3_G3_B2:
return &_mesa_texformat_rgb565;
case GL_ALPHA:
case GL_ALPHA4:
case GL_ALPHA8:
case GL_ALPHA12:
case GL_ALPHA16:
case GL_COMPRESSED_ALPHA:
return &_mesa_texformat_a8;
case 1:
case GL_LUMINANCE:
case GL_LUMINANCE4:
case GL_LUMINANCE8:
case GL_LUMINANCE12:
case GL_LUMINANCE16:
case GL_COMPRESSED_LUMINANCE:
return &_mesa_texformat_l8;
case 2:
case GL_LUMINANCE_ALPHA:
case GL_LUMINANCE4_ALPHA4:
case GL_LUMINANCE6_ALPHA2:
case GL_LUMINANCE8_ALPHA8:
case GL_LUMINANCE12_ALPHA4:
case GL_LUMINANCE12_ALPHA12:
case GL_LUMINANCE16_ALPHA16:
case GL_COMPRESSED_LUMINANCE_ALPHA:
return &_mesa_texformat_al88;
case GL_INTENSITY:
case GL_INTENSITY4:
case GL_INTENSITY8:
case GL_INTENSITY12:
case GL_INTENSITY16:
case GL_COMPRESSED_INTENSITY:
return &_mesa_texformat_i8;
case GL_YCBCR_MESA:
if (type == GL_UNSIGNED_SHORT_8_8_MESA || type == GL_UNSIGNED_BYTE)
return &_mesa_texformat_ycbcr;
else
return &_mesa_texformat_ycbcr_rev;
case GL_COMPRESSED_RGB_FXT1_3DFX:
return &_mesa_texformat_rgb_fxt1;
case GL_COMPRESSED_RGBA_FXT1_3DFX:
return &_mesa_texformat_rgba_fxt1;
case GL_RGB_S3TC:
case GL_RGB4_S3TC:
case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
return &_mesa_texformat_rgb_dxt1;
case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
return &_mesa_texformat_rgba_dxt1;
case GL_RGBA_S3TC:
case GL_RGBA4_S3TC:
case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
return &_mesa_texformat_rgba_dxt3;
case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
return &_mesa_texformat_rgba_dxt5;
case GL_DEPTH_COMPONENT:
case GL_DEPTH_COMPONENT16:
case GL_DEPTH_COMPONENT24:
case GL_DEPTH_COMPONENT32:
#if 0
return &_mesa_texformat_z16;
#else
/* fall-through.
* 16bpp depth texture can't be paired with a stencil buffer so
* always use the combined depth/stencil format.
*/
#endif
case GL_DEPTH_STENCIL_EXT:
case GL_DEPTH24_STENCIL8_EXT:
return &_mesa_texformat_s8_z24;
#ifndef I915
case GL_SRGB_EXT:
case GL_SRGB8_EXT:
case GL_SRGB_ALPHA_EXT:
case GL_SRGB8_ALPHA8_EXT:
case GL_COMPRESSED_SRGB_EXT:
case GL_COMPRESSED_SRGB_ALPHA_EXT:
case GL_COMPRESSED_SLUMINANCE_EXT:
case GL_COMPRESSED_SLUMINANCE_ALPHA_EXT:
return &_mesa_texformat_sargb8;
case GL_SLUMINANCE_EXT:
case GL_SLUMINANCE8_EXT:
if (IS_G4X(intel->intelScreen->deviceID))
return &_mesa_texformat_sl8;
else
return &_mesa_texformat_sargb8;
case GL_SLUMINANCE_ALPHA_EXT:
case GL_SLUMINANCE8_ALPHA8_EXT:
if (IS_G4X(intel->intelScreen->deviceID))
return &_mesa_texformat_sla8;
else
return &_mesa_texformat_sargb8;
case GL_COMPRESSED_SRGB_S3TC_DXT1_EXT:
case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT:
case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT:
case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT:
return &_mesa_texformat_srgb_dxt1;
/* i915 could also do this */
case GL_DUDV_ATI:
case GL_DU8DV8_ATI:
return &_mesa_texformat_dudv8;
case GL_RGBA_SNORM:
case GL_RGBA8_SNORM:
return &_mesa_texformat_signed_rgba8888_rev;
#endif
default:
fprintf(stderr, "unexpected texture format %s in %s\n",
_mesa_lookup_enum_by_nr(internalFormat), __FUNCTION__);
return NULL;
}
return NULL; /* never get here */
}
int intel_compressed_num_bytes(GLuint mesaFormat)
{
int bytes = 0;
switch(mesaFormat) {
case MESA_FORMAT_RGB_FXT1:
case MESA_FORMAT_RGBA_FXT1:
case MESA_FORMAT_RGB_DXT1:
case MESA_FORMAT_RGBA_DXT1:
bytes = 2;
break;
case MESA_FORMAT_RGBA_DXT3:
case MESA_FORMAT_RGBA_DXT5:
bytes = 4;
break;
default:
break;
}
return bytes;
}

View File

@ -0,0 +1,140 @@
/**************************************************************************
*
* Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
* Michel Dänzer <michel@tungstengraphics.com>
*/
#include "intel_mipmap_tree.h"
#include "intel_tex_layout.h"
#include "intel_context.h"
#include "main/macros.h"
void intel_get_texture_alignment_unit(GLenum internalFormat, GLuint *w, GLuint *h)
{
switch (internalFormat) {
case GL_COMPRESSED_RGB_FXT1_3DFX:
case GL_COMPRESSED_RGBA_FXT1_3DFX:
*w = 8;
*h = 4;
break;
case GL_RGB_S3TC:
case GL_RGB4_S3TC:
case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
case GL_RGBA_S3TC:
case GL_RGBA4_S3TC:
case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
*w = 4;
*h = 4;
break;
default:
*w = 4;
*h = 2;
break;
}
}
void i945_miptree_layout_2d( struct intel_context *intel,
struct intel_mipmap_tree *mt,
uint32_t tiling )
{
GLuint align_h = 2, align_w = 4;
GLuint level;
GLuint x = 0;
GLuint y = 0;
GLuint width = mt->width0;
GLuint height = mt->height0;
mt->pitch = mt->width0;
intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h);
if (mt->compressed) {
mt->pitch = ALIGN(mt->width0, align_w);
}
/* May need to adjust pitch to accommodate the placement of
* the 2nd mipmap. This occurs when the alignment
* constraints of mipmap placement push the right edge of the
* 2nd mipmap out past the width of its parent.
*/
if (mt->first_level != mt->last_level) {
GLuint mip1_width;
if (mt->compressed) {
mip1_width = ALIGN(minify(mt->width0), align_w)
+ ALIGN(minify(minify(mt->width0)), align_w);
} else {
mip1_width = ALIGN(minify(mt->width0), align_w)
+ minify(minify(mt->width0));
}
if (mip1_width > mt->pitch) {
mt->pitch = mip1_width;
}
}
/* Pitch must be a whole number of dwords, even though we
* express it in texels.
*/
mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->pitch);
mt->total_height = 0;
for ( level = mt->first_level ; level <= mt->last_level ; level++ ) {
GLuint img_height;
intel_miptree_set_level_info(mt, level, 1, x, y, width,
height, 1);
if (mt->compressed)
img_height = MAX2(1, height/4);
else
img_height = ALIGN(height, align_h);
/* Because the images are packed better, the final offset
* might not be the maximal one:
*/
mt->total_height = MAX2(mt->total_height, y + img_height);
/* Layout_below: step right after second mipmap.
*/
if (level == mt->first_level + 1) {
x += ALIGN(width, align_w);
}
else {
y += img_height;
}
width = minify(width);
height = minify(height);
}
}
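/* Worked example (hypothetical texture, for illustration only): an
* uncompressed 8x8 map with four levels and the default 4x2 alignment
* unit lays out as follows under the loop above:
*
* level 0 at (0, 0) 8x8, img_height = 8
* level 1 at (0, 8) 4x4, img_height = 4 (x steps right afterwards)
* level 2 at (4, 8) 2x2, img_height = 2
* level 3 at (4, 10) 1x1, img_height = 2
*
* total_height ends up at 12 rows, and the pitch stays at width0 (8
* texels) because mip1_width = ALIGN(4, 4) + 2 = 6 does not exceed it;
* intel_miptree_pitch_align() then rounds the pitch up as needed.
*/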