updated Glide driver documentation

fixed SAL/SAR in assyntax.h (NASM); fixed a bug wrt NULL pointer assignment in t_vtx_api.c; cosmetics to t_vtx_x86.c & t_vtx_x86_gcc.S; enabled STDCALL with codegen (MinGW)
2004-04-13 07:08:34 +00:00 · 2004-04-13 07:08:34 +00:00 · fb7766853d
parent 4ba589bbf0
commit fb7766853d
7 changed files with 471 additions and 287 deletions
--- a/docs/README.3DFX
+++ b/docs/README.3DFX
@ -3,7 +3,7 @@
-Mesa-6.0 release notes:
+Mesa-6.1 release notes:
 -----------------------
 1) Glide2 support has been discontinued; in order to keep Voodoo Rush
@ -26,7 +26,7 @@ DOS (DJGPP), Windows9x/2k (MinGW/MSVC), Linux
 How to compile:
 ---------------
-DJGPP/MinGW/MSVC:
+DJGPP/MinGW:
   Place the Glide3 SDK in the top Mesa directory:
 	$(MESA)/glide3/include/*.h
 	$(MESA)/glide3/lib/
@ -35,11 +35,9 @@ DJGPP/MinGW/MSVC:
   Required libraries:
 	OS specific
   Type:
-	make -f Makefile.DJ HAVE_MMX=1 HAVE_3DNOW=1 FX=1
+	make -f Makefile.DJ X86=1 FX=1
 	or
-	make -f Makefile.mgw HAVE_MMX=1 HAVE_3DNOW=1 FX=1
+	make -f Makefile.mgw X86=1 FX=1
 	or
 	nmake -f Makefile.wfx
   Look into the corresponding makefiles for further information.
 Linux:
--- a/src/mesa/Makefile.mgw
+++ b/src/mesa/Makefile.mgw
@ -134,7 +134,7 @@ x86/matypes.h: x86/gen_matypes.exe
 x86/gen_matypes.exe: x86/gen_matypes.c
 	$(CC) -o $@ $(CFLAGS) -s $<
-# [dBorca] Hack alert:
+# [dBorca]
 # glapi_x86.S needs some adjustments
 # in order to generate correct entrypoints
 x86/glapi_x86.o: x86/glapi_x86.S
@ -142,6 +142,11 @@ x86/glapi_x86.o: x86/glapi_x86.S
 #main/dispatch.o: main/dispatch.c
 #	$(CC) -o $@ $(CFLAGS) -UUSE_X86_ASM -c $<
 # [dBorca]
 # if we want codegen, we have to stdcall
 tnl/t_vtx_x86_gcc.o: tnl/t_vtx_x86_gcc.S
 	$(CC) -o $@ $(CFLAGS) -DSTDCALL_API -c $<
 clean:
 	-$(call UNLINK,array_cache/*.o)
 	-$(call UNLINK,glapi/*.o)
--- a/src/mesa/tnl/t_vtx_api.c
+++ b/src/mesa/tnl/t_vtx_api.c
@ -102,8 +102,10 @@ static void _tnl_wrap_buffers( GLcontext *ctx )
 /* Deal with buffer wrapping where provoked by the vertex buffer
 * filling up, as opposed to upgrade_vertex().
 *
 * Make it GLAPIENTRY, so we can tail-call it from the codegen'ed Vertex*fv
 */
-void _tnl_wrap_filled_vertex( GLcontext *ctx )
+void GLAPIENTRY _tnl_wrap_filled_vertex( GLcontext *ctx )
 {
   TNLcontext *tnl = TNL_CONTEXT(ctx);
   GLfloat *data = tnl->vtx.copied.buffer;
@ -403,7 +405,7 @@ static attrfv_func do_choose( GLuint attr, GLuint sz )
   /* Try to use codegen:
-    */   
+    */
 #ifdef USE_X86_ASM
   if (tnl->AllowCodegen)
      tnl->vtx.tabfv[attr][sz-1] = do_codegen( ctx, attr, sz );
@ -473,11 +475,15 @@ static void reset_attrfv( TNLcontext *tnl )
   for (i = 0 ; i < _TNL_ATTRIB_MAX ; i++) 
      if (tnl->vtx.attrsz[i]) {
-	 GLuint j = tnl->vtx.attrsz[i] - 1;
+	 GLint j = tnl->vtx.attrsz[i] - 1;
 	 tnl->vtx.attrsz[i] = 0;
-	 if (i < _TNL_MAX_ATTR_CODEGEN)
+	 if (i < _TNL_MAX_ATTR_CODEGEN) {
-	    tnl->vtx.tabfv[i][j] = choose[i][j];
+            while (j >= 0) {
 	       tnl->vtx.tabfv[i][j] = choose[i][j];
               j--;
            }
         }
      }
   tnl->vtx.vertex_size = 0;
--- a/src/mesa/tnl/t_vtx_api.h
+++ b/src/mesa/tnl/t_vtx_api.h
@ -49,7 +49,7 @@ extern void _tnl_vtx_destroy( GLcontext *ctx );
 extern void _tnl_FlushVertices( GLcontext *ctx, GLuint flags );
 extern void _tnl_flush_vtx( GLcontext *ctx );
-extern void _tnl_wrap_filled_vertex( GLcontext *ctx );
+extern void GLAPIENTRY _tnl_wrap_filled_vertex( GLcontext *ctx );
 /* t_vtx_exec.c:
 */
--- a/src/mesa/tnl/t_vtx_x86.c
+++ b/src/mesa/tnl/t_vtx_x86.c
@ -60,22 +60,25 @@ EXTERN( _tnl_x86_Vertex2fv );
 EXTERN( _tnl_x86_Vertex3fv );
 EXTERN( _tnl_x86_Vertex4fv );
-EXTERN( _tnl_x86_dispatch_attrf );
+EXTERN( _tnl_x86_dispatch_attrf1 );
 EXTERN( _tnl_x86_dispatch_attrf2 );
 EXTERN( _tnl_x86_dispatch_attrf3 );
 EXTERN( _tnl_x86_dispatch_attrf4 );
 EXTERN( _tnl_x86_dispatch_attrfv );
-EXTERN( _tnl_x86_dispatch_multitexcoordf );
+EXTERN( _tnl_x86_dispatch_multitexcoordf1 );
 EXTERN( _tnl_x86_dispatch_multitexcoordf2 );
 EXTERN( _tnl_x86_dispatch_multitexcoordf3 );
 EXTERN( _tnl_x86_dispatch_multitexcoordf4 );
 EXTERN( _tnl_x86_dispatch_multitexcoordfv );
-EXTERN( _tnl_x86_dispatch_vertexattribf );
+EXTERN( _tnl_x86_dispatch_vertexattribf1 );
 EXTERN( _tnl_x86_dispatch_vertexattribf2 );
 EXTERN( _tnl_x86_dispatch_vertexattribf3 );
 EXTERN( _tnl_x86_dispatch_vertexattribf4 );
 EXTERN( _tnl_x86_dispatch_vertexattribfv );
 EXTERN( _tnl_x86_choose_fv );
 static void notify( void )
 {
   GET_CURRENT_CONTEXT( ctx );
   _tnl_wrap_filled_vertex( ctx );
 }
 #define DONT_KNOW_OFFSETS 1
@ -93,7 +96,7 @@ static void notify( void )
 #define FIXUP( CODE, KNOWN_OFFSET, CHECKVAL, NEWVAL )	\
 do {							\
-   GLuint subst = 0x10101010 + CHECKVAL;		\
+   GLint subst = 0x10101010 + CHECKVAL;			\
 							\
   if (DONT_KNOW_OFFSETS) {				\
      while (*(int *)(CODE+offset) != subst) offset++;	\
@ -112,7 +115,7 @@ do {							\
 #define FIXUPREL( CODE, KNOWN_OFFSET, CHECKVAL, NEWVAL )\
 do {							\
-   GLuint subst = 0x10101010 + CHECKVAL;		\
+   GLint subst = 0x10101010 + CHECKVAL;			\
 							\
   if (DONT_KNOW_OFFSETS) {				\
      while (*(int *)(CODE+offset) != subst) offset++;	\
@ -262,53 +265,16 @@ void _tnl_InitX86Codegen( struct _tnl_dynfn_generators *gen )
 }
-static attrfv_func
+#define MKDISP(FUNC, SIZE, ATTR, WARP)					\
 _do_choose( GLuint attr, GLuint sz )
 {
   return NULL;
 }
 /* I purposely avoided one single macro, since they might need to be
 * handled in different ways.  Ohwell, once things get much clearer,
 * they could collapse...
 */
 #define MAKE_DISPATCH_ATTR(FUNC, SIZE, TYPE, ATTR)			\
 do {									\
   char *code;								\
-   char *start = (char *)&_tnl_x86_dispatch_attr##TYPE;			\
+   char *start = (char *)&WARP;						\
-   char *end = (char *)&_tnl_x86_dispatch_attr##TYPE##_end;		\
+   char *end = (char *)&WARP##_end;					\
   int offset = 0;							\
   code = ALIGN_MALLOC( end - start, 16 );				\
   memcpy (code, start, end - start);					\
   FIXUP(code, 0, 0, (int)&(TNL_CONTEXT(ctx)->vtx.tabfv[ATTR][SIZE-1]));\
-   vfmt->FUNC##SIZE##TYPE = code;					\
+   *(void **)&vfmt->FUNC = code;					\
 } while (0)
 #define MAKE_DISPATCH_MULTITEXCOORD(FUNC, SIZE, TYPE, ATTR)		\
 do {									\
   char *code;								\
   char *start = (char *)&_tnl_x86_dispatch_multitexcoord##TYPE;	\
   char *end = (char *)&_tnl_x86_dispatch_multitexcoord##TYPE##_end;	\
   int offset = 0;							\
   code = ALIGN_MALLOC( end - start, 16 );				\
   memcpy (code, start, end - start);					\
   FIXUP(code, 0, 0, (int)&(TNL_CONTEXT(ctx)->vtx.tabfv[_TNL_ATTRIB_TEX0][SIZE-1]));\
   vfmt->FUNC##SIZE##TYPE##ARB = code;					\
 } while (0)
 #define MAKE_DISPATCH_VERTEXATTRIB(FUNC, SIZE, TYPE, ATTR)		\
 do {									\
   char *code;								\
   char *start = (char *)&_tnl_x86_dispatch_vertexattrib##TYPE;		\
   char *end = (char *)&_tnl_x86_dispatch_vertexattrib##TYPE##_end;	\
   int offset = 0;							\
   code = ALIGN_MALLOC( end - start, 16 );				\
   memcpy (code, start, end - start);					\
   FIXUP(code, 0, 0, (int)&(TNL_CONTEXT(ctx)->vtx.tabfv[0][SIZE-1]));	\
   vfmt->FUNC##SIZE##TYPE##NV = code;					\
 } while (0)
@ -319,48 +285,48 @@ void _tnl_x86_exec_vtxfmt_init( GLcontext *ctx )
 {
   GLvertexformat *vfmt = &(TNL_CONTEXT(ctx)->exec_vtxfmt);
-   MAKE_DISPATCH_ATTR(Color,3,f,     _TNL_ATTRIB_COLOR0);
+   MKDISP(Color3f,             3, _TNL_ATTRIB_COLOR0, _tnl_x86_dispatch_attrf3);
-   MAKE_DISPATCH_ATTR(Color,3,fv,    _TNL_ATTRIB_COLOR0);
+   MKDISP(Color3fv,            3, _TNL_ATTRIB_COLOR0, _tnl_x86_dispatch_attrfv);
-   MAKE_DISPATCH_ATTR(Color,4,f,     _TNL_ATTRIB_COLOR0);
+   MKDISP(Color4f,             4, _TNL_ATTRIB_COLOR0, _tnl_x86_dispatch_attrf4);
-   MAKE_DISPATCH_ATTR(Color,4,fv,    _TNL_ATTRIB_COLOR0);
+   MKDISP(Color4fv,            4, _TNL_ATTRIB_COLOR0, _tnl_x86_dispatch_attrfv);
-/* vfmt->FogCoordfEXT = _tnl_FogCoordfEXT;
+   MKDISP(FogCoordfEXT,        1, _TNL_ATTRIB_FOG,    _tnl_x86_dispatch_attrf1);
-   vfmt->FogCoordfvEXT = _tnl_FogCoordfvEXT;*/
+   MKDISP(FogCoordfvEXT,       1, _TNL_ATTRIB_FOG,    _tnl_x86_dispatch_attrfv);
-   MAKE_DISPATCH_ATTR(Normal,3,f,    _TNL_ATTRIB_NORMAL);
+   MKDISP(Normal3f,            3, _TNL_ATTRIB_NORMAL, _tnl_x86_dispatch_attrf3);
-   MAKE_DISPATCH_ATTR(Normal,3,fv,   _TNL_ATTRIB_NORMAL);
+   MKDISP(Normal3fv,           3, _TNL_ATTRIB_NORMAL, _tnl_x86_dispatch_attrfv);
-/* vfmt->SecondaryColor3fEXT = _tnl_SecondaryColor3fEXT;
+   MKDISP(SecondaryColor3fEXT, 3, _TNL_ATTRIB_COLOR1, _tnl_x86_dispatch_attrf3);
-   vfmt->SecondaryColor3fvEXT = _tnl_SecondaryColor3fvEXT; */
+   MKDISP(SecondaryColor3fvEXT,3, _TNL_ATTRIB_COLOR1, _tnl_x86_dispatch_attrfv);
-   MAKE_DISPATCH_ATTR(TexCoord,1,f,  _TNL_ATTRIB_TEX0);
+   MKDISP(TexCoord1f,          1, _TNL_ATTRIB_TEX0,   _tnl_x86_dispatch_attrf1);
-   MAKE_DISPATCH_ATTR(TexCoord,1,fv, _TNL_ATTRIB_TEX0);
+   MKDISP(TexCoord1fv,         1, _TNL_ATTRIB_TEX0,   _tnl_x86_dispatch_attrfv);
-   MAKE_DISPATCH_ATTR(TexCoord,2,f,  _TNL_ATTRIB_TEX0);
+   MKDISP(TexCoord2f,          2, _TNL_ATTRIB_TEX0,   _tnl_x86_dispatch_attrf2);
-   MAKE_DISPATCH_ATTR(TexCoord,2,fv, _TNL_ATTRIB_TEX0);
+   MKDISP(TexCoord2fv,         2, _TNL_ATTRIB_TEX0,   _tnl_x86_dispatch_attrfv);
-   MAKE_DISPATCH_ATTR(TexCoord,3,f,  _TNL_ATTRIB_TEX0);
+   MKDISP(TexCoord3f,          3, _TNL_ATTRIB_TEX0,   _tnl_x86_dispatch_attrf3);
-   MAKE_DISPATCH_ATTR(TexCoord,3,fv, _TNL_ATTRIB_TEX0);
+   MKDISP(TexCoord3fv,         3, _TNL_ATTRIB_TEX0,   _tnl_x86_dispatch_attrfv);
-   MAKE_DISPATCH_ATTR(TexCoord,4,f,  _TNL_ATTRIB_TEX0);
+   MKDISP(TexCoord4f,          4, _TNL_ATTRIB_TEX0,   _tnl_x86_dispatch_attrf4);
-   MAKE_DISPATCH_ATTR(TexCoord,4,fv, _TNL_ATTRIB_TEX0);
+   MKDISP(TexCoord4fv,         4, _TNL_ATTRIB_TEX0,   _tnl_x86_dispatch_attrfv);
-   MAKE_DISPATCH_ATTR(Vertex,2,f,    _TNL_ATTRIB_POS);
+   MKDISP(Vertex2f,            2, _TNL_ATTRIB_POS,    _tnl_x86_dispatch_attrf2);
-   MAKE_DISPATCH_ATTR(Vertex,2,fv,   _TNL_ATTRIB_POS);
+   MKDISP(Vertex2fv,           2, _TNL_ATTRIB_POS,    _tnl_x86_dispatch_attrfv);
-   MAKE_DISPATCH_ATTR(Vertex,3,f,    _TNL_ATTRIB_POS);
+   MKDISP(Vertex3f,            3, _TNL_ATTRIB_POS,    _tnl_x86_dispatch_attrf3);
-   MAKE_DISPATCH_ATTR(Vertex,3,fv,   _TNL_ATTRIB_POS);
+   MKDISP(Vertex3fv,           3, _TNL_ATTRIB_POS,    _tnl_x86_dispatch_attrfv);
-   MAKE_DISPATCH_ATTR(Vertex,4,f,    _TNL_ATTRIB_POS);
+   MKDISP(Vertex4f,            4, _TNL_ATTRIB_POS,    _tnl_x86_dispatch_attrf4);
-   MAKE_DISPATCH_ATTR(Vertex,4,fv,   _TNL_ATTRIB_POS);
+   MKDISP(Vertex4fv,           4, _TNL_ATTRIB_POS,    _tnl_x86_dispatch_attrfv);
-   MAKE_DISPATCH_MULTITEXCOORD(MultiTexCoord,1,f,  0);
+   MKDISP(MultiTexCoord1fARB,  1, _TNL_ATTRIB_TEX0,   _tnl_x86_dispatch_multitexcoordf1);
-   MAKE_DISPATCH_MULTITEXCOORD(MultiTexCoord,1,fv, 0);
+   MKDISP(MultiTexCoord1fvARB, 1, _TNL_ATTRIB_TEX0,   _tnl_x86_dispatch_multitexcoordfv);
-   MAKE_DISPATCH_MULTITEXCOORD(MultiTexCoord,2,f,  0);
+   MKDISP(MultiTexCoord2fARB,  2, _TNL_ATTRIB_TEX0,   _tnl_x86_dispatch_multitexcoordf2);
-   MAKE_DISPATCH_MULTITEXCOORD(MultiTexCoord,2,fv, 0);
+   MKDISP(MultiTexCoord2fvARB, 2, _TNL_ATTRIB_TEX0,   _tnl_x86_dispatch_multitexcoordfv);
-   MAKE_DISPATCH_MULTITEXCOORD(MultiTexCoord,3,f,  0);
+   MKDISP(MultiTexCoord3fARB,  3, _TNL_ATTRIB_TEX0,   _tnl_x86_dispatch_multitexcoordf3);
-   MAKE_DISPATCH_MULTITEXCOORD(MultiTexCoord,3,fv, 0);
+   MKDISP(MultiTexCoord3fvARB, 3, _TNL_ATTRIB_TEX0,   _tnl_x86_dispatch_multitexcoordfv);
-   MAKE_DISPATCH_MULTITEXCOORD(MultiTexCoord,4,f,  0);
+   MKDISP(MultiTexCoord4fARB,  4, _TNL_ATTRIB_TEX0,   _tnl_x86_dispatch_multitexcoordf4);
-   MAKE_DISPATCH_MULTITEXCOORD(MultiTexCoord,4,fv, 0);
+   MKDISP(MultiTexCoord4fvARB, 4, _TNL_ATTRIB_TEX0,   _tnl_x86_dispatch_multitexcoordfv);
-   MAKE_DISPATCH_VERTEXATTRIB(VertexAttrib,1,f,  0);
+   MKDISP(VertexAttrib1fNV,    1, 0,                  _tnl_x86_dispatch_vertexattribf1);
-   MAKE_DISPATCH_VERTEXATTRIB(VertexAttrib,1,fv, 0);
+   MKDISP(VertexAttrib1fvNV,   1, 0,                  _tnl_x86_dispatch_vertexattribfv);
-   MAKE_DISPATCH_VERTEXATTRIB(VertexAttrib,2,f,  0);
+   MKDISP(VertexAttrib2fNV,    2, 0,                  _tnl_x86_dispatch_vertexattribf2);
-   MAKE_DISPATCH_VERTEXATTRIB(VertexAttrib,2,fv, 0);
+   MKDISP(VertexAttrib2fvNV,   2, 0,                  _tnl_x86_dispatch_vertexattribfv);
-   MAKE_DISPATCH_VERTEXATTRIB(VertexAttrib,3,f,  0);
+   MKDISP(VertexAttrib3fNV,    3, 0,                  _tnl_x86_dispatch_vertexattribf3);
-   MAKE_DISPATCH_VERTEXATTRIB(VertexAttrib,3,fv, 0);
+   MKDISP(VertexAttrib3fvNV,   3, 0,                  _tnl_x86_dispatch_vertexattribfv);
-   MAKE_DISPATCH_VERTEXATTRIB(VertexAttrib,4,f,  0);
+   MKDISP(VertexAttrib4fNV,    4, 0,                  _tnl_x86_dispatch_vertexattribf4);
-   MAKE_DISPATCH_VERTEXATTRIB(VertexAttrib,4,fv, 0);
+   MKDISP(VertexAttrib4fvNV,   4, 0,                  _tnl_x86_dispatch_vertexattribfv);
 }
@ -384,7 +350,7 @@ void _tnl_x86choosers( attrfv_func (*choose)[4],
         FIXUP(code, 0, 0, attr);
         FIXUP(code, 0, 1, size + 1);
         FIXUPREL(code, 0, 2, do_choose);
-         choose[attr][size] = code;
+         choose[attr][size] = (attrfv_func)code;
      }
   }
 }
--- a/src/mesa/tnl/t_vtx_x86_gcc.S
+++ b/src/mesa/tnl/t_vtx_x86_gcc.S
@ -28,97 +28,114 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 /*
 * Authors:
 *   Keith Whitwell <keith@tungstengraphics.com>
 *   Daniel Borca <dborca@yahoo.com>
 */
-#if !defined (__DJGPP__) && !defined (__MINGW32__)
+#if defined (__DJGPP__) || defined (__MINGW32__)
 #define GLOBL( x )	\
 .globl x;		\
 x:
 #else  /* defined(__DJGPP__) || defined (__MINGW32__) */
 #define GLOBL( x )	\
 .globl _##x;		\
 _##x:
 #else  /* !defined (__DJGPP__) && !defined (__MINGW32__) */
 #define GLOBL( x )	\
 .globl x;		\
 x:
 #endif /* !defined (__DJGPP__) && !defined (__MINGW32__) */
 #endif /* defined(__DJGPP__) || defined (__MINGW32__) */
-.data
+#if !defined (STDCALL_API)
-.align 4
+#define RETCLEAN( x )	ret
 #else
 #define RETCLEAN( x )	ret $x
 #endif
-// Someone who knew a lot about this sort of thing would use this
+
-// macro to note current offsets, etc in a special region of the
+#define _JMP(x)		\
-// object file & just make everything work out neat.  I do not know
+.byte 0xe9;		\
-// enough to do that...
+.long x
 #define _CALL(x)	\
 .byte 0xe8;		\
 .long x
 /* Someone who knew a lot about this sort of thing would use this
 * macro to note current offsets, etc in a special region of the
 * object file & just make everything work out neat.  I don't know
 * enough to do that...
 */
 #define SUBST( x ) (0x10101010 + x)
 .data
 // [dBorca] TODO
 // Unfold functions for each vertex size?
 // Build super-specialized SSE versions?
 // STDCALL woes (HAVE_NONSTANDARD_GLAPIENTRY):
 //   need separate routine for the non "fv" case,
 //   to clean up the stack!
 /* [dBorca] TODO
 * Unfold functions for each vertex size?
 * Build super-specialized SSE versions?
 *
 * There is a trick in Vertex*fv: under certain conditions,
 * we tail to _tnl_wrap_filled_vertex(ctx). This means that
 * if Vertex*fv is STDCALL, then _tnl_wrap_filled_vertex must
 * be STDCALL as well, because (GLcontext *) and (GLfloat *)
 * have the same size.
 */
 .align 4
 GLOBL ( _tnl_x86_Vertex1fv )
 	movl	4(%esp), %ecx
 	push	%edi
 	push	%esi
-	movl	SUBST(0), %edi	# 0x0 --> tnl->vtx.vbptr
+	movl	SUBST(0), %edi	/* 0x0 --> tnl->vtx.vbptr */
-	movl	(%ecx), %edx	# load v[0]
+	movl	(%ecx), %edx	/* load v[0] */
-	movl	%edx, (%edi)	# tnl->vtx.vbptr[0] = v[0]
+	movl	%edx, (%edi)	/* tnl->vtx.vbptr[0] = v[0] */
-	addl	$4, %edi	# tnl->vtx.vbptr += 1
+	addl	$4, %edi	/* tnl->vtx.vbptr += 1 */
-	movl	$SUBST(1), %ecx	# 0x1 --> (tnl->vtx.vertex_size - 1)
+	movl	$SUBST(1), %ecx	/* 0x1 --> (tnl->vtx.vertex_size - 1) */
-	movl	$SUBST(2), %esi	# 0x2 --> (tnl->vtx.vertex + 1)
+	movl	$SUBST(2), %esi	/* 0x2 --> (tnl->vtx.vertex + 1) */
 	repz
 	movsl	%ds:(%esi), %es:(%edi)
-	movl	%edi, SUBST(0)	# 0x0 --> tnl->vtx.vbptr
+	movl	%edi, SUBST(0)	/* 0x0 --> tnl->vtx.vbptr */
-	movl	SUBST(3), %edx	# 0x3 --> counter
+	movl	SUBST(3), %edx	/* 0x3 --> counter */
 	pop	%esi
 	pop	%edi
-	dec	%edx		# counter--
+	dec	%edx		/* counter-- */
-	movl	%edx, SUBST(3)	# 0x3 --> counter
+	movl	%edx, SUBST(3)	/* 0x3 --> counter */
-	jne	.0		# if (counter != 0) return
+	je	.0		/* if (counter == 0) goto .0 */
-	pushl	$SUBST(4)	# 0x4 --> ctx
+	RETCLEAN(4)		/* return */
-	.byte	0xe8		# call ...
+	.balign	16
 	.long	SUBST(5)	# ... _tnl_wrap_filled_vertex(ctx)
 	pop	%eax
 .0:
-	ret			# return
+	movl	$SUBST(4), %eax	/* load ctx */
 	movl	%eax, 4(%esp)	/* push ctx */
 	_JMP	(SUBST(5))	/* jmp _tnl_wrap_filled_vertex */
 GLOBL ( _tnl_x86_Vertex1fv_end )
 .align 4
 GLOBL ( _tnl_x86_Vertex2fv )
 	movl	4(%esp), %ecx
 	push	%edi
 	push	%esi
-	movl	SUBST(0), %edi	# load tnl->vtx.vbptr
+	movl	SUBST(0), %edi	/* load tnl->vtx.vbptr */
-	movl	(%ecx), %edx	# load v[0]
+	movl	(%ecx), %edx	/* load v[0] */
-	movl	4(%ecx), %eax	# load v[1]
+	movl	4(%ecx), %eax	/* load v[1] */
-	movl	%edx, (%edi)	# tnl->vtx.vbptr[0] = v[0]
+	movl	%edx, (%edi)	/* tnl->vtx.vbptr[0] = v[0] */
-	movl	%eax, 4(%edi)	# tnl->vtx.vbptr[1] = v[1]
+	movl	%eax, 4(%edi)	/* tnl->vtx.vbptr[1] = v[1] */
-	addl	$8, %edi	# tnl->vtx.vbptr += 2
+	addl	$8, %edi	/* tnl->vtx.vbptr += 2 */
-	movl	$SUBST(1), %ecx	# vertex_size - 2
+	movl	$SUBST(1), %ecx	/* vertex_size - 2 */
-	movl	$SUBST(2), %esi	# tnl->vtx.vertex + 2
+	movl	$SUBST(2), %esi	/* tnl->vtx.vertex + 2 */
 	repz
 	movsl	%ds:(%esi), %es:(%edi)
-	movl	%edi, SUBST(0)	# save tnl->vtx.vbptr
+	movl	%edi, SUBST(0)	/* save tnl->vtx.vbptr */
-	movl	SUBST(3), %edx	# load counter
+	movl	SUBST(3), %edx	/* load counter */
 	pop	%esi
 	pop	%edi
-	dec	%edx		# counter--
+	dec	%edx		/* counter-- */
-	movl	%edx, SUBST(3)	# save counter
+	movl	%edx, SUBST(3)	/* save counter */
-	jne	.1		# if (counter != 0) return
+	je	.1		/* if (counter == 0) goto .1 */
-	pushl	$SUBST(4)	# load ctx
+	RETCLEAN(4)		/* return */
-	.byte	0xe8		# call ...
+	.balign	16
 	.long	SUBST(5)	# ... _tnl_wrap_filled_vertex(ctx)
 	pop	%eax
 .1:
-	ret			# return
+	movl	$SUBST(4), %eax	/* load ctx */
 	movl	%eax, 4(%esp)	/* push ctx */
 	_JMP	(SUBST(5))	/* jmp _tnl_wrap_filled_vertex */
 GLOBL ( _tnl_x86_Vertex2fv_end )
 .align 4
@ -126,92 +143,88 @@ GLOBL ( _tnl_x86_Vertex3fv )
 	movl	4(%esp), %ecx
 	push	%edi
 	push	%esi
-	movl	SUBST(0), %edi	# load tnl->vtx.vbptr
+	movl	SUBST(0), %edi	/* load tnl->vtx.vbptr */
-	movl	(%ecx), %edx	# load v[0]
+	movl	(%ecx), %edx	/* load v[0] */
-	movl	4(%ecx), %eax	# load v[1]
+	movl	4(%ecx), %eax	/* load v[1] */
-	movl	8(%ecx), %esi	# load v[2]
+	movl	8(%ecx), %esi	/* load v[2] */
-	movl	%edx, (%edi)	# tnl->vtx.vbptr[0] = v[0]
+	movl	%edx, (%edi)	/* tnl->vtx.vbptr[0] = v[0] */
-	movl	%eax, 4(%edi)	# tnl->vtx.vbptr[1] = v[1]
+	movl	%eax, 4(%edi)	/* tnl->vtx.vbptr[1] = v[1] */
-	movl	%esi, 8(%edi)	# tnl->vtx.vbptr[2] = v[2]
+	movl	%esi, 8(%edi)	/* tnl->vtx.vbptr[2] = v[2] */
-	addl	$12, %edi	# tnl->vtx.vbptr += 3
+	addl	$12, %edi	/* tnl->vtx.vbptr += 3 */
-	movl	$SUBST(1), %ecx	# vertex_size - 3
+	movl	$SUBST(1), %ecx	/* vertex_size - 3 */
-	movl	$SUBST(2), %esi	# tnl->vtx.vertex + 3
+	movl	$SUBST(2), %esi	/* tnl->vtx.vertex + 3 */
 	repz
 	movsl	%ds:(%esi), %es:(%edi)
-	movl	%edi, SUBST(0)	# save tnl->vtx.vbptr
+	movl	%edi, SUBST(0)	/* save tnl->vtx.vbptr */
-	movl	SUBST(3), %edx	# load counter
+	movl	SUBST(3), %edx	/* load counter */
 	pop	%esi
 	pop	%edi
-	dec	%edx		# counter--
+	dec	%edx		/* counter-- */
-	movl	%edx, SUBST(3)	# save counter
+	movl	%edx, SUBST(3)	/* save counter */
-	jne	.2		# if (counter != 0) return
+	je	.2		/* if (counter == 0) goto .2 */
-	pushl	$SUBST(4)	# load ctx
+	RETCLEAN(4)		/* return */
-	.byte	0xe8		# call ...
+	.balign	16
 	.long	SUBST(5)	# ... _tnl_wrap_filled_vertex(ctx)
 	pop	%eax
 .2:
-	ret			# return
+	movl	$SUBST(4), %eax	/* load ctx */
 	movl	%eax, 4(%esp)	/* push ctx */
 	_JMP	(SUBST(5))	/* jmp _tnl_wrap_filled_vertex */
 GLOBL ( _tnl_x86_Vertex3fv_end )
 .align 4
 GLOBL ( _tnl_x86_Vertex4fv )
 	movl	4(%esp), %ecx
 	push	%edi
 	push	%esi
-	movl	SUBST(0), %edi	# load tnl->vtx.vbptr
+	movl	SUBST(0), %edi	/* load tnl->vtx.vbptr */
-	movl	(%ecx), %edx	# load v[0]
+	movl	(%ecx), %edx	/* load v[0] */
-	movl	4(%ecx), %eax	# load v[1]
+	movl	4(%ecx), %eax	/* load v[1] */
-	movl	8(%ecx), %esi	# load v[2]
+	movl	8(%ecx), %esi	/* load v[2] */
-	movl	12(%ecx), %ecx	# load v[3]
+	movl	12(%ecx), %ecx	/* load v[3] */
-	movl	%edx, (%edi)	# tnl->vtx.vbptr[0] = v[0]
+	movl	%edx, (%edi)	/* tnl->vtx.vbptr[0] = v[0] */
-	movl	%eax, 4(%edi)	# tnl->vtx.vbptr[1] = v[1]
+	movl	%eax, 4(%edi)	/* tnl->vtx.vbptr[1] = v[1] */
-	movl	%esi, 8(%edi)	# tnl->vtx.vbptr[2] = v[2]
+	movl	%esi, 8(%edi)	/* tnl->vtx.vbptr[2] = v[2] */
-	movl	%ecx, 12(%edi)	# tnl->vtx.vbptr[3] = v[3]
+	movl	%ecx, 12(%edi)	/* tnl->vtx.vbptr[3] = v[3] */
-	addl	$16, %edi	# tnl->vtx.vbptr += 4
+	addl	$16, %edi	/* tnl->vtx.vbptr += 4 */
-	movl	$SUBST(1), %ecx	# vertex_size - 4
+	movl	$SUBST(1), %ecx	/* vertex_size - 4 */
-	movl	$SUBST(2), %esi	# tnl->vtx.vertex + 3
+	movl	$SUBST(2), %esi	/* tnl->vtx.vertex + 4 */
 	repz
 	movsl	%ds:(%esi), %es:(%edi)
-	movl	%edi, SUBST(0)	# save tnl->vtx.vbptr
+	movl	%edi, SUBST(0)	/* save tnl->vtx.vbptr */
-	movl	SUBST(3), %edx	# load counter
+	movl	SUBST(3), %edx	/* load counter */
 	pop	%esi
 	pop	%edi
-	dec	%edx		# counter--
+	dec	%edx		/* counter-- */
-	movl	%edx, SUBST(3)	# save counter
+	movl	%edx, SUBST(3)	/* save counter */
-	jne	.3		# if (counter != 0) return
+	je	.3		/* if (counter == 0) goto .3 */
-	pushl	$SUBST(4)	# load ctx
+	RETCLEAN(4)		/* return */
-	.byte	0xe8		# call ...
+	.balign	16
 	.long	SUBST(5)	# ... _tnl_wrap_filled_vertex(ctx)
 	pop	%eax
 .3:
-	ret			# return
+	movl	$SUBST(4), %eax	/* load ctx */
 	movl	%eax, 4(%esp)	/* push ctx */
 	_JMP	(SUBST(5))	/* jmp _tnl_wrap_filled_vertex */
 GLOBL ( _tnl_x86_Vertex4fv_end )
 /**
 * Generic handlers for vector format data. 
 */
-
+GLOBL( _tnl_x86_Attribute1fv )
 GLOBL( _tnl_x86_Attribute1fv)
 	movl	4(%esp), %ecx
 	movl	(%ecx), %eax	/* load v[0] */
 	movl	%eax, SUBST(0)	/* store v[0] to current vertex */
-	ret
+	RETCLEAN(4)
 GLOBL ( _tnl_x86_Attribute1fv_end )
-GLOBL( _tnl_x86_Attribute2fv)
+GLOBL( _tnl_x86_Attribute2fv )
 	movl	4(%esp), %ecx
 	movl	(%ecx), %eax	/* load v[0] */
 	movl	4(%ecx), %edx	/* load v[1] */
 	movl	%eax, SUBST(0)	/* store v[0] to current vertex */
 	movl	%edx, SUBST(1)	/* store v[1] to current vertex */
-	ret
+	RETCLEAN(4)
 GLOBL ( _tnl_x86_Attribute2fv_end )
-
+GLOBL( _tnl_x86_Attribute3fv )
 GLOBL( _tnl_x86_Attribute3fv)
 	movl	4(%esp), %ecx
 	movl	(%ecx), %eax	/* load v[0] */
 	movl	4(%ecx), %edx	/* load v[1] */
@ -219,10 +232,10 @@ GLOBL( _tnl_x86_Attribute3fv)
 	movl	%eax, SUBST(0)	/* store v[0] to current vertex */
 	movl	%edx, SUBST(1)	/* store v[1] to current vertex */
 	movl	%ecx, SUBST(2)	/* store v[2] to current vertex */
-	ret
+	RETCLEAN(4)
 GLOBL ( _tnl_x86_Attribute3fv_end )
-GLOBL( _tnl_x86_Attribute4fv)
+GLOBL( _tnl_x86_Attribute4fv )
 	movl	4(%esp), %ecx
 	movl	(%ecx), %eax	/* load v[0] */
 	movl	4(%ecx), %edx	/* load v[1] */
@ -232,84 +245,131 @@ GLOBL( _tnl_x86_Attribute4fv)
 	movl	12(%ecx), %edx	/* load v[3] */
 	movl	%eax, SUBST(2)	/* store v[2] to current vertex */
 	movl	%edx, SUBST(3)	/* store v[3] to current vertex */
-	ret
+	RETCLEAN(4)
 GLOBL ( _tnl_x86_Attribute4fv_end )
-// Choosers:
+/* Choosers:
-
+ *
-// Must generate all of these ahead of first usage.  Generate at
+ * Must generate all of these ahead of first usage.  Generate at
-// compile-time?  
+ * compile-time?
-
+ */
-
+GLOBL( _tnl_x86_choose_fv )
-GLOBL( _tnl_x86_choose_fv)
+	subl	$12, %esp	/* gcc does 16 byte alignment of stack frames? */
-	subl	$12, %esp	# gcc does 16 byte alignment of stack frames?
+	movl	$SUBST(0), (%esp)	/* arg 0 - attrib */
-	movl	$SUBST(0), (%esp)	# arg 0 - attrib
+	movl	$SUBST(1), 4(%esp)	/* arg 1 - N */
-	movl	$SUBST(1), 4(%esp)	# arg 1 - N
+	_CALL	(SUBST(2))		/* call do_choose */
-	.byte	0xe8			# call ...
+	add	$12, %esp		/* tear down stack frame */
-	.long	SUBST(2)		# ... do_choose
+	jmp	*%eax			/* jump to new func */
 	add	$12, %esp		# tear down stack frame
 	jmp	*%eax			# jump to new func
 GLOBL ( _tnl_x86_choose_fv_end )
 /* FIRST LEVEL FUNCTIONS -- these are plugged directly into GL dispatch.
 *
 * In the 1st level dispatch functions, switch to a different
 * calling convention -- (const GLfloat *v) in %ecx.
 *
 * As with regular (x86) dispatch, don't create a new stack frame -
 * just let the 'ret' in the dispatched function return straight
 * back to the original caller.
 *
 * Vertex/Normal/Color, etc: the address of the function pointer
 * is known at codegen time.
 */
-// FIRST LEVEL FUNCTIONS -- these are plugged directly into GL dispatch.
+/* Unfortunately, have to play with the stack in the non-fv case:
 */
 #if !defined (STDCALL_API)
 GLOBL( _tnl_x86_dispatch_attrf1 )
 GLOBL( _tnl_x86_dispatch_attrf2 )
 GLOBL( _tnl_x86_dispatch_attrf3 )
 GLOBL( _tnl_x86_dispatch_attrf4 )
 	subl	$12, %esp	/* gcc does 16 byte alignment of stack frames? */
 	leal	16(%esp), %edx	/* address of first float on stack */
 	movl	%edx, (%esp)	/* save as 'v' */
 	call	*SUBST(0)	/* 0x0 --> tabfv[attr][n] */
 	addl	$12, %esp	/* tear down frame */
 	ret			/* return */
 GLOBL( _tnl_x86_dispatch_attrf4_end )
 GLOBL( _tnl_x86_dispatch_attrf3_end )
 GLOBL( _tnl_x86_dispatch_attrf2_end )
 GLOBL( _tnl_x86_dispatch_attrf1_end )
 #else  /* defined(STDCALL_API) */
 GLOBL( _tnl_x86_dispatch_attrf1 )
 	subl	$12, %esp	/* gcc does 16 byte alignment of stack frames? */
 	leal	16(%esp), %edx	/* address of first float on stack */
 	movl	%edx, (%esp)	/* save as 'v' */
 	call	*SUBST(0)	/* 0x0 --> tabfv[attr][n] */
 	addl	$8, %esp	/* tear down frame (4 shaved off by the callee) */
 	ret	$4		/* return */
 GLOBL( _tnl_x86_dispatch_attrf1_end )
-// In the 1st level dispatch functions, switch to a different
+GLOBL( _tnl_x86_dispatch_attrf2 )
-// calling convention -- (const GLfloat *v) in %ecx.
+	subl	$12, %esp	/* gcc does 16 byte alignment of stack frames? */
-// 
+	leal	16(%esp), %edx	/* address of first float on stack */
-// As with regular (x86) dispatch, do not create a new stack frame -
+	movl	%edx, (%esp)	/* save as 'v' */
-// just let the 'ret' in the dispatched function return straight
+	call	*SUBST(0)	/* 0x0 --> tabfv[attr][n] */
-// back to the original caller.
+	addl	$8, %esp	/* tear down frame (4 shaved off by the callee) */
 	ret	$8		/* return */
 GLOBL( _tnl_x86_dispatch_attrf2_end )
 GLOBL( _tnl_x86_dispatch_attrf3 )
 	subl	$12, %esp	/* gcc does 16 byte alignment of stack frames? */
 	leal	16(%esp), %edx	/* address of first float on stack */
 	movl	%edx, (%esp)	/* save as 'v' */
 	call	*SUBST(0)	/* 0x0 --> tabfv[attr][n] */
 	addl	$8, %esp	/* tear down frame (4 shaved off by the callee) */
 	ret	$12		/* return */
 GLOBL( _tnl_x86_dispatch_attrf3_end )
 GLOBL( _tnl_x86_dispatch_attrf4 )
 	subl	$12, %esp	/* gcc does 16 byte alignment of stack frames? */
 	leal	16(%esp), %edx	/* address of first float on stack */
 	movl	%edx, (%esp)	/* save as 'v' */
 	call	*SUBST(0)	/* 0x0 --> tabfv[attr][n] */
 	addl	$8, %esp	/* tear down frame (4 shaved off by the callee) */
 	ret	$16		/* return */
 GLOBL( _tnl_x86_dispatch_attrf4_end )
 #endif /* defined(STDCALL_API) */
-// Vertex/Normal/Color, etc: the address of the function pointer
+/* The fv case is simpler:
-// is known at codegen time.
+ */
 // Unfortunately, have to play with the stack in the non-fv case:
 // 
 GLOBL( _tnl_x86_dispatch_attrf )
 	subl	$12, %esp	# gcc does 16 byte alignment of stack frames?
 	leal	16(%esp), %edx	# address of first float on stack
 	movl	%edx, (%esp)	# save as 'v'
 	call	*SUBST(0)	# 0x0 --> tabfv[attr][n]
 	addl	$12, %esp	# tear down frame
 	ret			# return
 GLOBL( _tnl_x86_dispatch_attrf_end )
 // The fv case is simpler:
 // 
 GLOBL( _tnl_x86_dispatch_attrfv )
-	jmp	*SUBST(0)	# 0x0 --> tabfv[attr][n]
+	jmp	*SUBST(0)	/* 0x0 --> tabfv[attr][n] */
 GLOBL( _tnl_x86_dispatch_attrfv_end )
-// MultiTexcoord: the address of the function pointer must be
+/* MultiTexcoord: the address of the function pointer must be
-// calculated, but can use the index argument slot to hold 'v', and
+ * calculated, but can use the index argument slot to hold 'v', and
-// avoid setting up a new stack frame.
+ * avoid setting up a new stack frame.
-//
+ *
-// [dBorca]
+ * [dBorca]
-// right, this would be the preferred approach, but gcc does not
+ * right, this would be the preferred approach, but gcc does not
-// clean up the stack after each function call when optimizing (-fdefer-pop);
+ * clean up the stack after each function call when optimizing (-fdefer-pop);
-// can it make assumptions about what is already on the stack?  I dunno,
+ * can it make assumptions about what's already on the stack?  I dunno,
-// but in this case, we can't mess with the caller's stack frame, and
+ * but in this case, we can't mess with the caller's stack frame, and
-// we must use a model like '_x86_dispatch_attrfv' above.  Caveat emptor!
+ * we must use a model like `_x86_dispatch_attrfv' above.  Caveat emptor!
 */
-// Also, will only need a maximum of four of each of these per context:
+/* Also, will only need a maximum of four of each of these per context:
-// 
+ */
-GLOBL( _tnl_x86_dispatch_multitexcoordf )
+#if !defined (STDCALL_API)
 GLOBL( _tnl_x86_dispatch_multitexcoordf1 )
 GLOBL( _tnl_x86_dispatch_multitexcoordf2 )
 GLOBL( _tnl_x86_dispatch_multitexcoordf3 )
 GLOBL( _tnl_x86_dispatch_multitexcoordf4 )
 	movl	4(%esp), %ecx
 	leal	8(%esp), %edx
 	andl	$7, %ecx
 	movl	%edx, 4(%esp)
 	sall	$4, %ecx
-	jmp	*SUBST(0)(%ecx)	# 0x0 - tabfv[tex0][n]
+	jmp	*SUBST(0)(%ecx)	/* 0x0 - tabfv[tex0][n] */
-GLOBL( _tnl_x86_dispatch_multitexcoordf_end )
+GLOBL( _tnl_x86_dispatch_multitexcoordf4_end )
 GLOBL( _tnl_x86_dispatch_multitexcoordf3_end )
 GLOBL( _tnl_x86_dispatch_multitexcoordf2_end )
 GLOBL( _tnl_x86_dispatch_multitexcoordf1_end )
 GLOBL( _tnl_x86_dispatch_multitexcoordfv )
 	movl	4(%esp), %ecx
@ -317,32 +377,181 @@ GLOBL( _tnl_x86_dispatch_multitexcoordfv )
 	andl	$7, %ecx
 	movl	%edx, 4(%esp)
 	sall	$4, %ecx
-	jmp	*SUBST(0)(%ecx)	# 0x0 - tabfv[tex0][n]
+	jmp	*SUBST(0)(%ecx)	/* 0x0 - tabfv[tex0][n] */
 GLOBL( _tnl_x86_dispatch_multitexcoordfv_end )
-// VertexAttrib: the address of the function pointer must be
+#else  /* defined (STDCALL_API) */
 // calculated.
-GLOBL( _tnl_x86_dispatch_vertexattribf )
+GLOBL( _tnl_x86_dispatch_multitexcoordf1 )
 	/* STDCALL_API variant, one component: builds a private frame, calls the
 	 * handler selected by the first argument with 'v' as its only stack
 	 * argument, then returns popping the caller's arguments itself.
 	 */
 	subl	$12, %esp	/* 12 bytes + 4-byte return address = 16; gcc does 16 byte alignment of stack frames? */
 	movl	16(%esp), %ecx	/* first caller argument (was at 4(%esp) before the subl) */
 	leal	20(%esp), %edx	/* 'v': address of the second caller argument */
 	andl	$7, %ecx	/* keep the low three bits: index 0..7 */
 	movl	%edx, (%esp)	/* outgoing argument for the callee */
 	sall	$4, %ecx	/* scale the index by 16 bytes */
 	call	*SUBST(0)(%ecx)	/* 0x0 - tabfv[tex0][n] */
 	addl	$8, %esp	/* tear down frame (4 shaved off by the callee) */
 	ret	$8		/* return, popping 8 bytes: first argument + one 4-byte component */
 GLOBL( _tnl_x86_dispatch_multitexcoordf1_end )
 GLOBL( _tnl_x86_dispatch_multitexcoordf2 )
 	/* STDCALL_API variant, two components: builds a private frame, calls the
 	 * handler selected by the first argument with 'v' as its only stack
 	 * argument, then returns popping the caller's arguments itself.
 	 */
 	subl	$12, %esp	/* 12 bytes + 4-byte return address = 16; gcc does 16 byte alignment of stack frames? */
 	movl	16(%esp), %ecx	/* first caller argument (was at 4(%esp) before the subl) */
 	leal	20(%esp), %edx	/* 'v': address of the second caller argument */
 	andl	$7, %ecx	/* keep the low three bits: index 0..7 */
 	movl	%edx, (%esp)	/* outgoing argument for the callee */
 	sall	$4, %ecx	/* scale the index by 16 bytes */
 	call	*SUBST(0)(%ecx)	/* 0x0 - tabfv[tex0][n] */
 	addl	$8, %esp	/* tear down frame (4 shaved off by the callee) */
 	ret	$12		/* return, popping 12 bytes: first argument + two 4-byte components */
 GLOBL( _tnl_x86_dispatch_multitexcoordf2_end )
 GLOBL( _tnl_x86_dispatch_multitexcoordf3 )
 	/* STDCALL_API variant, three components: builds a private frame, calls
 	 * the handler selected by the first argument with 'v' as its only stack
 	 * argument, then returns popping the caller's arguments itself.
 	 */
 	subl	$12, %esp	/* 12 bytes + 4-byte return address = 16; gcc does 16 byte alignment of stack frames? */
 	movl	16(%esp), %ecx	/* first caller argument (was at 4(%esp) before the subl) */
 	leal	20(%esp), %edx	/* 'v': address of the second caller argument */
 	andl	$7, %ecx	/* keep the low three bits: index 0..7 */
 	movl	%edx, (%esp)	/* outgoing argument for the callee */
 	sall	$4, %ecx	/* scale the index by 16 bytes */
 	call	*SUBST(0)(%ecx)	/* 0x0 - tabfv[tex0][n] */
 	addl	$8, %esp	/* tear down frame (4 shaved off by the callee) */
 	ret	$16		/* return, popping 16 bytes: first argument + three 4-byte components */
 GLOBL( _tnl_x86_dispatch_multitexcoordf3_end )
 GLOBL( _tnl_x86_dispatch_multitexcoordf4 )
 	/* STDCALL_API variant, four components: builds a private frame, calls
 	 * the handler selected by the first argument with 'v' as its only stack
 	 * argument, then returns popping the caller's arguments itself.
 	 */
 	subl	$12, %esp	/* 12 bytes + 4-byte return address = 16; gcc does 16 byte alignment of stack frames? */
 	movl	16(%esp), %ecx	/* first caller argument (was at 4(%esp) before the subl) */
 	leal	20(%esp), %edx	/* 'v': address of the second caller argument */
 	andl	$7, %ecx	/* keep the low three bits: index 0..7 */
 	movl	%edx, (%esp)	/* outgoing argument for the callee */
 	sall	$4, %ecx	/* scale the index by 16 bytes */
 	call	*SUBST(0)(%ecx)	/* 0x0 - tabfv[tex0][n] */
 	addl	$8, %esp	/* tear down frame (4 shaved off by the callee) */
 	ret	$20		/* return, popping 20 bytes: first argument + four 4-byte components */
 GLOBL( _tnl_x86_dispatch_multitexcoordf4_end )
 GLOBL( _tnl_x86_dispatch_multitexcoordfv )
 	/* STDCALL_API variant, pointer form: here 'v' is already the second
 	 * caller argument, so it is loaded (movl) rather than computed (leal)
 	 * before being handed to the handler selected by the first argument.
 	 */
 	subl	$12, %esp	/* 12 bytes + 4-byte return address = 16; gcc does 16 byte alignment of stack frames? */
 	movl	16(%esp), %ecx	/* first caller argument (was at 4(%esp) before the subl) */
 	movl	20(%esp), %edx	/* 'v': value of the second caller argument */
 	andl	$7, %ecx	/* keep the low three bits: index 0..7 */
 	movl	%edx, (%esp)	/* outgoing argument for the callee */
 	sall	$4, %ecx	/* scale the index by 16 bytes */
 	call	*SUBST(0)(%ecx)	/* 0x0 - tabfv[tex0][n] */
 	addl	$8, %esp	/* tear down frame (4 shaved off by the callee) */
 	ret	$8		/* return, popping 8 bytes: first argument + the 'v' pointer */
 GLOBL( _tnl_x86_dispatch_multitexcoordfv_end )
 #endif /* defined (STDCALL_API) */
 /* VertexAttrib: the address of the function pointer must be
 * calculated.
 */
 #if !defined (STDCALL_API)
 GLOBL( _tnl_x86_dispatch_vertexattribf1 )
 GLOBL( _tnl_x86_dispatch_vertexattribf2 )
 GLOBL( _tnl_x86_dispatch_vertexattribf3 )
 GLOBL( _tnl_x86_dispatch_vertexattribf4 )
 	/* Non-STDCALL variant: the four entry names are declared back to back
 	 * in front of a single body, which clamps the first argument to 16,
 	 * stores 'v' in its slot and jumps to the selected handler.
 	 */
 	movl	4(%esp), %eax	/* first argument */
 	cmpl	$16, %eax	/* unsigned compare against 16 */
-	jb	.8		# "cmovge" is not supported on all CPUs
+	jb	.8		/* "cmovge" is not supported on all CPUs */
 	movl	$16, %eax	/* any value >= 16 is replaced by 16 */
 .8:
-	leal	8(%esp), %ecx	# calculate 'v'
+	leal	8(%esp), %ecx	/* calculate 'v' */
-	movl	%ecx, 4(%esp)	# save in 1st arg slot
+	movl	%ecx, 4(%esp)	/* save in 1st arg slot */
 	sall	$4, %eax	/* scale the index by 16 bytes */
-	jmp	*SUBST(0)(%eax)	# 0x0 - tabfv[0][n]
+	jmp	*SUBST(0)(%eax)	/* 0x0 - tabfv[0][n] */
-GLOBL( _tnl_x86_dispatch_vertexattribf_end )
+GLOBL( _tnl_x86_dispatch_vertexattribf4_end )
 GLOBL( _tnl_x86_dispatch_vertexattribf3_end )
 GLOBL( _tnl_x86_dispatch_vertexattribf2_end )
 GLOBL( _tnl_x86_dispatch_vertexattribf1_end )
 GLOBL( _tnl_x86_dispatch_vertexattribfv )
 	/* Non-STDCALL variant, pointer form: clamps the first argument to 16,
 	 * moves the 'v' pointer from the second slot into the first and jumps
 	 * to the selected handler.
 	 */
 	movl	4(%esp), %eax	/* first argument */
 	cmpl	$16, %eax	/* unsigned compare against 16 */
-	jb	.9		# "cmovge" is not supported on all CPUs
+	jb	.9		/* "cmovge" is not supported on all CPUs */
 	movl	$16, %eax	/* any value >= 16 is replaced by 16 */
 .9:
-	movl	8(%esp), %ecx	# load 'v'
+	movl	8(%esp), %ecx	/* load 'v' */
-	movl	%ecx, 4(%esp)	# save in 1st arg slot
+	movl	%ecx, 4(%esp)	/* save in 1st arg slot */
 	sall	$4, %eax	/* scale the index by 16 bytes */
-	jmp	*SUBST(0)(%eax)	# 0x0 - tabfv[0][n]
+	jmp	*SUBST(0)(%eax)	/* 0x0 - tabfv[0][n] */
 GLOBL( _tnl_x86_dispatch_vertexattribfv_end )
 #else  /* defined (STDCALL_API) */
 GLOBL( _tnl_x86_dispatch_vertexattribf1 )
 	/* STDCALL_API variant, one component: clamps the first argument to 16,
 	 * calls the selected handler with 'v' as its only stack argument, then
 	 * returns popping the caller's arguments itself.
 	 */
 	subl	$12, %esp	/* 12 bytes + 4-byte return address = 16; gcc does 16 byte alignment of stack frames? */
 	movl	16(%esp), %eax	/* first caller argument (was at 4(%esp) before the subl) */
 	cmpl	$16, %eax	/* unsigned compare against 16 */
 	jb	.81		/* "cmovge" is not supported on all CPUs */
 	movl	$16, %eax	/* any value >= 16 is replaced by 16 */
 .81:
 	leal	20(%esp), %ecx	/* calculate 'v': address of the second caller argument */
 	movl	%ecx, (%esp)	/* outgoing argument for the callee */
 	sall	$4, %eax	/* scale the index by 16 bytes */
 	call	*SUBST(0)(%eax)	/* 0x0 - tabfv[0][n] */
 	addl	$8, %esp	/* tear down frame (4 shaved off by the callee) */
 	ret	$8		/* return, popping 8 bytes: first argument + one 4-byte component */
 GLOBL( _tnl_x86_dispatch_vertexattribf1_end )
 GLOBL( _tnl_x86_dispatch_vertexattribf2 )
 	/* STDCALL_API variant, two components: clamps the first argument to 16,
 	 * calls the selected handler with 'v' as its only stack argument, then
 	 * returns popping the caller's arguments itself.
 	 */
 	subl	$12, %esp	/* 12 bytes + 4-byte return address = 16; gcc does 16 byte alignment of stack frames? */
 	movl	16(%esp), %eax	/* first caller argument (was at 4(%esp) before the subl) */
 	cmpl	$16, %eax	/* unsigned compare against 16 */
 	jb	.82		/* "cmovge" is not supported on all CPUs */
 	movl	$16, %eax	/* any value >= 16 is replaced by 16 */
 .82:
 	leal	20(%esp), %ecx	/* calculate 'v': address of the second caller argument */
 	movl	%ecx, (%esp)	/* outgoing argument for the callee */
 	sall	$4, %eax	/* scale the index by 16 bytes */
 	call	*SUBST(0)(%eax)	/* 0x0 - tabfv[0][n] */
 	addl	$8, %esp	/* tear down frame (4 shaved off by the callee) */
 	ret	$12		/* return, popping 12 bytes: first argument + two 4-byte components */
 GLOBL( _tnl_x86_dispatch_vertexattribf2_end )
 GLOBL( _tnl_x86_dispatch_vertexattribf3 )
 	/* STDCALL_API variant, three components: clamps the first argument to
 	 * 16, calls the selected handler with 'v' as its only stack argument,
 	 * then returns popping the caller's arguments itself.
 	 */
 	subl	$12, %esp	/* 12 bytes + 4-byte return address = 16; gcc does 16 byte alignment of stack frames? */
 	movl	16(%esp), %eax	/* first caller argument (was at 4(%esp) before the subl) */
 	cmpl	$16, %eax	/* unsigned compare against 16 */
 	jb	.83		/* "cmovge" is not supported on all CPUs */
 	movl	$16, %eax	/* any value >= 16 is replaced by 16 */
 .83:
 	leal	20(%esp), %ecx	/* calculate 'v': address of the second caller argument */
 	movl	%ecx, (%esp)	/* outgoing argument for the callee */
 	sall	$4, %eax	/* scale the index by 16 bytes */
 	call	*SUBST(0)(%eax)	/* 0x0 - tabfv[0][n] */
 	addl	$8, %esp	/* tear down frame (4 shaved off by the callee) */
 	ret	$16		/* return, popping 16 bytes: first argument + three 4-byte components */
 GLOBL( _tnl_x86_dispatch_vertexattribf3_end )
 GLOBL( _tnl_x86_dispatch_vertexattribf4 )
 	/* STDCALL_API variant, four components: clamps the first argument to
 	 * 16, calls the selected handler with 'v' as its only stack argument,
 	 * then returns popping the caller's arguments itself.
 	 */
 	subl	$12, %esp	/* 12 bytes + 4-byte return address = 16; gcc does 16 byte alignment of stack frames? */
 	movl	16(%esp), %eax	/* first caller argument (was at 4(%esp) before the subl) */
 	cmpl	$16, %eax	/* unsigned compare against 16 */
 	jb	.84		/* "cmovge" is not supported on all CPUs */
 	movl	$16, %eax	/* any value >= 16 is replaced by 16 */
 .84:
 	leal	20(%esp), %ecx	/* calculate 'v': address of the second caller argument */
 	movl	%ecx, (%esp)	/* outgoing argument for the callee */
 	sall	$4, %eax	/* scale the index by 16 bytes */
 	call	*SUBST(0)(%eax)	/* 0x0 - tabfv[0][n] */
 	addl	$8, %esp	/* tear down frame (4 shaved off by the callee) */
 	ret	$20		/* return, popping 20 bytes: first argument + four 4-byte components */
 GLOBL( _tnl_x86_dispatch_vertexattribf4_end )
 GLOBL( _tnl_x86_dispatch_vertexattribfv )
 	/* STDCALL_API variant, pointer form: clamps the first argument to 16 and
 	 * forwards the 'v' pointer (second caller argument) to the selected
 	 * handler, then returns popping the caller's arguments itself.
 	 */
 	subl	$12, %esp	/* 12 bytes + 4-byte return address = 16; gcc does 16 byte alignment of stack frames? */
 	movl	16(%esp), %eax	/* first caller argument (was at 4(%esp) before the subl) */
 	cmpl	$16, %eax	/* unsigned compare against 16 */
 	jb	.9		/* "cmovge" is not supported on all CPUs */
 	movl	$16, %eax	/* any value >= 16 is replaced by 16 */
 .9:
 	movl	20(%esp), %ecx	/* load 'v' */
 	movl	%ecx, (%esp)	/* outgoing argument for the callee */
 	sall	$4, %eax	/* scale the index by 16 bytes */
 	call	*SUBST(0)(%eax)	/* 0x0 - tabfv[0][n] */
 	addl	$8, %esp	/* tear down frame (4 shaved off by the callee) */
 	ret	$8		/* return, popping 8 bytes: first argument + the 'v' pointer */
 GLOBL( _tnl_x86_dispatch_vertexattribfv_end )
 #endif /* defined (STDCALL_API) */
--- a/src/mesa/x86/assyntax.h
+++ b/src/mesa/x86/assyntax.h
@ -1300,11 +1300,11 @@ SECTION _DATA public align=16 class=DATA use32 flat
 #define REPZ			REPE
 #define RET			ret
 #define SAHF			sahf
+/* SAL/SAR: operands are emitted destination first, so `a' is the shift count.
+ * The count always carries the B_ prefix, whatever the size of the
+ * destination `b' (presumably because the assembler only accepts a
+ * byte-sized count -- confirm against the NASM manual).
+ */
-#define SAL_L(a, b)		sal L_(b), L_(a)
+#define SAL_L(a, b)		sal L_(b), B_(a)
-#define SAL_W(a, b)		sal W_(b), W_(a)
+#define SAL_W(a, b)		sal W_(b), B_(a)
 #define SAL_B(a, b)		sal B_(b), B_(a)
-#define SAR_L(a, b)		sar L_(b), L_(a)
+#define SAR_L(a, b)		sar L_(b), B_(a)
-#define SAR_W(a, b)		sar W_(b), W_(a)
+#define SAR_W(a, b)		sar W_(b), B_(a)
 #define SAR_B(a, b)		sar B_(b), B_(a)
 #define SBB_L(a, b)		sbb L_(b), L_(a)
 #define SBB_W(a, b)		sbb W_(b), W_(a)