radeonsi: overhaul the vertex fetch fixup mechanism

The overall goal is to support unaligned loads from vertex buffers
natively on SI.

In the unaligned case, we fall back to the general-case implementation in
ac_build_opencoded_load_format. Since this function is fully general, we
also use it going forward for all cases that require manual format
conversions of dwords anyway.
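
As a rough illustration (a standalone sketch, not the driver code; the enum
and helper names are made up), the decision of when to take the open-coded
path boils down to the condition added to si_llvm_load_input_vs further down
in this diff:

#include <stdbool.h>

/* Stand-ins for the AC_FETCH_FORMAT_* values used by the real code. */
enum fetch_format { FMT_FLOAT, FMT_UINT, FMT_SINT, FMT_OTHER };

/* Returns true when the general ac_build_opencoded_load_format path is
 * taken instead of a plain buffer_load_format_xyzw. */
static bool use_opencoded_load(bool opencode_requested, unsigned log_size,
                               enum fetch_format format)
{
	return opencode_requested ||
	       /* doubles (8-byte channels) always need manual conversion */
	       (log_size == 3 && format == FMT_FLOAT) ||
	       /* dword-sized data that needs a format conversion anyway */
	       log_size == 2;
}

int main(void)
{
	/* A dword-sized fixed-point attribute needs the open-coded path. */
	return use_opencoded_load(false, 2, FMT_OTHER) ? 0 : 1;
}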

This requires a different encoding of the fix_fetch array, which will now
contain the entire format information if a fixup is required.
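
For illustration, here is a standalone sketch of the new one-byte encoding;
the union mirrors si_vs_fix_fetch as introduced further down in this diff,
while the concrete SNORM value and the test program around it are assumptions
made purely for the example:

#include <stdint.h>
#include <stdio.h>

union vs_fix_fetch { /* mirrors union si_vs_fix_fetch */
	struct {
		uint8_t log_size : 2;        /* log2(bytes per channel): 1, 2, 4 or 8 bytes */
		uint8_t num_channels_m1 : 2; /* number of channels minus 1 */
		uint8_t format : 3;          /* AC_FETCH_FORMAT_xxx */
		uint8_t reverse : 1;         /* reverse XYZ channels */
	} u;
	uint8_t bits;
};

int main(void)
{
	/* Example: a 16_16_16 SNORM attribute = 2-byte channels, 3 channels.
	 * The value 4 for SNORM is assumed here for illustration only. */
	union vs_fix_fetch fix = { .u = { .log_size = 1, .num_channels_m1 = 2, .format = 4 } };
	printf("fix_fetch byte = 0x%02x\n", fix.bits);

	/* All-zero bits describe an impossible format (1-byte float, 1 channel)
	 * and mean "no fixup needed": buffer_load_format_xyzw is used as-is. */
	union vs_fix_fetch none = { .bits = 0 };
	printf("needs fixup: %s\n", none.bits ? "yes" : "no");
	return 0;
}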

Having to check the alignment of vertex buffers is awkward. To keep the
impact on the fast path minimal, the si_context will keep track of which
vertex buffers are (not) at least dword-aligned, while the
si_vertex_elements will note which vertex buffers have some (at most dword)
alignment requirement. Vertex buffers should be dword-aligned most of the
time, which allows a fast early-out in almost all cases.
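
A minimal sketch of that bookkeeping, with simplified types and only the two
mask names taken from this diff, might look as follows:

#include <stdbool.h>
#include <stdint.h>

struct ctx_state {
	uint16_t vertex_buffer_unaligned; /* per slot: buffer not dword-aligned */
	uint16_t vb_alignment_check_mask; /* per vertex-element state: slots that matter */
};

/* Called when a vertex buffer is (re)bound at the given slot. */
static void update_alignment_mask(struct ctx_state *s, unsigned slot,
                                  uint32_t buffer_offset, uint32_t stride)
{
	if ((buffer_offset & 3) || (stride & 3))
		s->vertex_buffer_unaligned |= 1u << slot;
	else
		s->vertex_buffer_unaligned &= ~(1u << slot);
}

/* Fast early-out: only if some buffer that the bound vertex elements care
 * about is not dword-aligned do we need per-element checks (and possibly a
 * different shader variant). */
static bool needs_alignment_fixups(const struct ctx_state *s)
{
	return (s->vertex_buffer_unaligned & s->vb_alignment_check_mask) != 0;
}

int main(void)
{
	struct ctx_state s = { 0, 0x1 }; /* the element state cares about slot 0 */
	update_alignment_mask(&s, 0, /*buffer_offset=*/2, /*stride=*/16);
	return needs_alignment_fixups(&s) ? 0 : 1;
}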

Add the radeonsi_vs_fetch_always_opencode configuration variable for
testing purposes. Note that it can only be used reliably with LLVM >= 9,
because support for byte and short loads is required.

v2:
- add a missing check to si_bind_vertex_elements

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Nicolai Hähnle 2019-04-01 15:44:39 +02:00
parent 8a951c3d2f
commit d814c21b1b
8 changed files with 301 additions and 280 deletions

@@ -2,5 +2,6 @@ OPT_BOOL(clear_db_cache_before_clear, false, "Clear DB cache before fast depth c
OPT_BOOL(enable_nir, false, "Enable NIR")
OPT_BOOL(aux_debug, false, "Generate ddebug_dumps for the auxiliary context")
OPT_BOOL(sync_compile, false, "Always compile synchronously (will cause stalls)")
OPT_BOOL(vs_fetch_always_opencode, false, "Always open code vertex fetches (less efficient, purely for testing)")
#undef OPT_BOOL

@@ -197,7 +197,7 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
return !sscreen->info.has_unaligned_shader_loads;
return HAVE_LLVM < 0x0900 && !sscreen->info.has_unaligned_shader_loads;
case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
return sscreen->info.has_sparse_vm_mappings ?

@@ -939,6 +939,7 @@ struct si_context {
bool vertex_buffers_dirty;
bool vertex_buffer_pointer_dirty;
struct pipe_vertex_buffer vertex_buffer[SI_NUM_VERTEX_BUFFERS];
uint16_t vertex_buffer_unaligned; /* bitmask of not dword-aligned buffers */
/* MSAA config state. */
int ps_iter_samples;

@@ -430,21 +430,6 @@ static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
}
}
/* Bitcast <4 x float> to <2 x double>, extract the component, and convert
* to float. */
static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
LLVMValueRef vec4,
unsigned double_index)
{
LLVMBuilderRef builder = ctx->ac.builder;
LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->ac.context);
LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
LLVMVectorType(f64, 2), "");
LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
}
static LLVMValueRef unpack_sint16(struct si_shader_context *ctx,
LLVMValueRef i32, unsigned index)
{
@@ -536,17 +521,12 @@ void si_llvm_load_input_vs(
return;
}
unsigned chan;
unsigned fix_fetch;
unsigned num_fetches;
unsigned fetch_stride;
unsigned num_channels;
union si_vs_fix_fetch fix_fetch;
LLVMValueRef t_list_ptr;
LLVMValueRef t_offset;
LLVMValueRef t_list;
LLVMValueRef vertex_index;
LLVMValueRef input[3];
LLVMValueRef tmp;
/* Load the T list */
t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);
@@ -559,74 +539,84 @@ void si_llvm_load_input_vs(
ctx->param_vertex_index0 +
input_index);
fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];
/* Use the open-coded implementation for all loads of doubles and
* of dword-sized data that needs fixups. We need to insert conversion
* code anyway, and the amd/common code does it for us.
*
* Note: On LLVM <= 8, we can only open-code formats with
* channel size >= 4 bytes.
*/
bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index);
fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits;
if (opencode ||
(fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
(fix_fetch.u.log_size == 2)) {
tmp = ac_build_opencoded_load_format(
&ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1,
fix_fetch.u.format, fix_fetch.u.reverse, !opencode,
t_list, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0,
false, false, true);
for (unsigned i = 0; i < 4; ++i)
out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->i32, i, false), "");
return;
}
/* Do multiple loads for special formats. */
switch (fix_fetch) {
case SI_FIX_FETCH_RG_64_FLOAT:
num_fetches = 1; /* 1 2-dword or 4-dword load */
fetch_stride = 0;
if (util_last_bit(info->input_usage_mask[input_index]) >= 2)
num_channels = 4; /* 2 doubles in 4 dwords */
else
num_channels = 2; /* 1 double in 2 dwords */
break;
case SI_FIX_FETCH_RGB_64_FLOAT:
num_fetches = 3; /* 3 2-dword loads */
fetch_stride = 8;
num_channels = 2;
break;
case SI_FIX_FETCH_RGBA_64_FLOAT:
num_fetches = 2; /* 2 4-dword loads */
fetch_stride = 16;
num_channels = 4;
break;
case SI_FIX_FETCH_RGB_8:
case SI_FIX_FETCH_RGB_8_INT:
num_fetches = 3;
fetch_stride = 1;
num_channels = 1;
break;
case SI_FIX_FETCH_RGB_16:
case SI_FIX_FETCH_RGB_16_INT:
num_fetches = 3;
fetch_stride = 2;
num_channels = 1;
break;
default:
unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]);
LLVMValueRef fetches[4];
unsigned num_fetches;
unsigned fetch_stride;
unsigned channels_per_fetch;
if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
num_fetches = MIN2(required_channels, 3);
fetch_stride = 1 << fix_fetch.u.log_size;
channels_per_fetch = 1;
} else {
num_fetches = 1;
fetch_stride = 0;
num_channels = util_last_bit(info->input_usage_mask[input_index]);
channels_per_fetch = required_channels;
}
for (unsigned i = 0; i < num_fetches; i++) {
for (unsigned i = 0; i < num_fetches; ++i) {
LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
vertex_index, voffset,
num_channels, false, true);
input[i] = ac_build_expand_to_vec4(&ctx->ac, input[i], num_channels);
fetches[i] = ac_build_buffer_load_format(&ctx->ac, t_list, vertex_index, voffset,
channels_per_fetch, false, true);
}
/* Break up the vec4 into individual components */
for (chan = 0; chan < 4; chan++) {
LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
out[chan] = LLVMBuildExtractElement(ctx->ac.builder,
input[0], llvm_chan, "");
if (num_fetches == 1 && channels_per_fetch > 1) {
LLVMValueRef fetch = fetches[0];
for (unsigned i = 0; i < channels_per_fetch; ++i) {
tmp = LLVMConstInt(ctx->i32, i, false);
fetches[i] = LLVMBuildExtractElement(
ctx->ac.builder, fetch, tmp, "");
}
num_fetches = channels_per_fetch;
channels_per_fetch = 1;
}
switch (fix_fetch) {
case SI_FIX_FETCH_A2_SNORM:
case SI_FIX_FETCH_A2_SSCALED:
case SI_FIX_FETCH_A2_SINT: {
/* The hardware returns an unsigned value; convert it to a
* signed one.
for (unsigned i = num_fetches; i < 4; ++i)
fetches[i] = LLVMGetUndef(ctx->f32);
if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 &&
required_channels == 4) {
if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
fetches[3] = ctx->ac.i32_1;
else
fetches[3] = ctx->ac.f32_1;
} else if (fix_fetch.u.log_size == 3 &&
(fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
required_channels == 4) {
/* For 2_10_10_10, the hardware returns an unsigned value;
* convert it to a signed one.
*/
LLVMValueRef tmp = out[3];
LLVMValueRef tmp = fetches[3];
LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);
/* First, recover the sign-extended signed integer value. */
if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->i32, "");
else
tmp = ac_to_integer(&ctx->ac, tmp);
@@ -638,110 +628,26 @@ void si_llvm_load_input_vs(
* exponent.
*/
tmp = LLVMBuildShl(ctx->ac.builder, tmp,
fix_fetch == SI_FIX_FETCH_A2_SNORM ?
fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ?
LLVMConstInt(ctx->i32, 7, 0) : c30, "");
tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");
/* Convert back to the right type. */
if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
LLVMValueRef clamp;
LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
} else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
} else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
}
out[3] = tmp;
break;
fetches[3] = tmp;
}
case SI_FIX_FETCH_RGBA_32_UNORM:
case SI_FIX_FETCH_RGBX_32_UNORM:
for (chan = 0; chan < 4; chan++) {
out[chan] = ac_to_integer(&ctx->ac, out[chan]);
out[chan] = LLVMBuildUIToFP(ctx->ac.builder,
out[chan], ctx->f32, "");
out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan],
LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
}
/* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
out[3] = LLVMConstReal(ctx->f32, 1);
break;
case SI_FIX_FETCH_RGBA_32_SNORM:
case SI_FIX_FETCH_RGBX_32_SNORM:
case SI_FIX_FETCH_RGBA_32_FIXED:
case SI_FIX_FETCH_RGBX_32_FIXED: {
double scale;
if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
scale = 1.0 / 0x10000;
else
scale = 1.0 / INT_MAX;
for (chan = 0; chan < 4; chan++) {
out[chan] = ac_to_integer(&ctx->ac, out[chan]);
out[chan] = LLVMBuildSIToFP(ctx->ac.builder,
out[chan], ctx->f32, "");
out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan],
LLVMConstReal(ctx->f32, scale), "");
}
/* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
out[3] = LLVMConstReal(ctx->f32, 1);
break;
}
case SI_FIX_FETCH_RGBA_32_USCALED:
for (chan = 0; chan < 4; chan++) {
out[chan] = ac_to_integer(&ctx->ac, out[chan]);
out[chan] = LLVMBuildUIToFP(ctx->ac.builder,
out[chan], ctx->f32, "");
}
break;
case SI_FIX_FETCH_RGBA_32_SSCALED:
for (chan = 0; chan < 4; chan++) {
out[chan] = ac_to_integer(&ctx->ac, out[chan]);
out[chan] = LLVMBuildSIToFP(ctx->ac.builder,
out[chan], ctx->f32, "");
}
break;
case SI_FIX_FETCH_RG_64_FLOAT:
for (chan = 0; chan < 2; chan++)
out[chan] = extract_double_to_float(ctx, input[0], chan);
out[2] = LLVMConstReal(ctx->f32, 0);
out[3] = LLVMConstReal(ctx->f32, 1);
break;
case SI_FIX_FETCH_RGB_64_FLOAT:
for (chan = 0; chan < 3; chan++)
out[chan] = extract_double_to_float(ctx, input[chan], 0);
out[3] = LLVMConstReal(ctx->f32, 1);
break;
case SI_FIX_FETCH_RGBA_64_FLOAT:
for (chan = 0; chan < 4; chan++) {
out[chan] = extract_double_to_float(ctx, input[chan / 2],
chan % 2);
}
break;
case SI_FIX_FETCH_RGB_8:
case SI_FIX_FETCH_RGB_8_INT:
case SI_FIX_FETCH_RGB_16:
case SI_FIX_FETCH_RGB_16_INT:
for (chan = 0; chan < 3; chan++) {
out[chan] = LLVMBuildExtractElement(ctx->ac.builder,
input[chan],
ctx->i32_0, "");
}
if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
fix_fetch == SI_FIX_FETCH_RGB_16) {
out[3] = LLVMConstReal(ctx->f32, 1);
} else {
out[3] = ac_to_float(&ctx->ac, ctx->i32_1);
}
break;
}
for (unsigned i = 0; i < 4; ++i)
out[i] = ac_to_float(&ctx->ac, fetches[i]);
}
static void declare_input_vs(
@@ -5777,9 +5683,18 @@ static void si_dump_shader_key_vs(const struct si_shader_key *key,
fprintf(f, " %s.ls_vgpr_fix = %u\n",
prefix, prolog->ls_vgpr_fix);
fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode);
fprintf(f, " mono.vs.fix_fetch = {");
for (int i = 0; i < SI_MAX_ATTRIBS; i++)
fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i];
if (i)
fprintf(f, ", ");
if (!fix.bits)
fprintf(f, "0");
else
fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size,
fix.u.num_channels_m1, fix.u.format);
}
fprintf(f, "}\n");
}

@@ -273,27 +273,24 @@ enum {
SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,
};
/* For VS shader key fix_fetch. */
enum {
SI_FIX_FETCH_NONE = 0,
SI_FIX_FETCH_A2_SNORM,
SI_FIX_FETCH_A2_SSCALED,
SI_FIX_FETCH_A2_SINT,
SI_FIX_FETCH_RGBA_32_UNORM,
SI_FIX_FETCH_RGBX_32_UNORM,
SI_FIX_FETCH_RGBA_32_SNORM,
SI_FIX_FETCH_RGBX_32_SNORM,
SI_FIX_FETCH_RGBA_32_USCALED,
SI_FIX_FETCH_RGBA_32_SSCALED,
SI_FIX_FETCH_RGBA_32_FIXED,
SI_FIX_FETCH_RGBX_32_FIXED,
SI_FIX_FETCH_RG_64_FLOAT,
SI_FIX_FETCH_RGB_64_FLOAT,
SI_FIX_FETCH_RGBA_64_FLOAT,
SI_FIX_FETCH_RGB_8, /* A = 1.0 */
SI_FIX_FETCH_RGB_8_INT, /* A = 1 */
SI_FIX_FETCH_RGB_16,
SI_FIX_FETCH_RGB_16_INT,
/**
* For VS shader keys, describe any fixups required for vertex fetch.
*
* \ref log_size, \ref format, and the number of channels are interpreted as
* in \ref ac_build_opencoded_load_format.
*
* Note: all bits 0 (size = 1 byte, num channels = 1, format = float) is an
* impossible format and indicates that no fixup is needed (just use
* buffer_load_format_xyzw).
*/
union si_vs_fix_fetch {
struct {
uint8_t log_size : 2; /* 1, 2, 4, or 8 bytes per channel */
uint8_t num_channels_m1 : 2; /* number of channels minus 1 */
uint8_t format : 3; /* AC_FETCH_FORMAT_xxx */
uint8_t reverse : 1; /* reverse XYZ channels */
} u;
uint8_t bits;
};
struct si_shader;
@@ -524,8 +521,11 @@ struct si_shader_key {
/* Flags for monolithic compilation only. */
struct {
/* One byte for every input: SI_FIX_FETCH_* enums. */
uint8_t vs_fix_fetch[SI_MAX_ATTRIBS];
/* Whether fetch should be opencoded according to vs_fix_fetch.
* Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw
* with minimal fixups is used. */
uint16_t vs_fetch_opencode;
union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS];
union {
uint64_t ff_tcs_inputs_to_copy; /* for fixed-func TCS */

@@ -4459,10 +4459,8 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
for (i = 0; i < count; ++i) {
const struct util_format_description *desc;
const struct util_format_channel_description *channel;
unsigned data_format, num_format;
int first_non_void;
unsigned vbo_index = elements[i].vertex_buffer_index;
unsigned char swizzle[4];
if (vbo_index >= SI_NUM_VERTEX_BUFFERS) {
FREE(v);
@@ -4489,105 +4487,137 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
desc = util_format_description(elements[i].src_format);
first_non_void = util_format_get_first_non_void_channel(elements[i].src_format);
data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void);
channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL;
memcpy(swizzle, desc->swizzle, sizeof(swizzle));
v->format_size[i] = desc->block.bits / 8;
v->src_offset[i] = elements[i].src_offset;
v->vertex_buffer_index[i] = vbo_index;
/* The hardware always treats the 2-bit alpha channel as
* unsigned, so a shader workaround is needed. The affected
* chips are VI and older except Stoney (GFX8.1).
*/
if (data_format == V_008F0C_BUF_DATA_FORMAT_2_10_10_10 &&
sscreen->info.chip_class <= VI &&
sscreen->info.family != CHIP_STONEY) {
if (num_format == V_008F0C_BUF_NUM_FORMAT_SNORM) {
v->fix_fetch[i] = SI_FIX_FETCH_A2_SNORM;
} else if (num_format == V_008F0C_BUF_NUM_FORMAT_SSCALED) {
v->fix_fetch[i] = SI_FIX_FETCH_A2_SSCALED;
} else if (num_format == V_008F0C_BUF_NUM_FORMAT_SINT) {
/* This isn't actually used in OpenGL. */
v->fix_fetch[i] = SI_FIX_FETCH_A2_SINT;
}
} else if (channel && channel->type == UTIL_FORMAT_TYPE_FIXED) {
if (desc->swizzle[3] == PIPE_SWIZZLE_1)
v->fix_fetch[i] = SI_FIX_FETCH_RGBX_32_FIXED;
else
v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_FIXED;
} else if (channel && channel->size == 32 && !channel->pure_integer) {
if (channel->type == UTIL_FORMAT_TYPE_SIGNED) {
if (channel->normalized) {
if (desc->swizzle[3] == PIPE_SWIZZLE_1)
v->fix_fetch[i] = SI_FIX_FETCH_RGBX_32_SNORM;
else
v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_SNORM;
} else {
v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_SSCALED;
}
} else if (channel->type == UTIL_FORMAT_TYPE_UNSIGNED) {
if (channel->normalized) {
if (desc->swizzle[3] == PIPE_SWIZZLE_1)
v->fix_fetch[i] = SI_FIX_FETCH_RGBX_32_UNORM;
else
v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_UNORM;
} else {
v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_USCALED;
}
}
} else if (channel && channel->size == 64 &&
channel->type == UTIL_FORMAT_TYPE_FLOAT) {
switch (desc->nr_channels) {
case 1:
case 2:
v->fix_fetch[i] = SI_FIX_FETCH_RG_64_FLOAT;
swizzle[0] = PIPE_SWIZZLE_X;
swizzle[1] = PIPE_SWIZZLE_Y;
swizzle[2] = desc->nr_channels == 2 ? PIPE_SWIZZLE_Z : PIPE_SWIZZLE_0;
swizzle[3] = desc->nr_channels == 2 ? PIPE_SWIZZLE_W : PIPE_SWIZZLE_0;
break;
case 3:
v->fix_fetch[i] = SI_FIX_FETCH_RGB_64_FLOAT;
swizzle[0] = PIPE_SWIZZLE_X; /* 3 loads */
swizzle[1] = PIPE_SWIZZLE_Y;
swizzle[2] = PIPE_SWIZZLE_0;
swizzle[3] = PIPE_SWIZZLE_0;
break;
case 4:
v->fix_fetch[i] = SI_FIX_FETCH_RGBA_64_FLOAT;
swizzle[0] = PIPE_SWIZZLE_X; /* 2 loads */
swizzle[1] = PIPE_SWIZZLE_Y;
swizzle[2] = PIPE_SWIZZLE_Z;
swizzle[3] = PIPE_SWIZZLE_W;
break;
default:
assert(0);
}
} else if (channel && desc->nr_channels == 3) {
assert(desc->swizzle[0] == PIPE_SWIZZLE_X);
bool always_fix = false;
union si_vs_fix_fetch fix_fetch;
unsigned log_hw_load_size; /* the load element size as seen by the hardware */
if (channel->size == 8) {
fix_fetch.bits = 0;
log_hw_load_size = MIN2(2, util_logbase2(desc->block.bits) - 3);
if (channel) {
switch (channel->type) {
case UTIL_FORMAT_TYPE_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break;
case UTIL_FORMAT_TYPE_FIXED: fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; break;
case UTIL_FORMAT_TYPE_SIGNED: {
if (channel->pure_integer)
v->fix_fetch[i] = SI_FIX_FETCH_RGB_8_INT;
fix_fetch.u.format = AC_FETCH_FORMAT_SINT;
else if (channel->normalized)
fix_fetch.u.format = AC_FETCH_FORMAT_SNORM;
else
v->fix_fetch[i] = SI_FIX_FETCH_RGB_8;
} else if (channel->size == 16) {
fix_fetch.u.format = AC_FETCH_FORMAT_SSCALED;
break;
}
case UTIL_FORMAT_TYPE_UNSIGNED: {
if (channel->pure_integer)
v->fix_fetch[i] = SI_FIX_FETCH_RGB_16_INT;
fix_fetch.u.format = AC_FETCH_FORMAT_UINT;
else if (channel->normalized)
fix_fetch.u.format = AC_FETCH_FORMAT_UNORM;
else
v->fix_fetch[i] = SI_FIX_FETCH_RGB_16;
fix_fetch.u.format = AC_FETCH_FORMAT_USCALED;
break;
}
default: unreachable("bad format type");
}
} else {
switch (elements[i].src_format) {
case PIPE_FORMAT_R11G11B10_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break;
default: unreachable("bad other format");
}
}
v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
S_008F0C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
S_008F0C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
S_008F0C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
S_008F0C_NUM_FORMAT(num_format) |
S_008F0C_DATA_FORMAT(data_format);
if (desc->channel[0].size == 10) {
fix_fetch.u.log_size = 3; /* special encoding for 2_10_10_10 */
log_hw_load_size = 2;
/* The hardware always treats the 2-bit alpha channel as
* unsigned, so a shader workaround is needed. The affected
* chips are VI and older except Stoney (GFX8.1).
*/
always_fix = sscreen->info.chip_class <= VI &&
sscreen->info.family != CHIP_STONEY &&
channel->type == UTIL_FORMAT_TYPE_SIGNED;
} else if (elements[i].src_format == PIPE_FORMAT_R11G11B10_FLOAT) {
fix_fetch.u.log_size = 3; /* special encoding */
fix_fetch.u.format = AC_FETCH_FORMAT_FIXED;
log_hw_load_size = 2;
} else {
fix_fetch.u.log_size = util_logbase2(channel->size) - 3;
fix_fetch.u.num_channels_m1 = desc->nr_channels - 1;
/* Always fix up:
* - doubles (multiple loads + truncate to float)
* - 32-bit requiring a conversion
*/
always_fix =
(fix_fetch.u.log_size == 3) ||
(fix_fetch.u.log_size == 2 &&
fix_fetch.u.format != AC_FETCH_FORMAT_FLOAT &&
fix_fetch.u.format != AC_FETCH_FORMAT_UINT &&
fix_fetch.u.format != AC_FETCH_FORMAT_SINT);
/* Also fixup 8_8_8 and 16_16_16. */
if (desc->nr_channels == 3 && fix_fetch.u.log_size <= 1) {
always_fix = true;
log_hw_load_size = fix_fetch.u.log_size;
}
}
if (desc->swizzle[0] != PIPE_SWIZZLE_X) {
assert(desc->swizzle[0] == PIPE_SWIZZLE_Z &&
(desc->swizzle[2] == PIPE_SWIZZLE_X || desc->swizzle[2] == PIPE_SWIZZLE_0));
fix_fetch.u.reverse = 1;
}
/* Force the workaround for unaligned access here already if the
* offset relative to the vertex buffer base is unaligned.
*
* There is a theoretical case in which this is too conservative:
* if the vertex buffer's offset is also unaligned in just the
* right way, we end up with an aligned address after all.
* However, this case should be extremely rare in practice (it
* won't happen in well-behaved applications), and taking it
* into account would complicate the fast path (where everything
* is nicely aligned).
*/
bool check_alignment = log_hw_load_size >= 1 && sscreen->info.chip_class == SI;
bool opencode = sscreen->options.vs_fetch_always_opencode;
if (check_alignment &&
(elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0)
opencode = true;
if (always_fix || check_alignment || opencode)
v->fix_fetch[i] = fix_fetch.bits;
if (opencode)
v->fix_fetch_opencode |= 1 << i;
if (opencode || always_fix)
v->fix_fetch_always |= 1 << i;
if (check_alignment && !opencode) {
assert(log_hw_load_size == 1 || log_hw_load_size == 2);
v->fix_fetch_unaligned |= 1 << i;
v->hw_load_is_dword |= (log_hw_load_size - 1) << i;
v->vb_alignment_check_mask |= 1 << vbo_index;
}
v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3]));
unsigned data_format, num_format;
data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void);
v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) |
S_008F0C_DATA_FORMAT(data_format);
}
if (v->instance_divisor_is_fetched) {
@@ -4621,7 +4651,17 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
(!old ||
old->count != v->count ||
old->uses_instance_divisors != v->uses_instance_divisors ||
v->uses_instance_divisors || /* we don't check which divisors changed */
/* we don't check which divisors changed */
v->uses_instance_divisors ||
(old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) & sctx->vertex_buffer_unaligned ||
((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) &&
memcmp(old->vertex_buffer_index, v->vertex_buffer_index,
sizeof(v->vertex_buffer_index[0]) * v->count)) ||
/* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are
* functions of fix_fetch and the src_offset alignment.
* If they change and fix_fetch doesn't, it must be due to different
* src_offset alignment, which is reflected in fix_fetch_opencode. */
old->fix_fetch_opencode != v->fix_fetch_opencode ||
memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count)))
sctx->do_update_shaders = true;
@@ -4653,6 +4693,8 @@ static void si_set_vertex_buffers(struct pipe_context *ctx,
{
struct si_context *sctx = (struct si_context *)ctx;
struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot;
uint32_t orig_unaligned = sctx->vertex_buffer_unaligned;
uint32_t unaligned = orig_unaligned;
int i;
assert(start_slot + count <= ARRAY_SIZE(sctx->vertex_buffer));
@@ -4666,6 +4708,11 @@ static void si_set_vertex_buffers(struct pipe_context *ctx,
pipe_resource_reference(&dsti->buffer.resource, buf);
dsti->buffer_offset = src->buffer_offset;
dsti->stride = src->stride;
if (dsti->buffer_offset & 3 || dsti->stride & 3)
unaligned |= 1 << (start_slot + i);
else
unaligned &= ~(1 << (start_slot + i));
si_context_add_resource_size(sctx, buf);
if (buf)
si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER;
@@ -4674,8 +4721,22 @@ static void si_set_vertex_buffers(struct pipe_context *ctx,
for (i = 0; i < count; i++) {
pipe_resource_reference(&dst[i].buffer.resource, NULL);
}
unaligned &= ~u_bit_consecutive(start_slot, count);
}
sctx->vertex_buffers_dirty = true;
sctx->vertex_buffer_unaligned = unaligned;
/* Check whether alignment may have changed in a way that requires
* shader changes. This check is conservative: a vertex buffer can only
* trigger a shader change if the misalignment amount changes (e.g.
* from byte-aligned to short-aligned), but we only keep track of
* whether buffers are at least dword-aligned, since that should always
* be the case in well-behaved applications anyway.
*/
if (sctx->vertex_elements &&
(sctx->vertex_elements->vb_alignment_check_mask &
(unaligned | orig_unaligned) & u_bit_consecutive(start_slot, count)))
sctx->do_update_shaders = true;
}
/*

@@ -139,6 +139,25 @@ struct si_vertex_elements
uint8_t format_size[SI_MAX_ATTRIBS];
uint8_t vertex_buffer_index[SI_MAX_ATTRIBS];
/* Bitmask of elements that always need a fixup to be applied. */
uint16_t fix_fetch_always;
/* Bitmask of elements whose fetch should always be opencoded. */
uint16_t fix_fetch_opencode;
/* Bitmask of elements which need to be opencoded if the vertex buffer
* is unaligned. */
uint16_t fix_fetch_unaligned;
/* For elements in fix_fetch_unaligned: whether the effective
* element load size as seen by the hardware is a dword (as opposed
* to a short).
*/
uint16_t hw_load_is_dword;
/* Bitmask of vertex buffers requiring alignment check */
uint16_t vb_alignment_check_mask;
uint8_t count;
bool uses_instance_divisors;

@@ -1390,7 +1390,31 @@ static void si_shader_selector_key_vs(struct si_context *sctx,
key->opt.prefer_mono = 1;
unsigned count = MIN2(vs->info.num_inputs, elts->count);
memcpy(key->mono.vs_fix_fetch, elts->fix_fetch, count);
unsigned count_mask = (1 << count) - 1;
unsigned fix = elts->fix_fetch_always & count_mask;
unsigned opencode = elts->fix_fetch_opencode & count_mask;
if (sctx->vertex_buffer_unaligned & elts->vb_alignment_check_mask) {
uint32_t mask = elts->fix_fetch_unaligned & count_mask;
while (mask) {
unsigned i = u_bit_scan(&mask);
unsigned log_hw_load_size = 1 + ((elts->hw_load_is_dword >> i) & 1);
unsigned vbidx = elts->vertex_buffer_index[i];
struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbidx];
unsigned align_mask = (1 << log_hw_load_size) - 1;
if (vb->buffer_offset & align_mask ||
vb->stride & align_mask) {
fix |= 1 << i;
opencode |= 1 << i;
}
}
}
while (fix) {
unsigned i = u_bit_scan(&fix);
key->mono.vs_fix_fetch[i].bits = elts->fix_fetch[i];
}
key->mono.vs_fetch_opencode = opencode;
}
static void si_shader_selector_key_hw_vs(struct si_context *sctx,