diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index 3a1b0579648..1c265603663 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -742,6 +742,7 @@ generate_fetch(struct gallivm_state *gallivm,
       val = lp_build_fetch_rgba_aos(gallivm,
                                     format_desc,
                                     lp_float32_vec4_type(),
+                                    FALSE,
                                     map_ptr,
                                     zero, zero, zero);
       LLVMBuildStore(builder, val, temp_ptr);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h
index 1177fb224dd..969f1f6cc94 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -62,6 +62,7 @@ LLVMValueRef
 lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
                         const struct util_format_description *format_desc,
                         struct lp_type type,
+                        boolean aligned,
                         LLVMValueRef base_ptr,
                         LLVMValueRef offset,
                         LLVMValueRef i,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index af755d460c1..3c25c329edd 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -356,6 +356,7 @@ lp_build_pack_rgba_aos(struct gallivm_state *gallivm,
  * Fetch a pixel into a 4 float AoS.
  *
  * \param format_desc  describes format of the image we're fetching from
+ * \param aligned  whether the data is guaranteed to be aligned
  * \param ptr  address of the pixel block (or the texel if uncompressed)
  * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
  *              these will always be (0, 0).
@@ -365,6 +366,7 @@ LLVMValueRef
 lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
                         const struct util_format_description *format_desc,
                         struct lp_type type,
+                        boolean aligned,
                         LLVMValueRef base_ptr,
                         LLVMValueRef offset,
                         LLVMValueRef i,
@@ -400,7 +402,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
 
       packed = lp_build_gather(gallivm, type.length/4,
                                format_desc->block.bits, type.width*4,
-                               base_ptr, offset, TRUE);
+                               aligned, base_ptr, offset, TRUE);
 
       assert(format_desc->block.bits <= vec_len);
 
@@ -437,7 +439,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
          LLVMValueRef packed;
 
          packed = lp_build_gather_elem(gallivm, num_pixels,
-                                       format_desc->block.bits, 32,
+                                       format_desc->block.bits, 32, aligned,
                                        base_ptr, offset, k, FALSE);
 
          tmps[k] = lp_build_unpack_arith_rgba_aos(gallivm,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index ff2887ee8fa..afaabc08790 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -386,6 +386,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                                type.length,
                                format_desc->block.bits,
                                type.width,
+                               TRUE,
                                base_ptr, offset, FALSE);
 
       /*
@@ -411,8 +412,8 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
 
       packed = lp_build_gather(gallivm, type.length,
                                format_desc->block.bits,
-                               type.width, base_ptr, offset,
-                               FALSE);
+                               type.width, TRUE,
+                               base_ptr, offset, FALSE);
       if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
          lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
       }
@@ -438,15 +439,15 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
          unsigned mask = (1 << 8) - 1;
          LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
          offset = LLVMBuildAdd(builder, offset, s_offset, "");
-         packed = lp_build_gather(gallivm, type.length,
-                                  32, type.width, base_ptr, offset, FALSE);
+         packed = lp_build_gather(gallivm, type.length, 32, type.width,
+                                  TRUE, base_ptr, offset, FALSE);
          packed = LLVMBuildAnd(builder, packed,
                                lp_build_const_int_vec(gallivm, type, mask), "");
       }
       else {
          assert (format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
-         packed = lp_build_gather(gallivm, type.length,
-                                  32, type.width, base_ptr, offset, TRUE);
+         packed = lp_build_gather(gallivm, type.length, 32, type.width,
+                                  TRUE, base_ptr, offset, TRUE);
          packed = LLVMBuildBitCast(builder, packed,
                                    lp_build_vec_type(gallivm, type), "");
       }
@@ -472,7 +473,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
       tmp_type.norm = TRUE;
 
       tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
-                                    base_ptr, offset, i, j);
+                                    TRUE, base_ptr, offset, i, j);
 
       lp_build_rgba8_to_fi32_soa(gallivm,
                                 type,
@@ -522,7 +523,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
 
          /* Get a single float[4]={R,G,B,A} pixel */
          tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
-                                       base_ptr, offset_elem,
+                                       TRUE, base_ptr, offset_elem,
                                        i_elem, j_elem);
 
          /*
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
index 873f354c040..4f5a45c6a3d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
@@ -497,7 +497,7 @@ lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm,
    assert(format_desc->block.width == 2);
    assert(format_desc->block.height == 1);
 
-   packed = lp_build_gather(gallivm, n, 32, 32, base_ptr, offset, FALSE);
+   packed = lp_build_gather(gallivm, n, 32, 32, TRUE, base_ptr, offset, FALSE);
 
    (void)j;
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
index 9155d811c06..d02602041ce 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_gather.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
@@ -76,6 +76,7 @@ lp_build_gather_elem(struct gallivm_state *gallivm,
                      unsigned length,
                      unsigned src_width,
                      unsigned dst_width,
+                     boolean aligned,
                      LLVMValueRef base_ptr,
                      LLVMValueRef offsets,
                      unsigned i,
@@ -93,6 +94,27 @@ lp_build_gather_elem(struct gallivm_state *gallivm,
    ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");
    res = LLVMBuildLoad(gallivm->builder, ptr, "");
 
+   /* XXX
+    * On some archs we probably really want to avoid having to deal
+    * with alignments lower than 4 bytes (if fetch size is a power of
+    * two >= 32). On x86 it doesn't matter, however.
+    * We should be able to guarantee full alignment for any kind of texture
+    * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
+    * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
+    * but I don't think that's quite what we wanted).
+    * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
+    * looks like a good fit, but it seems this cap bit (and OpenGL) aren't
+    * enforcing what we want (which is what d3d10 does, the offset needs to
+    * be aligned to element size, but GL has bytes regardless of element
+    * size which would only leave us with minimum alignment restriction of 16
+    * which doesn't make much sense if the type isn't 4x32bit). Due to
+    * translation of offsets to first_elem in sampler_views it actually seems
+    * gallium could not do anything else except 16 no matter what...
+    */
+  if (!aligned) {
+      lp_set_load_alignment(res, 1);
+   }
+
    assert(src_width <= dst_width);
    if (src_width > dst_width) {
       res = LLVMBuildTrunc(gallivm->builder, res, dst_elem_type, "");
@@ -126,6 +148,7 @@ lp_build_gather_elem(struct gallivm_state *gallivm,
  * @param length length of the offsets
  * @param src_width src element width in bits
  * @param dst_width result element width in bits (src will be expanded to fit)
+ * @param aligned whether the data is guaranteed to be aligned (to src_width)
  * @param base_ptr base pointer, should be a i8 pointer type.
  * @param offsets vector with offsets
  * @param vector_justify select vector rather than integer justification
@@ -135,6 +158,7 @@ lp_build_gather(struct gallivm_state *gallivm,
                 unsigned length,
                 unsigned src_width,
                 unsigned dst_width,
+                boolean aligned,
                 LLVMValueRef base_ptr,
                 LLVMValueRef offsets,
                 boolean vector_justify)
@@ -144,7 +168,7 @@ lp_build_gather(struct gallivm_state *gallivm,
    if (length == 1) {
       /* Scalar */
       return lp_build_gather_elem(gallivm, length,
-                                  src_width, dst_width,
+                                  src_width, dst_width, aligned,
                                   base_ptr, offsets, 0, vector_justify);
    } else {
       /* Vector */
@@ -158,7 +182,7 @@ lp_build_gather(struct gallivm_state *gallivm,
          LLVMValueRef index = lp_build_const_int32(gallivm, i);
          LLVMValueRef elem;
          elem = lp_build_gather_elem(gallivm, length,
-                                     src_width, dst_width,
+                                     src_width, dst_width, aligned,
                                      base_ptr, offsets, i, vector_justify);
          res = LLVMBuildInsertElement(gallivm->builder, res, elem, index, "");
       }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.h b/src/gallium/auxiliary/gallivm/lp_bld_gather.h
index ee694732d36..3ede4763a70 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_gather.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.h
@@ -45,6 +45,7 @@ lp_build_gather_elem(struct gallivm_state *gallivm,
                      unsigned length,
                      unsigned src_width,
                      unsigned dst_width,
+                     boolean aligned,
                      LLVMValueRef base_ptr,
                      LLVMValueRef offsets,
                      unsigned i,
@@ -55,6 +56,7 @@ lp_build_gather(struct gallivm_state *gallivm,
                 unsigned length,
                 unsigned src_width,
                 unsigned dst_width,
+                boolean aligned,
                 LLVMValueRef base_ptr,
                 LLVMValueRef offsets,
                 boolean vector_justify);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
index 394521d382a..d7fde810a76 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -581,6 +581,7 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
                               bld->texel_type.length,
                               bld->format_desc->block.bits,
                               bld->texel_type.width,
+                              TRUE,
                               data_ptr, offset, TRUE);
 
       rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
@@ -589,6 +590,7 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
       rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
                                       bld->format_desc,
                                       u8n.type,
+                                      TRUE,
                                       data_ptr, offset,
                                       x_subcoord,
                                       y_subcoord);
@@ -919,6 +921,7 @@ lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
                                        bld->texel_type.length,
                                        bld->format_desc->block.bits,
                                        bld->texel_type.width,
+                                       TRUE,
                                        data_ptr, offset[k][j][i], TRUE);
 
                rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
@@ -927,6 +930,7 @@ lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
                rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
                                                bld->format_desc,
                                                u8n.type,
+                                               TRUE,
                                                data_ptr, offset[k][j][i],
                                                x_subcoord[i],
                                                y_subcoord[j]);
diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c
index 48bf06e3c45..d9abd1ae37c 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_format.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_format.c
@@ -112,7 +112,7 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
    block = LLVMAppendBasicBlockInContext(context, func, "entry");
    LLVMPositionBuilderAtEnd(builder, block);
 
-   rgba = lp_build_fetch_rgba_aos(gallivm, desc, type,
+   rgba = lp_build_fetch_rgba_aos(gallivm, desc, type, TRUE,
                                   packed_ptr, offset, i, j);
 
    LLVMBuildStore(builder, rgba, rgba_ptr);
@@ -252,6 +252,7 @@ test_format_unorm8(unsigned verbose, FILE *fp,
          }
 
          /* To ensure it's 16-byte aligned */
+         /* Could skip this and use unaligned lp_build_fetch_rgba_aos */
          memcpy(packed, test->packed, sizeof packed);
 
          for (i = 0; i < desc->block.height; ++i) {