diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 977fd8c35f5..671b44bd50d 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -2111,25 +2111,22 @@ fs_visitor::lower_constant_loads() if (pull_index == -1) continue; - const unsigned index = stage_prog_data->binding_table.pull_constants_start; - fs_reg dst; - - if (type_sz(inst->src[i].type) <= 4) - dst = vgrf(glsl_type::float_type); - else - dst = vgrf(glsl_type::double_type); - assert(inst->src[i].stride == 0); - const fs_builder ubld = ibld.exec_all().group(4, 0); - struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15); + const unsigned index = stage_prog_data->binding_table.pull_constants_start; + const unsigned block_sz = 64; /* Fetch one cacheline at a time. */ + const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0); + const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD); + const unsigned base = pull_index * 4; + ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, - dst, brw_imm_ud(index), offset); + dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1))); /* Rewrite the instruction to use the temporary VGRF. */ inst->src[i].file = VGRF; inst->src[i].nr = dst.nr; - inst->src[i].offset = (pull_index & 3) * 4 + inst->src[i].offset % 4; + inst->src[i].offset = (base & (block_sz - 1)) + + inst->src[i].offset % 4; brw_mark_surface_used(prog_data, index); } diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 7df74232457..9f2729a9b6e 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -4059,21 +4059,23 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr * and we have to split it if necessary. */ const unsigned type_size = type_sz(dest.type); - const fs_builder ubld = bld.exec_all().group(4, 0); - const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_F); + const unsigned block_sz = 64; /* Fetch one cacheline at a time. */ + const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0); + const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD); for (unsigned c = 0; c < instr->num_components;) { const unsigned base = const_offset->u32[0] + c * type_size; - - /* Number of usable components in the next 16B-aligned load */ + /* Number of usable components in the next block-aligned load. */ const unsigned count = MIN2(instr->num_components - c, - (16 - base % 16) / type_size); + (block_sz - base % block_sz) / type_size); ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, - packed_consts, surf_index, brw_imm_ud(base & ~15)); + packed_consts, surf_index, + brw_imm_ud(base & ~(block_sz - 1))); const fs_reg consts = - retype(byte_offset(packed_consts, base & 15), dest.type); + retype(byte_offset(packed_consts, base & (block_sz - 1)), + dest.type); for (unsigned d = 0; d < count; d++) bld.MOV(offset(dest, bld, c + d), component(consts, d));