intel/eu: Use non-coherent mode (BTI=253) for stateless A64 messages
We don't care about full IA coherency since we always have the opportunity in GL or Vulkan to flush the data cache. Using IA-coherent mode is likely just making A64 access slower than it needs to be.

Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4819>
This commit is contained in:
parent
0edc29020b
commit
4985e380dd
|
@ -744,7 +744,8 @@ brw_dp_a64_untyped_surface_rw_desc(const struct gen_device_info *devinfo,
|
|||
SET_BITS(brw_mdc_cmask(num_channels), 3, 0) |
|
||||
SET_BITS(simd_mode, 5, 4);
|
||||
|
||||
return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control);
|
||||
return brw_dp_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT,
|
||||
msg_type, msg_control);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -782,7 +783,8 @@ brw_dp_a64_byte_scattered_rw_desc(const struct gen_device_info *devinfo,
|
|||
SET_BITS(brw_mdc_a64_ds(bit_size / 8), 3, 2) |
|
||||
SET_BITS(exec_size == 16, 4, 4);
|
||||
|
||||
return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control);
|
||||
return brw_dp_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT,
|
||||
msg_type, msg_control);
|
||||
}
|
||||
|
||||
static inline uint32_t
|
||||
|
@ -803,7 +805,8 @@ brw_dp_a64_untyped_atomic_desc(const struct gen_device_info *devinfo,
|
|||
SET_BITS(bit_size == 64, 4, 4) |
|
||||
SET_BITS(response_expected, 5, 5);
|
||||
|
||||
return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control);
|
||||
return brw_dp_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT,
|
||||
msg_type, msg_control);
|
||||
}
|
||||
|
||||
static inline uint32_t
|
||||
|
@ -822,7 +825,8 @@ brw_dp_a64_untyped_atomic_float_desc(const struct gen_device_info *devinfo,
|
|||
SET_BITS(atomic_op, 1, 0) |
|
||||
SET_BITS(response_expected, 5, 5);
|
||||
|
||||
return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control);
|
||||
return brw_dp_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT,
|
||||
msg_type, msg_control);
|
||||
}
|
||||
|
||||
static inline uint32_t
|
||||
|
|
|
@ -1419,12 +1419,37 @@ enum brw_message_target {
|
|||
/* Dataport special binding table indices: */
|
||||
#define BRW_BTI_STATELESS 255
|
||||
#define GEN7_BTI_SLM 254
|
||||
/* Note that on Gen8+ BTI 255 was redefined to be IA-coherent according to the
|
||||
* hardware spec, however because the DRM sets bit 4 of HDC_CHICKEN0 on BDW,
|
||||
* CHV and at least some pre-production steppings of SKL due to
|
||||
* WaForceEnableNonCoherent, HDC memory access may have been overridden by the
|
||||
* kernel to be non-coherent (matching the behavior of the same BTI on
|
||||
* pre-Gen8 hardware) and BTI 255 may actually be an alias for BTI 253.
|
||||
|
||||
#define HSW_BTI_STATELESS_LOCALLY_COHERENT 255
|
||||
#define HSW_BTI_STATELESS_NON_COHERENT 253
|
||||
#define HSW_BTI_STATELESS_GLOBALLY_COHERENT 252
|
||||
#define HSW_BTI_STATELESS_LLC_COHERENT 251
|
||||
#define HSW_BTI_STATELESS_L3_UNCACHED 250
|
||||
|
||||
/* The hardware docs are a bit contradictory here. On Haswell, where they
|
||||
* first added cacheability control, there were 5 different cache modes (see
|
||||
* HSW_BTI_STATELESS_* above). On Broadwell, they reduced to two:
|
||||
*
|
||||
* - IA-Coherent (BTI=255): Coherent within Gen and coherent within the
|
||||
* entire IA cache memory hierarchy.
|
||||
*
|
||||
* - Non-Coherent (BTI=253): Coherent within Gen, same cache type.
|
||||
*
|
||||
* Information about stateless cache coherency can be found in the "A32
|
||||
* Stateless" section of the "3D Media GPGPU" volume of the PRM for each
|
||||
* hardware generation.
|
||||
*
|
||||
* Unfortunately, the docs for MDC_STATELESS appear to have been copied and
|
||||
* pasted from Haswell and give the Haswell definitions for the BTI values of
|
||||
* 255 and 253 including a warning about accessing 253 surfaces from multiple
|
||||
* threads. This seems to be a copy+paste error and the definitions from the
|
||||
* "A32 Stateless" section should be trusted instead.
|
||||
*
|
||||
* Note that because the DRM sets bit 4 of HDC_CHICKEN0 on BDW, CHV and at
|
||||
* least some pre-production steppings of SKL due to WaForceEnableNonCoherent,
|
||||
* HDC memory access may have been overridden by the kernel to be non-coherent
|
||||
* (matching the behavior of the same BTI on pre-Gen8 hardware) and BTI 255
|
||||
* may actually be an alias for BTI 253.
|
||||
*/
|
||||
#define GEN8_BTI_STATELESS_IA_COHERENT 255
|
||||
#define GEN8_BTI_STATELESS_NON_COHERENT 253
|
||||
|
|
Loading…
Reference in New Issue