intel/eu: Use non-coherent mode (BTI=253) for stateless A64 messages

We don't care about full IA coherency since we always have the
opportunity in GL or Vulkan to flush the data cache.  Using IA-coherent
mode is likely just making A64 access slower than it needs to be.

Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4819>
This commit is contained in:
Jason Ekstrand 2020-04-29 17:14:58 -05:00 committed by Marge Bot
parent 0edc29020b
commit 4985e380dd
2 changed files with 39 additions and 10 deletions

View File

@ -744,7 +744,8 @@ brw_dp_a64_untyped_surface_rw_desc(const struct gen_device_info *devinfo,
SET_BITS(brw_mdc_cmask(num_channels), 3, 0) |
SET_BITS(simd_mode, 5, 4);
return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control);
return brw_dp_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT,
msg_type, msg_control);
}
/**
@ -782,7 +783,8 @@ brw_dp_a64_byte_scattered_rw_desc(const struct gen_device_info *devinfo,
SET_BITS(brw_mdc_a64_ds(bit_size / 8), 3, 2) |
SET_BITS(exec_size == 16, 4, 4);
return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control);
return brw_dp_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT,
msg_type, msg_control);
}
static inline uint32_t
@ -803,7 +805,8 @@ brw_dp_a64_untyped_atomic_desc(const struct gen_device_info *devinfo,
SET_BITS(bit_size == 64, 4, 4) |
SET_BITS(response_expected, 5, 5);
return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control);
return brw_dp_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT,
msg_type, msg_control);
}
static inline uint32_t
@ -822,7 +825,8 @@ brw_dp_a64_untyped_atomic_float_desc(const struct gen_device_info *devinfo,
SET_BITS(atomic_op, 1, 0) |
SET_BITS(response_expected, 5, 5);
return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control);
return brw_dp_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT,
msg_type, msg_control);
}
static inline uint32_t

View File

@ -1419,12 +1419,37 @@ enum brw_message_target {
/* Dataport special binding table indices: */
#define BRW_BTI_STATELESS 255
#define GEN7_BTI_SLM 254
/* Note that on Gen8+ BTI 255 was redefined to be IA-coherent according to the
* hardware spec, however because the DRM sets bit 4 of HDC_CHICKEN0 on BDW,
* CHV and at least some pre-production steppings of SKL due to
* WaForceEnableNonCoherent, HDC memory access may have been overridden by the
* kernel to be non-coherent (matching the behavior of the same BTI on
* pre-Gen8 hardware) and BTI 255 may actually be an alias for BTI 253.
#define HSW_BTI_STATELESS_LOCALLY_COHERENT 255
#define HSW_BTI_STATELESS_NON_COHERENT 253
#define HSW_BTI_STATELESS_GLOBALLY_COHERENT 252
#define HSW_BTI_STATELESS_LLC_COHERENT 251
#define HSW_BTI_STATELESS_L3_UNCACHED 250
/* The hardware docs are a bit contradictory here. On Haswell, where they
* first added cacheability control, there were 5 different cache modes (see
* HSW_BTI_STATELESS_* above). On Broadwell, they reduced to two:
*
* - IA-Coherent (BTI=255): Coherent within Gen and coherent within the
* entire IA cache memory hierarchy.
*
* - Non-Coherent (BTI=253): Coherent within Gen, same cache type.
*
* Information about stateless cache coherency can be found in the "A32
* Stateless" section of the "3D Media GPGPU" volume of the PRM for each
* hardware generation.
*
* Unfortunately, the docs for MDC_STATELESS appear to have been copied and
* pasted from Haswell and give the Haswell definitions for the BTI values of
* 255 and 253 including a warning about accessing 253 surfaces from multiple
* threads. This seems to be a copy+paste error and the definitions from the
* "A32 Stateless" section should be trusted instead.
*
* Note that because the DRM sets bit 4 of HDC_CHICKEN0 on BDW, CHV and at
* least some pre-production steppings of SKL due to WaForceEnableNonCoherent,
* HDC memory access may have been overridden by the kernel to be non-coherent
* (matching the behavior of the same BTI on pre-Gen8 hardware) and BTI 255
* may actually be an alias for BTI 253.
*/
#define GEN8_BTI_STATELESS_IA_COHERENT 255
#define GEN8_BTI_STATELESS_NON_COHERENT 253