radeonsi: implement TC-compatible HTILE

so that decompress blits aren't needed and depth texturing needs less
memory bandwidth.

Z16 and Z24 are promoted to Z32_FLOAT by the driver, because TC-compatible
HTILE only supports Z32_FLOAT. This doubles memory footprint for Z16.
The format promotion is not visible to state trackers.

This is part of TC-compatible renderbuffer compression, which has 3 parts:
DCC, HTILE, FMASK. Only TC-compatible FMASK compression is missing now.

I don't see a measurable increase in performance though.

(I tested Talos Principle and DiRT: Showdown, the latter is improved by
 0.5%, which is almost noise, and it originally used layered Z16,
 so at least we know that Z16 promoted to Z32F isn't slower now)

Tested-by: Edmondo Tommasina <edmondo.tommasina@gmail.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
This commit is contained in:
Marek Olšák 2016-10-11 23:19:46 +02:00
parent a077185ea9
commit d4d9ec55c5
9 changed files with 185 additions and 24 deletions

View File

@ -245,6 +245,7 @@ struct r600_htile_info {
unsigned height;
unsigned xalign;
unsigned yalign;
unsigned alignment;
};
struct r600_texture {
@ -252,6 +253,7 @@ struct r600_texture {
uint64_t size;
unsigned num_level0_transfers;
enum pipe_format db_render_format;
bool is_depth;
bool db_compatible;
bool can_sample_z;
@ -273,6 +275,7 @@ struct r600_texture {
/* Depth buffer compression and fast clear. */
struct r600_htile_info htile;
struct r600_resource *htile_buffer;
bool tc_compatible_htile;
bool depth_cleared; /* if it was cleared at least once */
float depth_clear_value;
bool stencil_cleared; /* if it was cleared at least once */

View File

@ -192,7 +192,8 @@ static int r600_init_surface(struct r600_common_screen *rscreen,
struct radeon_surf *surface,
const struct pipe_resource *ptex,
unsigned array_mode,
bool is_flushed_depth)
bool is_flushed_depth,
bool tc_compatible_htile)
{
const struct util_format_description *desc =
util_format_description(ptex->format);
@ -256,11 +257,22 @@ static int r600_init_surface(struct r600_common_screen *rscreen,
if (!is_flushed_depth && is_depth) {
surface->flags |= RADEON_SURF_ZBUFFER;
if (tc_compatible_htile &&
array_mode == RADEON_SURF_MODE_2D) {
/* TC-compatible HTILE only supports Z32_FLOAT.
* Promote Z16 to Z32. DB->CB copies will convert
* the format for transfers.
*/
surface->bpe = 4;
surface->flags |= RADEON_SURF_TC_COMPATIBLE_HTILE;
}
if (is_stencil) {
surface->flags |= RADEON_SURF_SBUFFER |
RADEON_SURF_HAS_SBUFFER_MIPTREE;
}
}
if (rscreen->chip_class >= SI) {
surface->flags |= RADEON_SURF_HAS_TILE_MODE_INDEX;
}
@ -904,6 +916,7 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
rtex->htile.height = height;
rtex->htile.xalign = cl_width * 8;
rtex->htile.yalign = cl_height * 8;
rtex->htile.alignment = base_align;
return (util_max_layer(&rtex->resource.b.b, 0) + 1) *
align(slice_bytes, base_align);
@ -912,21 +925,34 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
static void r600_texture_allocate_htile(struct r600_common_screen *rscreen,
struct r600_texture *rtex)
{
unsigned htile_size = r600_texture_get_htile_size(rscreen, rtex);
uint64_t htile_size, alignment;
uint32_t clear_value;
if (rtex->tc_compatible_htile) {
htile_size = rtex->surface.htile_size;
alignment = rtex->surface.htile_alignment;
clear_value = 0x0000030F;
} else {
htile_size = r600_texture_get_htile_size(rscreen, rtex);
alignment = rtex->htile.alignment;
clear_value = 0;
}
if (!htile_size)
return;
rtex->htile_buffer = (struct r600_resource*)
pipe_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM,
PIPE_USAGE_DEFAULT, htile_size);
r600_aligned_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM,
PIPE_USAGE_DEFAULT,
htile_size, alignment);
if (rtex->htile_buffer == NULL) {
/* this is not a fatal error as we can still keep rendering
* without htile buffer */
R600_ERR("Failed to create buffer object for htile buffer.\n");
} else {
r600_screen_clear_buffer(rscreen, &rtex->htile_buffer->b.b, 0,
htile_size, 0, R600_COHERENCY_NONE);
r600_screen_clear_buffer(rscreen, &rtex->htile_buffer->b.b,
0, htile_size, clear_value,
R600_COHERENCY_NONE);
}
}
@ -967,10 +993,11 @@ void r600_print_texture_info(struct r600_texture *rtex, FILE *f)
if (rtex->htile_buffer)
fprintf(f, " HTile: size=%u, alignment=%u, pitch=%u, height=%u, "
"xalign=%u, yalign=%u\n",
"xalign=%u, yalign=%u, TC_compatible = %u\n",
rtex->htile_buffer->b.b.width0,
rtex->htile_buffer->buf->alignment, rtex->htile.pitch,
rtex->htile.height, rtex->htile.xalign, rtex->htile.yalign);
rtex->htile.height, rtex->htile.xalign, rtex->htile.yalign,
rtex->tc_compatible_htile);
if (rtex->dcc_offset) {
fprintf(f, " DCC: offset=%"PRIu64", size=%"PRIu64", alignment=%"PRIu64"\n",
@ -1054,6 +1081,16 @@ r600_texture_create_object(struct pipe_screen *screen,
return NULL;
}
rtex->tc_compatible_htile = rtex->surface.htile_size != 0;
assert(!!(rtex->surface.flags & RADEON_SURF_TC_COMPATIBLE_HTILE) ==
rtex->tc_compatible_htile);
/* TC-compatible HTILE only supports Z32_FLOAT. */
if (rtex->tc_compatible_htile)
rtex->db_render_format = PIPE_FORMAT_Z32_FLOAT;
else
rtex->db_render_format = base->format;
/* Tiled depth textures utilize the non-displayable tile order.
* This must be done after r600_setup_surface.
* Applies to R600-Cayman. */
@ -1241,11 +1278,20 @@ struct pipe_resource *r600_texture_create(struct pipe_screen *screen,
{
struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
struct radeon_surf surface = {0};
bool is_flushed_depth = templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH;
bool tc_compatible_htile =
rscreen->chip_class >= VI &&
(templ->flags & PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY) &&
!(rscreen->debug_flags & DBG_NO_HYPERZ) &&
!is_flushed_depth &&
templ->nr_samples <= 1 && /* TC-compat HTILE is less efficient with MSAA */
util_format_is_depth_or_stencil(templ->format);
int r;
r = r600_init_surface(rscreen, &surface, templ,
r600_choose_tiling(rscreen, templ),
templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH);
is_flushed_depth, tc_compatible_htile);
if (r) {
return NULL;
}
@ -1296,7 +1342,8 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen
else
array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
r = r600_init_surface(rscreen, &surface, templ, array_mode, false);
r = r600_init_surface(rscreen, &surface, templ, array_mode,
false, false);
if (r) {
return NULL;
}

View File

@ -278,6 +278,7 @@ enum radeon_feature_id {
#define RADEON_SURF_HAS_TILE_MODE_INDEX (1 << 20)
#define RADEON_SURF_FMASK (1 << 21)
#define RADEON_SURF_DISABLE_DCC (1 << 22)
#define RADEON_SURF_TC_COMPATIBLE_HTILE (1 << 23)
#define RADEON_SURF_GET(v, field) (((v) >> RADEON_SURF_ ## field ## _SHIFT) & RADEON_SURF_ ## field ## _MASK)
#define RADEON_SURF_SET(v, field) (((v) & RADEON_SURF_ ## field ## _MASK) << RADEON_SURF_ ## field ## _SHIFT)
@ -344,6 +345,9 @@ struct radeon_surf {
uint64_t dcc_size;
uint64_t dcc_alignment;
/* TC-compatible HTILE only. */
uint64_t htile_size;
uint64_t htile_alignment;
};
struct radeon_bo_list_item {

View File

@ -332,6 +332,8 @@ si_flush_depth_texture(struct si_context *sctx,
}
}
assert(!tex->tc_compatible_htile || levels_z == 0);
/* We may have to allocate the flushed texture here when called from
* si_decompress_subresource.
*/
@ -699,7 +701,10 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
zsbuf->u.tex.level == 0 &&
zsbuf->u.tex.first_layer == 0 &&
zsbuf->u.tex.last_layer == util_max_layer(&zstex->resource.b.b, 0)) {
if (buffers & PIPE_CLEAR_DEPTH) {
/* TC-compatible HTILE only supports depth clears to 0 or 1. */
if (buffers & PIPE_CLEAR_DEPTH &&
(!zstex->tc_compatible_htile ||
depth == 0 || depth == 1)) {
/* Need to disable EXPCLEAR temporarily if clearing
* to a new value. */
if (!zstex->depth_cleared || zstex->depth_clear_value != depth) {
@ -713,7 +718,9 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
si_mark_atom_dirty(sctx, &sctx->db_render_state);
}
if (buffers & PIPE_CLEAR_STENCIL) {
/* TC-compatible HTILE only supports stencil clears to 0. */
if (buffers & PIPE_CLEAR_STENCIL &&
(!zstex->tc_compatible_htile || stencil == 0)) {
stencil &= 0xff;
/* Need to disable EXPCLEAR temporarily if clearing

View File

@ -399,6 +399,9 @@ void si_set_mutable_tex_desc_fields(struct r600_texture *tex,
state[7] = ((!tex->dcc_separate_buffer ? tex->resource.gpu_address : 0) +
tex->dcc_offset +
base_level_info->dcc_offset) >> 8;
} else if (tex->tc_compatible_htile) {
state[6] |= S_008F28_COMPRESSION_EN(1);
state[7] = tex->htile_buffer->gpu_address >> 8;
}
}
@ -508,8 +511,10 @@ static void si_set_sampler_views(struct pipe_context *ctx,
if (views[i]->texture && views[i]->texture->target != PIPE_BUFFER) {
struct r600_texture *rtex =
(struct r600_texture*)views[i]->texture;
struct si_sampler_view *rview = (struct si_sampler_view *)views[i];
if (rtex->db_compatible) {
if (rtex->db_compatible &&
(!rtex->tc_compatible_htile || rview->is_stencil_sampler)) {
samplers->depth_texture_mask |= 1u << slot;
} else {
samplers->depth_texture_mask &= ~(1u << slot);

View File

@ -4607,12 +4607,26 @@ static void tex_fetch_args(
/* Pack depth comparison value */
if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
LLVMValueRef z;
if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
} else {
assert(ref_pos >= 0);
address[count++] = coords[ref_pos];
z = coords[ref_pos];
}
/* TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
* so the depth comparison value isn't clamped for Z16 and
* Z24 anymore. Do it manually here.
*
* It's unnecessary if the original texture format was
* Z32_FLOAT, but we don't know that here.
*/
if (ctx->screen->b.chip_class == VI)
z = radeon_llvm_saturate(bld_base, z);
address[count++] = z;
}
/* Pack user derivatives */

View File

@ -686,6 +686,9 @@ static void si_update_poly_offset_state(struct si_context *sctx)
if (!rs || !rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf)
return;
/* Use the user format, not db_render_format, so that the polygon
* offset behaves as expected by applications.
*/
switch (sctx->framebuffer.state.zsbuf->texture->format) {
case PIPE_FORMAT_Z16_UNORM:
si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[0]);
@ -2140,7 +2143,7 @@ static void si_init_depth_surface(struct si_context *sctx,
uint64_t z_offs, s_offs;
uint32_t db_htile_data_base, db_htile_surface;
format = si_translate_dbformat(rtex->resource.b.b.format);
format = si_translate_dbformat(rtex->db_render_format);
if (format == V_028040_Z_INVALID) {
R600_ERR("Invalid DB format: %d, disabling DB.\n", rtex->resource.b.b.format);
@ -2151,7 +2154,7 @@ static void si_init_depth_surface(struct si_context *sctx,
z_offs += rtex->surface.level[level].offset;
s_offs += rtex->surface.stencil_level[level].offset;
db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(1);
db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!rtex->tc_compatible_htile);
z_info = S_028040_FORMAT(format);
if (rtex->resource.b.b.nr_samples > 1) {
@ -2208,13 +2211,37 @@ static void si_init_depth_surface(struct si_context *sctx,
*/
if (rtex->resource.b.b.nr_samples <= 1)
s_info |= S_028044_ALLOW_EXPCLEAR(1);
} else
/* Use all of the htile_buffer for depth if there's no stencil. */
} else if (!rtex->tc_compatible_htile) {
/* Use all of the htile_buffer for depth if there's no stencil.
* This must not be set when TC-compatible HTILE is enabled
* due to a hw bug.
*/
s_info |= S_028044_TILE_STENCIL_DISABLE(1);
}
uint64_t va = rtex->htile_buffer->gpu_address;
db_htile_data_base = va >> 8;
db_htile_surface = S_028ABC_FULL_CACHE(1);
if (rtex->tc_compatible_htile) {
db_htile_surface |= S_028ABC_TC_COMPATIBLE(1);
switch (rtex->resource.b.b.nr_samples) {
case 0:
case 1:
z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(5);
break;
case 2:
case 4:
z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(3);
break;
case 8:
z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(2);
break;
default:
assert(0);
}
}
} else {
db_htile_data_base = 0;
db_htile_surface = 0;
@ -2356,6 +2383,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
if (state->zsbuf) {
surf = (struct r600_surface*)state->zsbuf;
rtex = (struct r600_texture*)surf->base.texture;
if (!surf->depth_initialized) {
si_init_depth_surface(sctx, surf);
@ -3021,6 +3049,9 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
surflevel = tmp->surface.level;
if (tmp->db_compatible) {
if (!view->is_stencil_sampler)
pipe_format = tmp->db_render_format;
switch (pipe_format) {
case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
pipe_format = PIPE_FORMAT_Z32_FLOAT;

View File

@ -1118,7 +1118,8 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
struct r600_texture *rtex = (struct r600_texture *)surf->texture;
rtex->dirty_level_mask |= 1 << surf->u.tex.level;
if (!rtex->tc_compatible_htile)
rtex->dirty_level_mask |= 1 << surf->u.tex.level;
if (rtex->surface.flags & RADEON_SURF_SBUFFER)
rtex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;

View File

@ -137,6 +137,7 @@ ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws)
createFlags.value = 0;
createFlags.useTileIndex = 1;
createFlags.degradeBaseLevel = 1;
createFlags.useHtileSliceAlign = 1;
addrCreateInput.chipEngine = CIASICIDGFXENGINE_SOUTHERNISLAND;
addrCreateInput.chipFamily = ws->family;
@ -160,7 +161,9 @@ static int compute_level(struct amdgpu_winsys *ws,
ADDR_COMPUTE_SURFACE_INFO_INPUT *AddrSurfInfoIn,
ADDR_COMPUTE_SURFACE_INFO_OUTPUT *AddrSurfInfoOut,
ADDR_COMPUTE_DCCINFO_INPUT *AddrDccIn,
ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut)
ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut,
ADDR_COMPUTE_HTILE_INFO_INPUT *AddrHtileIn,
ADDR_COMPUTE_HTILE_INFO_OUTPUT *AddrHtileOut)
{
struct radeon_surf_level *surf_level;
ADDR_E_RETURNCODE ret;
@ -257,6 +260,32 @@ static int compute_level(struct amdgpu_winsys *ws,
}
}
/* TC-compatible HTILE. */
if (!is_stencil &&
AddrSurfInfoIn->flags.depth &&
AddrSurfInfoIn->flags.tcCompatible &&
surf_level->mode == RADEON_SURF_MODE_2D &&
level == 0) {
AddrHtileIn->flags.tcCompatible = 1;
AddrHtileIn->pitch = AddrSurfInfoOut->pitch;
AddrHtileIn->height = AddrSurfInfoOut->height;
AddrHtileIn->numSlices = AddrSurfInfoOut->depth;
AddrHtileIn->blockWidth = ADDR_HTILE_BLOCKSIZE_8;
AddrHtileIn->blockHeight = ADDR_HTILE_BLOCKSIZE_8;
AddrHtileIn->pTileInfo = AddrSurfInfoOut->pTileInfo;
AddrHtileIn->tileIndex = AddrSurfInfoOut->tileIndex;
AddrHtileIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex;
ret = AddrComputeHtileInfo(ws->addrlib,
AddrHtileIn,
AddrHtileOut);
if (ret == ADDR_OK) {
surf->htile_size = AddrHtileOut->htileBytes;
surf->htile_alignment = AddrHtileOut->baseAlign;
}
}
return 0;
}
@ -284,6 +313,8 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
ADDR_COMPUTE_SURFACE_INFO_OUTPUT AddrSurfInfoOut = {0};
ADDR_COMPUTE_DCCINFO_INPUT AddrDccIn = {0};
ADDR_COMPUTE_DCCINFO_OUTPUT AddrDccOut = {0};
ADDR_COMPUTE_HTILE_INFO_INPUT AddrHtileIn = {0};
ADDR_COMPUTE_HTILE_INFO_OUTPUT AddrHtileOut = {0};
ADDR_TILEINFO AddrTileInfoIn = {0};
ADDR_TILEINFO AddrTileInfoOut = {0};
int r;
@ -296,6 +327,8 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
AddrSurfInfoOut.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_OUTPUT);
AddrDccIn.size = sizeof(ADDR_COMPUTE_DCCINFO_INPUT);
AddrDccOut.size = sizeof(ADDR_COMPUTE_DCCINFO_OUTPUT);
AddrHtileIn.size = sizeof(ADDR_COMPUTE_HTILE_INFO_INPUT);
AddrHtileOut.size = sizeof(ADDR_COMPUTE_HTILE_INFO_OUTPUT);
AddrSurfInfoOut.pTileInfo = &AddrTileInfoOut;
type = RADEON_SURF_GET(surf->flags, TYPE);
@ -361,7 +394,12 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
AddrSurfInfoIn.flags.cube = type == RADEON_SURF_TYPE_CUBEMAP;
AddrSurfInfoIn.flags.display = (surf->flags & RADEON_SURF_SCANOUT) != 0;
AddrSurfInfoIn.flags.pow2Pad = surf->last_level > 0;
AddrSurfInfoIn.flags.degrade4Space = 1;
AddrSurfInfoIn.flags.tcCompatible = (surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE) != 0;
/* Only degrade the tile mode for space if TC-compatible HTILE hasn't been
* requested, because TC-compatible HTILE requires 2D tiling.
*/
AddrSurfInfoIn.flags.degrade4Space = !AddrSurfInfoIn.flags.tcCompatible;
/* DCC notes:
* - If we add MSAA support, keep in mind that CB can't decompress 8bpp
@ -443,11 +481,14 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
surf->bo_size = 0;
surf->dcc_size = 0;
surf->dcc_alignment = 1;
surf->htile_size = 0;
surf->htile_alignment = 1;
/* Calculate texture layout information. */
for (level = 0; level <= surf->last_level; level++) {
r = compute_level(ws, surf, false, level, type, compressed,
&AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut);
&AddrSurfInfoIn, &AddrSurfInfoOut,
&AddrDccIn, &AddrDccOut, &AddrHtileIn, &AddrHtileOut);
if (r)
return r;
@ -475,12 +516,14 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
AddrSurfInfoIn.bpp = 8;
AddrSurfInfoIn.flags.depth = 0;
AddrSurfInfoIn.flags.stencil = 1;
AddrSurfInfoIn.flags.tcCompatible = 0;
/* This will be ignored if AddrSurfInfoIn.pTileInfo is NULL. */
AddrTileInfoIn.tileSplitBytes = surf->stencil_tile_split;
for (level = 0; level <= surf->last_level; level++) {
r = compute_level(ws, surf, true, level, type, compressed,
&AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut);
&AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut,
NULL, NULL);
if (r)
return r;
@ -508,6 +551,12 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
ws->info.num_tile_pipes);
}
/* Make sure HTILE covers the whole miptree, because the shader reads
* TC-compatible HTILE even for levels where it's disabled by DB.
*/
if (surf->htile_size && surf->last_level)
surf->htile_size *= 2;
return 0;
}