mesa/src/gallium/drivers/radeonsi/gfx10_query.c

/*
* Copyright 2018 Advanced Micro Devices, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "si_pipe.h"
#include "si_query.h"
#include "sid.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"
#include <stddef.h>
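
/* Emit callback of the shader-query state atom: whenever the atom is emitted
 * along with a draw, the shaders accumulate results into the slot currently
 * bound as SI_GS_QUERY_BUF, so advance the newest buffer's write head past
 * that slot. */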
static void emit_shader_query(struct si_context *sctx)
{
assert(!list_is_empty(&sctx->shader_query_buffers));
struct gfx10_sh_query_buffer *qbuf =
list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
}
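
/* Drop one reference from every buffer in the chain [first, last]. Buffers
 * whose refcount reaches zero are freed, except for the newest one (it may
 * not be full yet) and the oldest one (kept for recycling). */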
static void gfx10_release_query_buffers(struct si_context *sctx,
struct gfx10_sh_query_buffer *first,
struct gfx10_sh_query_buffer *last)
{
while (first) {
struct gfx10_sh_query_buffer *qbuf = first;
if (first != last)
first = list_entry(qbuf->list.next, struct gfx10_sh_query_buffer, list);
else
first = NULL;
qbuf->refcount--;
if (qbuf->refcount)
continue;
if (qbuf->list.next == &sctx->shader_query_buffers)
continue; /* keep the most recent buffer; it may not be full yet */
if (qbuf->list.prev == &sctx->shader_query_buffers)
continue; /* keep the oldest buffer for recycling */
list_del(&qbuf->list);
si_resource_reference(&qbuf->buf, NULL);
FREE(qbuf);
}
}
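
/* Ensure the newest buffer has room for one more result slot, recycling the
 * oldest idle buffer or allocating a new one if needed, then bind the slot at
 * the current write head as the SI_GS_QUERY_BUF internal shader buffer.
 * Returns false on allocation failure. */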
static bool gfx10_alloc_query_buffer(struct si_context *sctx)
{
if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
return true;
struct gfx10_sh_query_buffer *qbuf = NULL;
if (!list_is_empty(&sctx->shader_query_buffers)) {
qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
goto success;
qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
if (!qbuf->refcount &&
!si_cs_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
sctx->ws->buffer_wait(sctx->ws, qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
/* Can immediately re-use the oldest buffer */
list_del(&qbuf->list);
} else {
qbuf = NULL;
}
}
if (!qbuf) {
qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
if (unlikely(!qbuf))
return false;
struct si_screen *screen = sctx->screen;
unsigned buf_size =
MAX2(sizeof(struct gfx10_sh_query_buffer_mem), screen->info.min_alloc_size);
qbuf->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
if (unlikely(!qbuf->buf)) {
FREE(qbuf);
return false;
}
}
/* The buffer is currently unused by the GPU. Initialize it.
*
* We need to set the high bit of all the primitive counters for
* compatibility with the SET_PREDICATION packet.
*/
uint64_t *results = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL,
PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
assert(results);
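
/* Each slot is 32 qwords: 16 primitive counters followed by a fence dword.
 * A minimal sketch of the layout assumed here, per struct
 * gfx10_sh_query_buffer_mem in si_query.h:
 *
 *    struct {
 *       uint64_t generated_primitives_start_dummy;
 *       uint64_t emitted_primitives_start_dummy;
 *       uint64_t generated_primitives;
 *       uint64_t emitted_primitives;
 *    } stream[4];    // 16 counters; bit 63 pre-set below
 *    uint32_t fence; // cleared below; written by si_cp_release_mem at query end
 *    uint32_t pad[31];
 */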
for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem); i < e;
++i) {
for (unsigned j = 0; j < 16; ++j)
results[32 * i + j] = (uint64_t)1 << 63;
results[32 * i + 16] = 0;
}
list_addtail(&qbuf->list, &sctx->shader_query_buffers);
qbuf->head = 0;
qbuf->refcount = sctx->num_active_shader_queries;
success:;
struct pipe_shader_buffer sbuf;
sbuf.buffer = &qbuf->buf->b.b;
sbuf.buffer_offset = qbuf->head;
sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, &sbuf);
SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED, 1);
si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
return true;
}
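
/* Destroy: release the query's buffer references and free it. */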
static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
{
struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
gfx10_release_query_buffers(sctx, query->first, query->last);
FREE(query);
}
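
/* Begin the query: start a fresh buffer chain at the current write head of
 * the newest buffer, take a reference on it, and bump the active-query count
 * so that freshly allocated buffers inherit that reference. */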
static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
{
struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
gfx10_release_query_buffers(sctx, query->first, query->last);
query->first = query->last = NULL;
if (unlikely(!gfx10_alloc_query_buffer(sctx)))
return false;
query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
query->first_begin = query->first->head;
sctx->num_active_shader_queries++;
query->first->refcount++;
return true;
}
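
/* End the query: record where the chain stops and have the CP write the
 * bottom-of-pipe fence of the last slot that was used. The query buffer is
 * unbound when no shader queries remain active (or when no draw ever
 * consumed the binding; see the comment below). */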
static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
{
struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
if (unlikely(!query->first))
return false; /* earlier out of memory error */
query->last = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
query->last_end = query->last->head;
/* Signal the fence of the previous chunk */
if (query->last_end != 0) {
uint64_t fence_va = query->last->buf->gpu_address;
fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, query->last->buf, fence_va,
0xffffffff, PIPE_QUERY_GPU_FINISHED);
}
sctx->num_active_shader_queries--;
if (sctx->num_active_shader_queries <= 0 || !si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query)) {
si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL);
SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED, 0);
/* If a query_begin is followed by a query_end without a draw
* in-between, we need to clear the atom to ensure that the
* next query_begin will re-initialize the shader buffer. */
si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
}
return true;
}
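
/* Accumulate one slot's counters into the CPU-visible result. Counters are
 * masked to strip bit 63, which buffer initialization pre-sets for
 * SET_PREDICATION compatibility. */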
static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
struct gfx10_sh_query_buffer_mem *qmem,
union pipe_query_result *result)
{
static const uint64_t mask = ((uint64_t)1 << 63) - 1;
switch (query->b.type) {
case PIPE_QUERY_PRIMITIVES_EMITTED:
result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
break;
case PIPE_QUERY_PRIMITIVES_GENERATED:
result->u64 += qmem->stream[query->stream].generated_primitives & mask;
break;
case PIPE_QUERY_SO_STATISTICS:
result->so_statistics.num_primitives_written +=
qmem->stream[query->stream].emitted_primitives & mask;
result->so_statistics.primitives_storage_needed +=
qmem->stream[query->stream].generated_primitives & mask;
break;
case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
result->b |= qmem->stream[query->stream].emitted_primitives !=
qmem->stream[query->stream].generated_primitives;
break;
case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
result->b |= qmem->stream[stream].emitted_primitives !=
qmem->stream[stream].generated_primitives;
}
break;
default:
assert(0);
}
}
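
/* CPU readback: walk the chain backwards from the last buffer to the first,
 * map each buffer, and accumulate every slot in the query's range. */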
static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, bool wait,
union pipe_query_result *result)
{
struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
util_query_clear_result(result, query->b.type);
if (unlikely(!query->first))
return false; /* earlier out of memory error */
assert(query->last);
for (struct gfx10_sh_query_buffer *qbuf = query->last;;
qbuf = list_entry(qbuf->list.prev, struct gfx10_sh_query_buffer, list)) {
unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
void *map;
if (rquery->b.flushed)
map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
else
map = si_buffer_map(sctx, qbuf->buf, usage);
if (!map)
return false;
unsigned results_begin = 0;
unsigned results_end = qbuf->head;
if (qbuf == query->first)
results_begin = query->first_begin;
if (qbuf == query->last)
results_end = query->last_end;
while (results_begin != results_end) {
struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
results_begin += sizeof(*qmem);
gfx10_sh_query_add_result(query, qmem, result);
}
if (qbuf == query->first)
break;
}
return true;
}
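
/* GPU-side readback: reduce the query's slots with the sh_query_result
 * compute shader and write the result into the given resource at the given
 * offset, without a round trip through the CPU. */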
static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery,
enum pipe_query_flags flags,
enum pipe_query_value_type result_type,
int index, struct pipe_resource *resource,
unsigned offset)
{
struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
struct si_qbo_state saved_state = {};
struct pipe_resource *tmp_buffer = NULL;
unsigned tmp_buffer_offset = 0;
if (!sctx->sh_query_result_shader) {
sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
if (!sctx->sh_query_result_shader)
return;
}
if (query->first != query->last) {
u_suballocator_alloc(&sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
if (!tmp_buffer)
return;
}
si_save_qbo_state(sctx, &saved_state);
/* Pre-fill the constants configuring the shader behavior. */
struct {
uint32_t config;
uint32_t offset;
uint32_t chain;
uint32_t result_count;
} consts;
struct pipe_constant_buffer constant_buffer = {};
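
/* The meaning of config/offset is defined by the shader built in
 * gfx10_create_sh_query_result_cs. As used here: config 0 accumulates the
 * 64-bit counter found at byte "offset" inside each slot, 1 only checks
 * result availability, 2 and 3 compute the overflow predicates, and bit 3
 * selects a 64-bit result. */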
if (index >= 0) {
switch (query->b.type) {
case PIPE_QUERY_PRIMITIVES_GENERATED:
consts.offset = 4 * sizeof(uint64_t) * query->stream + 2 * sizeof(uint64_t);
consts.config = 0;
break;
case PIPE_QUERY_PRIMITIVES_EMITTED:
consts.offset = 4 * sizeof(uint64_t) * query->stream + 3 * sizeof(uint64_t);
consts.config = 0;
break;
case PIPE_QUERY_SO_STATISTICS:
consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
consts.config = 0;
break;
case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
consts.offset = 4 * sizeof(uint64_t) * query->stream;
consts.config = 2;
break;
case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
consts.offset = 0;
consts.config = 3;
break;
default:
unreachable("bad query type");
}
} else {
/* Check result availability. */
consts.offset = 0;
consts.config = 1;
}
if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
consts.config |= 8;
constant_buffer.buffer_size = sizeof(consts);
constant_buffer.user_buffer = &consts;
/* Pre-fill the SSBOs and grid. */
struct pipe_shader_buffer ssbo[3];
struct pipe_grid_info grid = {};
ssbo[1].buffer = tmp_buffer;
ssbo[1].buffer_offset = tmp_buffer_offset;
ssbo[1].buffer_size = 16;
ssbo[2] = ssbo[1];
grid.block[0] = 1;
grid.block[1] = 1;
grid.block[2] = 1;
grid.grid[0] = 1;
grid.grid[1] = 1;
grid.grid[2] = 1;
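
/* One compute dispatch per buffer in the chain: ssbo[0] is the slot range to
 * reduce, ssbo[1] carries the partial result between dispatches, and ssbo[2]
 * is redirected to the destination resource for the last buffer. The chain
 * bits tell the shader whether a partial result comes in and/or goes out. */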
struct gfx10_sh_query_buffer *qbuf = query->first;
for (;;) {
unsigned begin = qbuf == query->first ? query->first_begin : 0;
unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
if (!end)
continue;
ssbo[0].buffer = &qbuf->buf->b.b;
ssbo[0].buffer_offset = begin;
ssbo[0].buffer_size = end - begin;
consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
consts.chain = 0;
if (qbuf != query->first)
consts.chain |= 1;
if (qbuf != query->last)
consts.chain |= 2;
if (qbuf == query->last) {
ssbo[2].buffer = resource;
ssbo[2].buffer_offset = offset;
ssbo[2].buffer_size = 8;
}
sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, false, &constant_buffer);
if (flags & PIPE_QUERY_WAIT) {
uint64_t va;
/* Wait for result availability. Wait only for readiness
* of the last entry, since the fence writes should be
* serialized in the CP.
*/
va = qbuf->buf->gpu_address;
va += end - sizeof(struct gfx10_sh_query_buffer_mem);
va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
si_cp_wait_mem(sctx, &sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
}
/* ssbo[2] is either tmp_buffer or resource */
assert(ssbo[2].buffer);
si_launch_grid_internal_ssbos(sctx, &grid, sctx->sh_query_result_shader,
SI_OP_SYNC_PS_BEFORE | SI_OP_SYNC_AFTER, SI_COHERENCY_SHADER,
3, ssbo, (1 << 2) | (ssbo[1].buffer ? 1 << 1 : 0));
if (qbuf == query->last)
break;
qbuf = list_entry(qbuf->list.next, struct gfx10_sh_query_buffer, list);
}
si_restore_qbo_state(sctx, &saved_state);
pipe_resource_reference(&tmp_buffer, NULL);
}

static const struct si_query_ops gfx10_sh_query_ops = {
.destroy = gfx10_sh_query_destroy,
.begin = gfx10_sh_query_begin,
.end = gfx10_sh_query_end,
.get_result = gfx10_sh_query_get_result,
.get_result_resource = gfx10_sh_query_get_result_resource,
};
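
/* Create a shader query; "index" selects the streamout stream to observe. */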
struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
unsigned index)
{
struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
if (unlikely(!query))
return NULL;
query->b.ops = &gfx10_sh_query_ops;
query->b.type = query_type;
query->stream = index;
return (struct pipe_query *)query;
}
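
/* Register the shader-query atom and the (initially empty) buffer list. */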
void gfx10_init_query(struct si_context *sctx)
{
list_inithead(&sctx->shader_query_buffers);
sctx->atoms.s.shader_query.emit = emit_shader_query;
}
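
/* Free all remaining query buffers. The list head may be left zeroed if
 * gfx10_init_query never ran, hence the early-out below. */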
void gfx10_destroy_query(struct si_context *sctx)
{
if (!sctx->shader_query_buffers.next)
return;
while (!list_is_empty(&sctx->shader_query_buffers)) {
struct gfx10_sh_query_buffer *qbuf =
list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
list_del(&qbuf->list);
assert(!qbuf->refcount);
si_resource_reference(&qbuf->buf, NULL);
FREE(qbuf);
}
}