nv50: dynamically allocate space for shader local storage
Fixes 21 piglit tests:
spec/glsl-1.10/execution/variable-indexing/
  fs-temp-array-mat4-index-col-row-wr
  vs-temp-array-mat4-index-col-row-wr
  vs-temp-array-mat4-index-row-wr
spec/glsl-1.20/execution/variable-indexing/
  fs-temp-array-mat3-index-col-row-rd
  fs-temp-array-mat3-index-row-rd
  fs-temp-array-mat4-col-row-wr
  fs-temp-array-mat4-index-col-row-rd
  fs-temp-array-mat4-index-col-row-wr
  fs-temp-array-mat4-index-row-rd
  fs-temp-array-mat4-index-row-wr
  vs-temp-array-mat3-index-col-row-rd
  vs-temp-array-mat3-index-col-row-wr
  vs-temp-array-mat3-index-row-rd
  vs-temp-array-mat3-index-row-wr
  vs-temp-array-mat4-col-row-wr
  vs-temp-array-mat4-index-col-row-rd
  vs-temp-array-mat4-index-col-row-wr
  vs-temp-array-mat4-index-col-wr
  vs-temp-array-mat4-index-row-rd
  vs-temp-array-mat4-index-row-wr
  vs-temp-array-mat4-index-wr

... and prevents a lot of GPU lockups
This commit is contained in:
parent
0fceaee4fd
commit
1906d2b46b
|
@ -97,6 +97,7 @@ struct nv50_context {
|
|||
boolean flushed;
|
||||
boolean rasterizer_discard;
|
||||
uint8_t tls_required;
|
||||
boolean new_tls_space;
|
||||
uint8_t num_vtxbufs;
|
||||
uint8_t num_vtxelts;
|
||||
uint8_t num_textures[3];
|
||||
|
|
|
@ -350,6 +350,7 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
|
|||
prog->code_size = info->bin.codeSize;
|
||||
prog->fixups = info->bin.relocData;
|
||||
prog->max_gpr = MAX2(4, (info->bin.maxGPR >> 1) + 1);
|
||||
prog->tls_space = info->bin.tlsSpace;
|
||||
|
||||
if (prog->type == PIPE_SHADER_FRAGMENT) {
|
||||
if (info->prop.fp.writesDepth) {
|
||||
|
@ -399,6 +400,12 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
|
|||
}
|
||||
prog->code_base = prog->mem->start;
|
||||
|
||||
ret = nv50_tls_realloc(nv50->screen, prog->tls_space);
|
||||
if (ret < 0)
|
||||
return FALSE;
|
||||
if (ret > 0)
|
||||
nv50->state.new_tls_space = TRUE;
|
||||
|
||||
if (prog->fixups)
|
||||
nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0);
|
||||
|
||||
|
|
|
@ -28,8 +28,6 @@ struct nv50_context;
|
|||
#include "pipe/p_state.h"
|
||||
#include "pipe/p_shader_tokens.h"
|
||||
|
||||
#define NV50_CAP_MAX_PROGRAM_TEMPS 64
|
||||
|
||||
struct nv50_varying {
|
||||
uint8_t id; /* tgsi index */
|
||||
uint8_t hw; /* hw index, nv50 wants flat FP inputs last */
|
||||
|
@ -56,7 +54,6 @@ struct nv50_program {
|
|||
|
||||
ubyte type;
|
||||
boolean translated;
|
||||
boolean uses_lmem;
|
||||
|
||||
uint32_t *code;
|
||||
unsigned code_size;
|
||||
|
@ -64,6 +61,7 @@ struct nv50_program {
|
|||
uint32_t *immd;
|
||||
unsigned immd_size;
|
||||
unsigned parm_size; /* size limit of uniform buffer */
|
||||
uint32_t tls_space; /* required local memory per thread */
|
||||
|
||||
ubyte max_gpr; /* REG_ALLOC_TEMP */
|
||||
ubyte max_out; /* REG_ALLOC_RESULT or FP_RESULT_COUNT */
|
||||
|
|
|
@ -28,11 +28,21 @@
|
|||
#include "nv50_screen.h"
|
||||
|
||||
#include "nouveau/nv_object.xml.h"
|
||||
#include <errno.h>
|
||||
|
||||
#ifndef NOUVEAU_GETPARAM_GRAPH_UNITS
|
||||
# define NOUVEAU_GETPARAM_GRAPH_UNITS 13
|
||||
#endif
|
||||
|
||||
/* affected by LOCAL_WARPS_LOG_ALLOC / LOCAL_WARPS_NO_CLAMP */
|
||||
#define LOCAL_WARPS_ALLOC 32
|
||||
/* affected by STACK_WARPS_LOG_ALLOC / STACK_WARPS_NO_CLAMP */
|
||||
#define STACK_WARPS_ALLOC 32
|
||||
|
||||
#define THREADS_IN_WARP 32
|
||||
|
||||
#define ONE_TEMP_SIZE (4/*vector*/ * sizeof(float))
|
||||
|
||||
static boolean
|
||||
nv50_screen_is_format_supported(struct pipe_screen *pscreen,
|
||||
enum pipe_format format,
|
||||
|
@ -209,7 +219,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
|
|||
case PIPE_SHADER_CAP_MAX_PREDS:
|
||||
return 0;
|
||||
case PIPE_SHADER_CAP_MAX_TEMPS:
|
||||
return NV50_CAP_MAX_PROGRAM_TEMPS;
|
||||
return nv50_screen(pscreen)->max_tls_space / ONE_TEMP_SIZE;
|
||||
case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
|
||||
return 1;
|
||||
case PIPE_SHADER_CAP_SUBROUTINES:
|
||||
|
@ -311,7 +321,7 @@ nv50_screen_fence_update(struct pipe_screen *pscreen)
|
|||
}
|
||||
|
||||
static void
|
||||
nv50_screen_init_hwctx(struct nv50_screen *screen, unsigned tls_space)
|
||||
nv50_screen_init_hwctx(struct nv50_screen *screen)
|
||||
{
|
||||
struct nouveau_pushbuf *push = screen->base.pushbuf;
|
||||
struct nv04_fifo *fifo;
|
||||
|
@ -411,7 +421,7 @@ nv50_screen_init_hwctx(struct nv50_screen *screen, unsigned tls_space)
|
|||
BEGIN_NV04(push, NV50_3D(LOCAL_ADDRESS_HIGH), 3);
|
||||
PUSH_DATAh(push, screen->tls_bo->offset);
|
||||
PUSH_DATA (push, screen->tls_bo->offset);
|
||||
PUSH_DATA (push, util_logbase2(tls_space / 8));
|
||||
PUSH_DATA (push, util_logbase2(screen->cur_tls_space / 8));
|
||||
|
||||
BEGIN_NV04(push, NV50_3D(STACK_ADDRESS_HIGH), 3);
|
||||
PUSH_DATAh(push, screen->stack_bo->offset);
|
||||
|
@ -508,6 +518,60 @@ nv50_screen_init_hwctx(struct nv50_screen *screen, unsigned tls_space)
|
|||
PUSH_KICK (push);
|
||||
}
|
||||
|
||||
static int nv50_tls_alloc(struct nv50_screen *screen, unsigned tls_space,
|
||||
uint64_t *tls_size)
|
||||
{
|
||||
struct nouveau_device *dev = screen->base.device;
|
||||
int ret;
|
||||
|
||||
screen->cur_tls_space = util_next_power_of_two(tls_space / ONE_TEMP_SIZE) *
|
||||
ONE_TEMP_SIZE;
|
||||
if (nouveau_mesa_debug)
|
||||
debug_printf("allocating space for %u temps\n",
|
||||
util_next_power_of_two(tls_space / ONE_TEMP_SIZE));
|
||||
*tls_size = screen->cur_tls_space * util_next_power_of_two(screen->TPs) *
|
||||
screen->MPsInTP * LOCAL_WARPS_ALLOC * THREADS_IN_WARP;
|
||||
|
||||
ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16,
|
||||
*tls_size, NULL, &screen->tls_bo);
|
||||
if (ret) {
|
||||
NOUVEAU_ERR("Failed to allocate local bo: %d\n", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int nv50_tls_realloc(struct nv50_screen *screen, unsigned tls_space)
|
||||
{
|
||||
struct nouveau_pushbuf *push = screen->base.pushbuf;
|
||||
int ret;
|
||||
uint64_t tls_size;
|
||||
|
||||
if (tls_space < screen->cur_tls_space)
|
||||
return 0;
|
||||
if (tls_space > screen->max_tls_space) {
|
||||
/* fixable by limiting number of warps (LOCAL_WARPS_LOG_ALLOC /
|
||||
* LOCAL_WARPS_NO_CLAMP) */
|
||||
NOUVEAU_ERR("Unsupported number of temporaries (%u > %u). Fixable if someone cares.\n",
|
||||
(unsigned)(tls_space / ONE_TEMP_SIZE),
|
||||
(unsigned)(screen->max_tls_space / ONE_TEMP_SIZE));
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
nouveau_bo_ref(NULL, &screen->tls_bo);
|
||||
ret = nv50_tls_alloc(screen, tls_space, &tls_size);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
BEGIN_NV04(push, NV50_3D(LOCAL_ADDRESS_HIGH), 3);
|
||||
PUSH_DATAh(push, screen->tls_bo->offset);
|
||||
PUSH_DATA (push, screen->tls_bo->offset);
|
||||
PUSH_DATA (push, util_logbase2(screen->cur_tls_space / 8));
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
struct pipe_screen *
|
||||
nv50_screen_create(struct nouveau_device *dev)
|
||||
{
|
||||
|
@ -516,7 +580,7 @@ nv50_screen_create(struct nouveau_device *dev)
|
|||
struct nouveau_object *chan;
|
||||
uint64_t value;
|
||||
uint32_t tesla_class;
|
||||
unsigned stack_size, max_warps, tls_space;
|
||||
unsigned stack_size;
|
||||
int ret;
|
||||
|
||||
screen = CALLOC_STRUCT(nv50_screen);
|
||||
|
@ -637,10 +701,11 @@ nv50_screen_create(struct nouveau_device *dev)
|
|||
|
||||
nouveau_getparam(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value);
|
||||
|
||||
max_warps = util_bitcount(value & 0xffff);
|
||||
max_warps *= util_bitcount((value >> 24) & 0xf) * 32;
|
||||
screen->TPs = util_bitcount(value & 0xffff);
|
||||
screen->MPsInTP = util_bitcount((value >> 24) & 0xf);
|
||||
|
||||
stack_size = max_warps * 64 * 8;
|
||||
stack_size = util_next_power_of_two(screen->TPs) * screen->MPsInTP *
|
||||
STACK_WARPS_ALLOC * 64 * 8;
|
||||
|
||||
ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, stack_size, NULL,
|
||||
&screen->stack_bo);
|
||||
|
@ -649,20 +714,24 @@ nv50_screen_create(struct nouveau_device *dev)
|
|||
goto fail;
|
||||
}
|
||||
|
||||
tls_space = NV50_CAP_MAX_PROGRAM_TEMPS * 16;
|
||||
uint64_t size_of_one_temp = util_next_power_of_two(screen->TPs) *
|
||||
screen->MPsInTP * LOCAL_WARPS_ALLOC * THREADS_IN_WARP *
|
||||
ONE_TEMP_SIZE;
|
||||
screen->max_tls_space = dev->vram_size / size_of_one_temp * ONE_TEMP_SIZE;
|
||||
screen->max_tls_space /= 2; /* half of vram */
|
||||
|
||||
screen->tls_size = tls_space * max_warps * 32;
|
||||
/* hw can address max 64 KiB */
|
||||
screen->max_tls_space = MIN2(screen->max_tls_space, 64 << 10);
|
||||
|
||||
uint64_t tls_size;
|
||||
unsigned tls_space = 4/*temps*/ * ONE_TEMP_SIZE;
|
||||
ret = nv50_tls_alloc(screen, tls_space, &tls_size);
|
||||
if (ret)
|
||||
goto fail;
|
||||
|
||||
if (nouveau_mesa_debug)
|
||||
debug_printf("max_warps = %i, tls_size = %"PRIu64" KiB\n",
|
||||
max_warps, screen->tls_size >> 10);
|
||||
|
||||
ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, screen->tls_size, NULL,
|
||||
&screen->tls_bo);
|
||||
if (ret) {
|
||||
NOUVEAU_ERR("Failed to allocate local bo: %d\n", ret);
|
||||
goto fail;
|
||||
}
|
||||
debug_printf("TPs = %u, MPsInTP = %u, VRAM = %"PRIu64" MiB, tls_size = %"PRIu64" KiB\n",
|
||||
screen->TPs, screen->MPsInTP, dev->vram_size >> 20, tls_size >> 10);
|
||||
|
||||
ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, 4 << 16, NULL,
|
||||
&screen->uniforms);
|
||||
|
@ -684,7 +753,7 @@ nv50_screen_create(struct nouveau_device *dev)
|
|||
if (!nv50_blitctx_create(screen))
|
||||
goto fail;
|
||||
|
||||
nv50_screen_init_hwctx(screen, tls_space);
|
||||
nv50_screen_init_hwctx(screen);
|
||||
|
||||
nouveau_fence_new(&screen->base, &screen->base.fence.current, FALSE);
|
||||
|
||||
|
|
|
@ -34,7 +34,10 @@ struct nv50_screen {
|
|||
struct nouveau_bo *stack_bo;
|
||||
struct nouveau_bo *tls_bo;
|
||||
|
||||
uint64_t tls_size;
|
||||
unsigned TPs;
|
||||
unsigned MPsInTP;
|
||||
unsigned max_tls_space;
|
||||
unsigned cur_tls_space;
|
||||
|
||||
struct nouveau_heap *vp_code_heap;
|
||||
struct nouveau_heap *gp_code_heap;
|
||||
|
@ -143,4 +146,6 @@ nv50_screen_tsc_free(struct nv50_screen *screen, struct nv50_tsc_entry *tsc)
|
|||
}
|
||||
}
|
||||
|
||||
extern int nv50_tls_realloc(struct nv50_screen *screen, unsigned tls_space);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -129,9 +129,12 @@ nv50_program_update_context_state(struct nv50_context *nv50,
|
|||
{
|
||||
const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR;
|
||||
|
||||
if (prog && prog->uses_lmem) {
|
||||
if (!nv50->state.tls_required)
|
||||
if (prog && prog->tls_space) {
|
||||
if (nv50->state.new_tls_space)
|
||||
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TLS);
|
||||
if (!nv50->state.tls_required || nv50->state.new_tls_space)
|
||||
BCTX_REFN_bo(nv50->bufctx_3d, TLS, flags, nv50->screen->tls_bo);
|
||||
nv50->state.new_tls_space = FALSE;
|
||||
nv50->state.tls_required |= 1 << stage;
|
||||
} else {
|
||||
if (nv50->state.tls_required == (1 << stage))
|
||||
|
|
Loading…
Reference in New Issue