nv50,nvc0: fix buffer clearing to respect engine alignment requirements

It appears that the nvidia render engine is quite picky when it comes to
linear surfaces. It doesn't like non-256-byte aligned offsets, and
apparently doesn't even do non-256-byte strides.

This makes arb_clear_buffer_object-unaligned pass on both nv50 and nvc0.

As a side-effect this also allows RGB32 clears to work via GPU data
upload instead of synchronizing the buffer to the CPU (nvc0 only).

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu> # tested on GF108, GT215
Tested-by: Nick Sarnie <commendsarnex@gmail.com> # GK208
Cc: mesa-stable@lists.freedesktop.org
This commit is contained in:
Ilia Mirkin 2016-01-30 10:02:43 -05:00
parent f15447e7c9
commit 3ca2001b53
2 changed files with 248 additions and 53 deletions

View File

@@ -594,6 +594,82 @@ nv50_clear(struct pipe_context *pipe, unsigned buffers,
PUSH_DATA (push, nv50->rt_array_mode);
}
/* Clear [offset, offset+size) of a buffer by streaming the replicated clear
 * value through the 2D engine's SIFC (image-from-CPU) upload path.
 *
 * The render engine cannot target linear surfaces at non-256-byte-aligned
 * offsets, so this push-based path handles the unaligned head/tail of a
 * clear, with the destination described as a linear R8 surface.
 *
 * data/data_size: the clear pattern; 1- and 2-byte patterns are replicated
 * up to a full 32-bit word so the upload can run in dword units.
 * Assumes size is a multiple of data_size (asserted by the caller).
 */
static void
nv50_clear_buffer_push(struct pipe_context *pipe,
struct pipe_resource *res,
unsigned offset, unsigned size,
const void *data, int data_size)
{
struct nv50_context *nv50 = nv50_context(pipe);
struct nouveau_pushbuf *push = nv50->base.pushbuf;
struct nv04_resource *buf = nv04_resource(res);
/* Total number of 32-bit words to upload. */
unsigned count = (size + 3) / 4;
/* The surface base must be 256-byte aligned; the low offset bits are
 * expressed as an x coordinate into the R8 surface instead. */
unsigned xcoord = offset & 0xff;
unsigned tmp, i;
/* Widen 1- and 2-byte clear values into a full 32-bit word. */
if (data_size == 1) {
tmp = *(unsigned char *)data;
tmp = (tmp << 24) | (tmp << 16) | (tmp << 8) | tmp;
data = &tmp;
data_size = 4;
} else if (data_size == 2) {
tmp = *(unsigned short *)data;
tmp = (tmp << 16) | tmp;
data = &tmp;
data_size = 4;
}
/* Dwords in one repetition of the clear pattern. */
unsigned data_words = data_size / 4;
nouveau_bufctx_refn(nv50->bufctx, 0, buf->bo, buf->domain | NOUVEAU_BO_WR);
nouveau_pushbuf_bufctx(push, nv50->bufctx);
nouveau_pushbuf_validate(push);
/* Align the surface base down to 256 bytes (see xcoord above). */
offset &= ~0xff;
BEGIN_NV04(push, NV50_2D(DST_FORMAT), 2);
PUSH_DATA (push, NV50_SURFACE_FORMAT_R8_UNORM);
PUSH_DATA (push, 1);
BEGIN_NV04(push, NV50_2D(DST_PITCH), 5);
/* NOTE(review): pitch 262144 / width 65536 appear to be "large enough for
 * any single clear" values for a 1-high linear R8 surface — confirm against
 * the 2D class limits. */
PUSH_DATA (push, 262144);
PUSH_DATA (push, 65536);
PUSH_DATA (push, 1);
PUSH_DATAh(push, buf->address + offset);
PUSH_DATA (push, buf->address + offset);
BEGIN_NV04(push, NV50_2D(SIFC_BITMAP_ENABLE), 2);
PUSH_DATA (push, 0);
PUSH_DATA (push, NV50_SURFACE_FORMAT_R8_UNORM);
/* SIFC transfer of size x 1 texels at 1:1 scale; destination x = xcoord
 * carries the sub-256-byte part of the original offset. */
BEGIN_NV04(push, NV50_2D(SIFC_WIDTH), 10);
PUSH_DATA (push, size);
PUSH_DATA (push, 1);
PUSH_DATA (push, 0);
PUSH_DATA (push, 1);
PUSH_DATA (push, 0);
PUSH_DATA (push, 1);
PUSH_DATA (push, 0);
PUSH_DATA (push, xcoord);
PUSH_DATA (push, 0);
PUSH_DATA (push, 0);
/* Feed the data in chunks of at most one method packet, always a whole
 * number of pattern repetitions per packet. */
while (count) {
unsigned nr_data = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN) / data_words;
unsigned nr = nr_data * data_words;
BEGIN_NI04(push, NV50_2D(SIFC_DATA), nr);
for (i = 0; i < nr_data; i++)
PUSH_DATAp(push, data, data_words);
count -= nr;
}
/* Only suballocated buffers need fencing against reuse of the backing bo. */
if (buf->mm) {
nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence);
nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence_wr);
}
nouveau_bufctx_reset(nv50->bufctx, 0);
}
static void
nv50_clear_buffer(struct pipe_context *pipe,
struct pipe_resource *res,
@@ -643,9 +719,22 @@ nv50_clear_buffer(struct pipe_context *pipe,
assert(size % data_size == 0);
if (offset & 0xff) {
unsigned fixup_size = MIN2(size, align(offset, 0x100) - offset);
assert(fixup_size % data_size == 0);
nv50_clear_buffer_push(pipe, res, offset, fixup_size, data, data_size);
offset += fixup_size;
size -= fixup_size;
if (!size)
return;
}
elements = size / data_size;
height = (elements + 8191) / 8192;
width = elements / height;
if (height > 1)
width &= ~0xff;
assert(width > 0);
BEGIN_NV04(push, NV50_3D(CLEAR_COLOR(0)), 4);
PUSH_DATAf(push, color.f[0]);
@@ -669,13 +758,13 @@ nv50_clear_buffer(struct pipe_context *pipe,
BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1);
PUSH_DATA (push, 1);
BEGIN_NV04(push, NV50_3D(RT_ADDRESS_HIGH(0)), 5);
PUSH_DATAh(push, buf->bo->offset + buf->offset + offset);
PUSH_DATA (push, buf->bo->offset + buf->offset + offset);
PUSH_DATAh(push, buf->address + offset);
PUSH_DATA (push, buf->address + offset);
PUSH_DATA (push, nv50_format_table[dst_fmt].rt);
PUSH_DATA (push, 0);
PUSH_DATA (push, 0);
BEGIN_NV04(push, NV50_3D(RT_HORIZ(0)), 2);
PUSH_DATA (push, NV50_3D_RT_HORIZ_LINEAR | (width * data_size));
PUSH_DATA (push, NV50_3D_RT_HORIZ_LINEAR | align(width * data_size, 0x100));
PUSH_DATA (push, height);
BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1);
PUSH_DATA (push, 0);
@@ -694,25 +783,20 @@ nv50_clear_buffer(struct pipe_context *pipe,
BEGIN_NI04(push, NV50_3D(CLEAR_BUFFERS), 1);
PUSH_DATA (push, 0x3c);
if (width * height != elements) {
offset += width * height * data_size;
width = elements - width * height;
height = 1;
BEGIN_NV04(push, NV50_3D(RT_ADDRESS_HIGH(0)), 2);
PUSH_DATAh(push, buf->bo->offset + buf->offset + offset);
PUSH_DATA (push, buf->bo->offset + buf->offset + offset);
BEGIN_NV04(push, NV50_3D(RT_HORIZ(0)), 2);
PUSH_DATA (push, NV50_3D_RT_HORIZ_LINEAR | (width * data_size));
PUSH_DATA (push, height);
BEGIN_NI04(push, NV50_3D(CLEAR_BUFFERS), 1);
PUSH_DATA (push, 0x3c);
}
BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
PUSH_DATA (push, nv50->cond_condmode);
nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence);
nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence_wr);
if (buf->mm) {
nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence);
nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence_wr);
}
if (width * height != elements) {
offset += width * height * data_size;
width = elements - width * height;
nv50_clear_buffer_push(pipe, res, offset, width * data_size,
data, data_size);
}
nv50->dirty |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
}

View File

@@ -357,27 +357,132 @@ nvc0_clear_render_target(struct pipe_context *pipe,
}
static void
nvc0_clear_buffer_cpu(struct pipe_context *pipe,
struct pipe_resource *res,
unsigned offset, unsigned size,
const void *data, int data_size)
nvc0_clear_buffer_push_nvc0(struct pipe_context *pipe,
struct pipe_resource *res,
unsigned offset, unsigned size,
const void *data, int data_size)
{
struct nvc0_context *nvc0 = nvc0_context(pipe);
struct nouveau_pushbuf *push = nvc0->base.pushbuf;
struct nv04_resource *buf = nv04_resource(res);
struct pipe_transfer *pt;
struct pipe_box box;
unsigned elements, i;
unsigned i;
elements = size / data_size;
nouveau_bufctx_refn(nvc0->bufctx, 0, buf->bo, buf->domain | NOUVEAU_BO_WR);
nouveau_pushbuf_bufctx(push, nvc0->bufctx);
nouveau_pushbuf_validate(push);
u_box_1d(offset, size, &box);
unsigned count = (size + 3) / 4;
unsigned data_words = data_size / 4;
uint8_t *map = buf->vtbl->transfer_map(pipe, res, 0, PIPE_TRANSFER_WRITE,
&box, &pt);
while (count) {
unsigned nr_data = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN) / data_words;
unsigned nr = nr_data * data_words;
for (i = 0; i < elements; ++i)
memcpy(&map[i*data_size], data, data_size);
if (!PUSH_SPACE(push, nr + 9))
break;
buf->vtbl->transfer_unmap(pipe, pt);
BEGIN_NVC0(push, NVC0_M2MF(OFFSET_OUT_HIGH), 2);
PUSH_DATAh(push, buf->address + offset);
PUSH_DATA (push, buf->address + offset);
BEGIN_NVC0(push, NVC0_M2MF(LINE_LENGTH_IN), 2);
PUSH_DATA (push, MIN2(size, nr * 4));
PUSH_DATA (push, 1);
BEGIN_NVC0(push, NVC0_M2MF(EXEC), 1);
PUSH_DATA (push, 0x100111);
/* must not be interrupted (trap on QUERY fence, 0x50 works however) */
BEGIN_NIC0(push, NVC0_M2MF(DATA), nr);
for (i = 0; i < nr_data; i++)
PUSH_DATAp(push, data, data_words);
count -= nr;
offset += nr * 4;
size -= nr * 4;
}
if (buf->mm) {
nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence);
nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr);
}
nouveau_bufctx_reset(nvc0->bufctx, 0);
}
/* Clear [offset, offset+size) of a buffer on NVE4+ by uploading the
 * replicated clear value through the P2MF upload path.
 *
 * Assumes data_size is a multiple of 4: 1- and 2-byte patterns are widened
 * to a full dword by nvc0_clear_buffer_push() before dispatching here.
 */
static void
nvc0_clear_buffer_push_nve4(struct pipe_context *pipe,
struct pipe_resource *res,
unsigned offset, unsigned size,
const void *data, int data_size)
{
struct nvc0_context *nvc0 = nvc0_context(pipe);
struct nouveau_pushbuf *push = nvc0->base.pushbuf;
struct nv04_resource *buf = nv04_resource(res);
unsigned i;
nouveau_bufctx_refn(nvc0->bufctx, 0, buf->bo, buf->domain | NOUVEAU_BO_WR);
nouveau_pushbuf_bufctx(push, nvc0->bufctx);
nouveau_pushbuf_validate(push);
/* Total dwords to write, and dwords per repetition of the pattern. */
unsigned count = (size + 3) / 4;
unsigned data_words = data_size / 4;
/* Emit in chunks of at most one method packet, each a whole number of
 * pattern repetitions, advancing the destination address as we go. */
while (count) {
unsigned nr_data = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN) / data_words;
unsigned nr = nr_data * data_words;
if (!PUSH_SPACE(push, nr + 10))
break;
BEGIN_NVC0(push, NVE4_P2MF(UPLOAD_DST_ADDRESS_HIGH), 2);
PUSH_DATAh(push, buf->address + offset);
PUSH_DATA (push, buf->address + offset);
BEGIN_NVC0(push, NVE4_P2MF(UPLOAD_LINE_LENGTH_IN), 2);
PUSH_DATA (push, MIN2(size, nr * 4));
PUSH_DATA (push, 1);
/* must not be interrupted (trap on QUERY fence, 0x50 works however) */
BEGIN_1IC0(push, NVE4_P2MF(UPLOAD_EXEC), nr + 1);
PUSH_DATA (push, 0x1001);
for (i = 0; i < nr_data; i++)
PUSH_DATAp(push, data, data_words);
count -= nr;
offset += nr * 4;
size -= nr * 4;
}
/* Only suballocated buffers need fencing against reuse of the backing bo. */
if (buf->mm) {
nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence);
nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr);
}
nouveau_bufctx_reset(nvc0->bufctx, 0);
}
/* Clear a buffer range via a pushbuf data upload rather than the RT clear
 * path.
 *
 * 1- and 2-byte clear patterns are first replicated into a full 32-bit
 * word so the upload helpers can operate purely in dword units; the call is
 * then routed to the M2MF (pre-NVE4) or P2MF (NVE4+) implementation based
 * on the screen's 3D class.
 */
static void
nvc0_clear_buffer_push(struct pipe_context *pipe,
                       struct pipe_resource *res,
                       unsigned offset, unsigned size,
                       const void *data, int data_size)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   unsigned replicated;

   switch (data_size) {
   case 1:
      /* Broadcast the byte into all four lanes of a dword. */
      replicated = *(unsigned char *)data * 0x01010101u;
      data = &replicated;
      data_size = 4;
      break;
   case 2:
      /* Duplicate the halfword into both halves of a dword. */
      replicated = *(unsigned short *)data * 0x00010001u;
      data = &replicated;
      data_size = 4;
      break;
   default:
      /* Already a multiple of 4 bytes; pass through unchanged. */
      break;
   }

   if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
      nvc0_clear_buffer_push_nve4(pipe, res, offset, size, data, data_size);
   else
      nvc0_clear_buffer_push_nvc0(pipe, res, offset, size, data, data_size);
}
static void
@@ -402,10 +507,8 @@ nvc0_clear_buffer(struct pipe_context *pipe,
memcpy(&color.ui, data, 16);
break;
case 12:
/* This doesn't work, RGB32 is not a valid RT format.
* dst_fmt = PIPE_FORMAT_R32G32B32_UINT;
* memcpy(&color.ui, data, 12);
* memset(&color.ui[3], 0, 4);
/* RGB32 is not a valid RT format. This will be handled by the pushbuf
* uploader.
*/
break;
case 8:
@@ -437,14 +540,26 @@ nvc0_clear_buffer(struct pipe_context *pipe,
assert(size % data_size == 0);
if (data_size == 12) {
/* TODO: Find a way to do this with the GPU! */
nvc0_clear_buffer_cpu(pipe, res, offset, size, data, data_size);
nvc0_clear_buffer_push(pipe, res, offset, size, data, data_size);
return;
}
if (offset & 0xff) {
unsigned fixup_size = MIN2(size, align(offset, 0x100) - offset);
assert(fixup_size % data_size == 0);
nvc0_clear_buffer_push(pipe, res, offset, fixup_size, data, data_size);
offset += fixup_size;
size -= fixup_size;
if (!size)
return;
}
elements = size / data_size;
height = (elements + 16383) / 16384;
width = elements / height;
if (height > 1)
width &= ~0xff;
assert(width > 0);
if (!PUSH_SPACE(push, 40))
return;
@@ -465,7 +580,7 @@ nvc0_clear_buffer(struct pipe_context *pipe,
BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(0)), 9);
PUSH_DATAh(push, buf->address + offset);
PUSH_DATA (push, buf->address + offset);
PUSH_DATA (push, width * data_size);
PUSH_DATA (push, align(width * data_size, 0x100));
PUSH_DATA (push, height);
PUSH_DATA (push, nvc0_format_table[dst_fmt].rt);
PUSH_DATA (push, NVC0_3D_RT_TILE_MODE_LINEAR);
@@ -480,24 +595,20 @@ nvc0_clear_buffer(struct pipe_context *pipe,
IMMED_NVC0(push, NVC0_3D(CLEAR_BUFFERS), 0x3c);
IMMED_NVC0(push, NVC0_3D(COND_MODE), nvc0->cond_condmode);
if (buf->mm) {
nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence);
nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr);
}
if (width * height != elements) {
offset += width * height * data_size;
width = elements - width * height;
height = 1;
BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(0)), 4);
PUSH_DATAh(push, buf->address + offset);
PUSH_DATA (push, buf->address + offset);
PUSH_DATA (push, width * data_size);
PUSH_DATA (push, height);
IMMED_NVC0(push, NVC0_3D(CLEAR_BUFFERS), 0x3c);
nvc0_clear_buffer_push(pipe, res, offset, width * data_size,
data, data_size);
}
IMMED_NVC0(push, NVC0_3D(COND_MODE), nvc0->cond_condmode);
nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence);
nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr);
nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
}