panfrost: Move instancing routines to encoder/
Nothing Gallium specific or stateful about them. Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
This commit is contained in:
parent
8a57672673
commit
31305e1b28
|
@ -26,221 +26,6 @@
|
|||
#include "pan_bo.h"
|
||||
#include "pan_context.h"
|
||||
|
||||
/* See mali_job for notes on how this works. But basically, for small vertex
|
||||
* counts, we have a lookup table, and for large vertex counts, we look at the
|
||||
* high bits as a heuristic. This has to match exactly how the hardware
|
||||
* calculates this (which is why the algorithm is so weird) or else instancing
|
||||
* will break. */
|
||||
|
||||
/* Given an odd number (of the form 2k + 1), compute k. The argument is
 * parenthesized so that an expression built from low-precedence operators
 * (e.g. ODD(a | b)) is evaluated as a whole before the subtraction. */
#define ODD(odd) (((odd) - 1) >> 1)
|
||||
|
||||
/* Given the shift/odd pair, recover the original padded integer */
|
||||
|
||||
unsigned
|
||||
pan_expand_shift_odd(struct pan_shift_odd o)
|
||||
{
|
||||
unsigned odd = 2*o.odd + 1;
|
||||
unsigned shift = 1 << o.shift;
|
||||
return odd * shift;
|
||||
}
|
||||
|
||||
static inline struct pan_shift_odd
|
||||
pan_factored(unsigned pot, unsigned odd)
|
||||
{
|
||||
struct pan_shift_odd out;
|
||||
|
||||
assert(util_is_power_of_two_or_zero(pot));
|
||||
assert(odd & 1);
|
||||
|
||||
/* Odd is of the form (2k + 1) = (k << 1) + 1 = (k << 1) | 1.
|
||||
*
|
||||
* So (odd >> 1) = ((k << 1) | 1) >> 1 = ((k << 1) >> 1) | (1 >> 1)
|
||||
* = k | 0 = k */
|
||||
|
||||
out.odd = (odd >> 1);
|
||||
|
||||
/* POT is the form (1 << shift) */
|
||||
out.shift = __builtin_ctz(pot);
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
/* Lookup table for small vertex counts. Entry i describes the padded count
 * used for a vertex count of i, stored as a (power-of-two, odd) factor pair
 * whose product is the padded count (shown in the trailing comments).
 *
 * NOTE(review): the original notes said rounding depends on whether the
 * primitive takes a power-of-two argument (true for POINTS and LINES, false
 * for TRIANGLES, presumably true for QUADS). The table is indexed by vertex
 * count alone and does not take the primitive into account. */

static const struct {
        unsigned pot;
        unsigned odd;
} small_lut[] = {
        {  0, 1 }, /*  0 ->  0 */
        {  1, 1 }, /*  1 ->  1 */
        {  2, 1 }, /*  2 ->  2 */
        {  1, 3 }, /*  3 ->  3 */
        {  4, 1 }, /*  4 ->  4 */
        {  1, 5 }, /*  5 ->  5 */
        {  2, 3 }, /*  6 ->  6 */
        {  1, 7 }, /*  7 ->  7 */
        {  8, 1 }, /*  8 ->  8 */
        {  1, 9 }, /*  9 ->  9 */
        {  2, 5 }, /* 10 -> 10 */
        {  4, 3 }, /* 11 -> 12 */
        {  4, 3 }, /* 12 -> 12 */
        {  2, 7 }, /* 13 -> 14 */
        {  2, 7 }, /* 14 -> 14 */
        { 16, 1 }, /* 15 -> 16 */
        { 16, 1 }, /* 16 -> 16 */
        {  2, 9 }, /* 17 -> 18 */
        {  4, 5 }, /* 18 -> 20 */
        {  4, 5 }  /* 19 -> 20 */
};
|
||||
|
||||
static struct pan_shift_odd
|
||||
panfrost_small_padded_vertex_count(unsigned idx)
|
||||
{
|
||||
return pan_factored(
|
||||
small_lut[idx].pot,
|
||||
small_lut[idx].odd);
|
||||
}
|
||||
|
||||
static struct pan_shift_odd
|
||||
panfrost_large_padded_vertex_count(uint32_t vertex_count)
|
||||
{
|
||||
struct pan_shift_odd out = { 0 };
|
||||
|
||||
/* First, we have to find the highest set one */
|
||||
unsigned highest = 32 - __builtin_clz(vertex_count);
|
||||
|
||||
/* Using that, we mask out the highest 4-bits */
|
||||
unsigned n = highest - 4;
|
||||
unsigned nibble = (vertex_count >> n) & 0xF;
|
||||
|
||||
/* Great, we have the nibble. Now we can just try possibilities. Note
|
||||
* that we don't care about the bottom most bit in most cases, and we
|
||||
* know the top bit must be 1 */
|
||||
|
||||
unsigned middle_two = (nibble >> 1) & 0x3;
|
||||
|
||||
switch (middle_two) {
|
||||
case 0b00:
|
||||
if (nibble & 1)
|
||||
return pan_factored(1 << n, 9);
|
||||
else
|
||||
return pan_factored(1 << (n + 1), 5);
|
||||
case 0b01:
|
||||
return pan_factored(1 << (n + 2), 3);
|
||||
case 0b10:
|
||||
return pan_factored(1 << (n + 1), 7);
|
||||
case 0b11:
|
||||
return pan_factored(1 << (n + 4), 1);
|
||||
default:
|
||||
unreachable("Invalid two bits");
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
struct pan_shift_odd
|
||||
panfrost_padded_vertex_count(
|
||||
unsigned vertex_count,
|
||||
bool pot)
|
||||
{
|
||||
assert(vertex_count > 0);
|
||||
|
||||
if (vertex_count < 20) {
|
||||
/* Add an off-by-one if it won't align naturally (quirk of the hardware) */
|
||||
//if (!pot)
|
||||
// vertex_count++;
|
||||
|
||||
return panfrost_small_padded_vertex_count(vertex_count);
|
||||
} else
|
||||
return panfrost_large_padded_vertex_count(vertex_count);
|
||||
}
|
||||
|
||||
/* The much, much more irritating case -- instancing is enabled. See
|
||||
* panfrost_job.h for notes on how this works */
|
||||
|
||||
static unsigned
|
||||
panfrost_vertex_instanced(
|
||||
unsigned padded_count,
|
||||
unsigned instance_shift, unsigned instance_odd,
|
||||
unsigned divisor,
|
||||
union mali_attr *attrs)
|
||||
{
|
||||
/* Depending if there is an instance divisor or not, packing varies.
|
||||
* When there is a divisor, the hardware-level divisor is actually the
|
||||
* product of the instance divisor and the padded count */
|
||||
|
||||
unsigned hw_divisor = padded_count * divisor;
|
||||
|
||||
if (divisor == 0) {
|
||||
/* Per-vertex attributes use the MODULO mode. First, compute
|
||||
* the modulus */
|
||||
|
||||
attrs->elements |= MALI_ATTR_MODULO;
|
||||
attrs->shift = instance_shift;
|
||||
attrs->extra_flags = instance_odd;
|
||||
|
||||
return 1;
|
||||
} else if (util_is_power_of_two_or_zero(hw_divisor)) {
|
||||
/* If there is a divisor but the hardware divisor works out to
|
||||
* a power of two (not terribly exceptional), we can use an
|
||||
* easy path (just shifting) */
|
||||
|
||||
attrs->elements |= MALI_ATTR_POT_DIVIDE;
|
||||
attrs->shift = __builtin_ctz(hw_divisor);
|
||||
|
||||
return 1;
|
||||
} else {
|
||||
/* We have a NPOT divisor. Here's the fun one (multipling by
|
||||
* the inverse and shifting) */
|
||||
|
||||
/* floor(log2(d)) */
|
||||
unsigned shift = util_logbase2(hw_divisor);
|
||||
|
||||
/* m = ceil(2^(32 + shift) / d) */
|
||||
uint64_t shift_hi = 32 + shift;
|
||||
uint64_t t = 1ll << shift_hi;
|
||||
double t_f = t;
|
||||
double hw_divisor_d = hw_divisor;
|
||||
double m_f = ceil(t_f / hw_divisor_d);
|
||||
unsigned m = m_f;
|
||||
|
||||
/* Default case */
|
||||
uint32_t magic_divisor = m, extra_flags = 0;
|
||||
|
||||
/* e = 2^(shift + 32) % d */
|
||||
uint64_t e = t % hw_divisor;
|
||||
|
||||
/* Apply round-down algorithm? e <= 2^shift?. XXX: The blob
|
||||
* seems to use a different condition */
|
||||
if (e <= (1ll << shift)) {
|
||||
magic_divisor = m - 1;
|
||||
extra_flags = 1;
|
||||
}
|
||||
|
||||
/* Top flag implicitly set */
|
||||
assert(magic_divisor & (1u << 31));
|
||||
magic_divisor &= ~(1u << 31);
|
||||
|
||||
/* Upload to two different slots */
|
||||
|
||||
attrs[0].elements |= MALI_ATTR_NPOT_DIVIDE;
|
||||
attrs[0].shift = shift;
|
||||
attrs[0].extra_flags = extra_flags;
|
||||
|
||||
attrs[1].unk = 0x20;
|
||||
attrs[1].magic_divisor = magic_divisor;
|
||||
attrs[1].zero = 0;
|
||||
attrs[1].divisor = divisor;
|
||||
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
panfrost_emit_vertex_data(struct panfrost_batch *batch)
|
||||
{
|
||||
|
|
|
@ -1551,16 +1551,8 @@ panfrost_draw_vbo(
|
|||
/* Encode the padded vertex count */
|
||||
|
||||
if (info->instance_count > 1) {
|
||||
/* Triangles have non-even vertex counts so they change how
|
||||
* padding works internally */
|
||||
|
||||
bool is_triangle =
|
||||
mode == PIPE_PRIM_TRIANGLES ||
|
||||
mode == PIPE_PRIM_TRIANGLE_STRIP ||
|
||||
mode == PIPE_PRIM_TRIANGLE_FAN;
|
||||
|
||||
struct pan_shift_odd so =
|
||||
panfrost_padded_vertex_count(vertex_count, !is_triangle);
|
||||
panfrost_padded_vertex_count(vertex_count);
|
||||
|
||||
ctx->payloads[PIPE_SHADER_VERTEX].instance_shift = so.shift;
|
||||
ctx->payloads[PIPE_SHADER_FRAGMENT].instance_shift = so.shift;
|
||||
|
|
|
@ -336,20 +336,6 @@ panfrost_vertex_buffer_address(struct panfrost_context *ctx, unsigned i);
|
|||
void
|
||||
panfrost_emit_vertex_data(struct panfrost_batch *batch);
|
||||
|
||||
/* A padded count in factored form: the count is (2 * odd + 1) << shift */
struct pan_shift_odd {
        /* log2 of the power-of-two factor */
        unsigned shift;

        /* k, where the odd factor is 2k + 1 */
        unsigned odd;
};

/* Padded vertex count for the given vertex count, in shift/odd form */
struct pan_shift_odd
panfrost_padded_vertex_count(unsigned vertex_count, bool primitive_pot);

/* Expand a shift/odd pair back into the integer it encodes */
unsigned
pan_expand_shift_odd(struct pan_shift_odd o);
|
||||
|
||||
/* Compute */
|
||||
|
||||
void
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
libpanfrost_encoder_files = files(
|
||||
'pan_encoder.h',
|
||||
|
||||
'pan_attributes.c',
|
||||
'pan_invocation.c',
|
||||
'pan_tiler.c',
|
||||
'pan_scratch.c',
|
||||
|
|
|
@ -0,0 +1,235 @@
|
|||
/*
|
||||
* Copyright (C) 2019 Collabora, Ltd.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "util/u_math.h"
|
||||
#include "panfrost-job.h"
|
||||
#include "pan_encoder.h"
|
||||
|
||||
/* This file handles attribute descriptors (mali_attr_meta). The
|
||||
* bulk of the complexity is from instancing. See mali_job for
|
||||
* notes on how this works. But basically, for small vertex
|
||||
* counts, we have a lookup table, and for large vertex counts,
|
||||
* we look at the high bits as a heuristic. This has to match
|
||||
* exactly how the hardware calculates this (which is why the
|
||||
* algorithm is so weird) or else instancing will break. */
|
||||
|
||||
/* Given an odd number (of the form 2k + 1), compute k. The argument is
 * parenthesized so that an expression built from low-precedence operators
 * (e.g. ODD(a | b)) is evaluated as a whole before the subtraction. */
#define ODD(odd) (((odd) - 1) >> 1)
|
||||
|
||||
/* Given the shift/odd pair, recover the original padded integer */
|
||||
|
||||
unsigned
|
||||
pan_expand_shift_odd(struct pan_shift_odd o)
|
||||
{
|
||||
unsigned odd = 2*o.odd + 1;
|
||||
unsigned shift = 1 << o.shift;
|
||||
return odd * shift;
|
||||
}
|
||||
|
||||
static inline struct pan_shift_odd
|
||||
pan_factored(unsigned pot, unsigned odd)
|
||||
{
|
||||
struct pan_shift_odd out;
|
||||
|
||||
assert(util_is_power_of_two_or_zero(pot));
|
||||
assert(odd & 1);
|
||||
|
||||
/* Odd is of the form (2k + 1) = (k << 1) + 1 = (k << 1) | 1.
|
||||
*
|
||||
* So (odd >> 1) = ((k << 1) | 1) >> 1 = ((k << 1) >> 1) | (1 >> 1)
|
||||
* = k | 0 = k */
|
||||
|
||||
out.odd = (odd >> 1);
|
||||
|
||||
/* POT is the form (1 << shift) */
|
||||
out.shift = __builtin_ctz(pot);
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
/* Lookup table for small vertex counts. Entry i describes the padded count
 * used for a vertex count of i, stored as a (power-of-two, odd) factor pair
 * whose product is the padded count (shown in the trailing comments).
 *
 * NOTE(review): the original notes said rounding depends on whether the
 * primitive takes a power-of-two argument (true for POINTS and LINES, false
 * for TRIANGLES, presumably true for QUADS). The table is indexed by vertex
 * count alone and does not take the primitive into account. */

static const struct {
        unsigned pot;
        unsigned odd;
} small_lut[] = {
        {  0, 1 }, /*  0 ->  0 */
        {  1, 1 }, /*  1 ->  1 */
        {  2, 1 }, /*  2 ->  2 */
        {  1, 3 }, /*  3 ->  3 */
        {  4, 1 }, /*  4 ->  4 */
        {  1, 5 }, /*  5 ->  5 */
        {  2, 3 }, /*  6 ->  6 */
        {  1, 7 }, /*  7 ->  7 */
        {  8, 1 }, /*  8 ->  8 */
        {  1, 9 }, /*  9 ->  9 */
        {  2, 5 }, /* 10 -> 10 */
        {  4, 3 }, /* 11 -> 12 */
        {  4, 3 }, /* 12 -> 12 */
        {  2, 7 }, /* 13 -> 14 */
        {  2, 7 }, /* 14 -> 14 */
        { 16, 1 }, /* 15 -> 16 */
        { 16, 1 }, /* 16 -> 16 */
        {  2, 9 }, /* 17 -> 18 */
        {  4, 5 }, /* 18 -> 20 */
        {  4, 5 }  /* 19 -> 20 */
};
|
||||
|
||||
static struct pan_shift_odd
|
||||
panfrost_small_padded_vertex_count(unsigned idx)
|
||||
{
|
||||
return pan_factored(
|
||||
small_lut[idx].pot,
|
||||
small_lut[idx].odd);
|
||||
}
|
||||
|
||||
static struct pan_shift_odd
|
||||
panfrost_large_padded_vertex_count(uint32_t vertex_count)
|
||||
{
|
||||
struct pan_shift_odd out = { 0 };
|
||||
|
||||
/* First, we have to find the highest set one */
|
||||
unsigned highest = 32 - __builtin_clz(vertex_count);
|
||||
|
||||
/* Using that, we mask out the highest 4-bits */
|
||||
unsigned n = highest - 4;
|
||||
unsigned nibble = (vertex_count >> n) & 0xF;
|
||||
|
||||
/* Great, we have the nibble. Now we can just try possibilities. Note
|
||||
* that we don't care about the bottom most bit in most cases, and we
|
||||
* know the top bit must be 1 */
|
||||
|
||||
unsigned middle_two = (nibble >> 1) & 0x3;
|
||||
|
||||
switch (middle_two) {
|
||||
case 0b00:
|
||||
if (nibble & 1)
|
||||
return pan_factored(1 << n, 9);
|
||||
else
|
||||
return pan_factored(1 << (n + 1), 5);
|
||||
case 0b01:
|
||||
return pan_factored(1 << (n + 2), 3);
|
||||
case 0b10:
|
||||
return pan_factored(1 << (n + 1), 7);
|
||||
case 0b11:
|
||||
default: /* unreachable */
|
||||
return pan_factored(1 << (n + 4), 1);
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
struct pan_shift_odd
|
||||
panfrost_padded_vertex_count(unsigned vertex_count)
|
||||
{
|
||||
if (vertex_count < 20)
|
||||
return panfrost_small_padded_vertex_count(vertex_count);
|
||||
else
|
||||
return panfrost_large_padded_vertex_count(vertex_count);
|
||||
}
|
||||
|
||||
/* The much, much more irritating case -- instancing is enabled. See
|
||||
* panfrost_job.h for notes on how this works */
|
||||
|
||||
unsigned
|
||||
panfrost_vertex_instanced(
|
||||
unsigned padded_count,
|
||||
unsigned instance_shift, unsigned instance_odd,
|
||||
unsigned divisor,
|
||||
union mali_attr *attrs)
|
||||
{
|
||||
/* Depending if there is an instance divisor or not, packing varies.
|
||||
* When there is a divisor, the hardware-level divisor is actually the
|
||||
* product of the instance divisor and the padded count */
|
||||
|
||||
unsigned hw_divisor = padded_count * divisor;
|
||||
|
||||
if (divisor == 0) {
|
||||
/* Per-vertex attributes use the MODULO mode. First, compute
|
||||
* the modulus */
|
||||
|
||||
attrs->elements |= MALI_ATTR_MODULO;
|
||||
attrs->shift = instance_shift;
|
||||
attrs->extra_flags = instance_odd;
|
||||
|
||||
return 1;
|
||||
} else if (util_is_power_of_two_or_zero(hw_divisor)) {
|
||||
/* If there is a divisor but the hardware divisor works out to
|
||||
* a power of two (not terribly exceptional), we can use an
|
||||
* easy path (just shifting) */
|
||||
|
||||
attrs->elements |= MALI_ATTR_POT_DIVIDE;
|
||||
attrs->shift = __builtin_ctz(hw_divisor);
|
||||
|
||||
return 1;
|
||||
} else {
|
||||
/* We have a NPOT divisor. Here's the fun one (multipling by
|
||||
* the inverse and shifting) */
|
||||
|
||||
/* floor(log2(d)) */
|
||||
unsigned shift = util_logbase2(hw_divisor);
|
||||
|
||||
/* m = ceil(2^(32 + shift) / d) */
|
||||
uint64_t shift_hi = 32 + shift;
|
||||
uint64_t t = 1ll << shift_hi;
|
||||
double t_f = t;
|
||||
double hw_divisor_d = hw_divisor;
|
||||
double m_f = ceil(t_f / hw_divisor_d);
|
||||
unsigned m = m_f;
|
||||
|
||||
/* Default case */
|
||||
uint32_t magic_divisor = m, extra_flags = 0;
|
||||
|
||||
/* e = 2^(shift + 32) % d */
|
||||
uint64_t e = t % hw_divisor;
|
||||
|
||||
/* Apply round-down algorithm? e <= 2^shift?. XXX: The blob
|
||||
* seems to use a different condition */
|
||||
if (e <= (1ll << shift)) {
|
||||
magic_divisor = m - 1;
|
||||
extra_flags = 1;
|
||||
}
|
||||
|
||||
/* Top flag implicitly set */
|
||||
assert(magic_divisor & (1u << 31));
|
||||
magic_divisor &= ~(1u << 31);
|
||||
|
||||
/* Upload to two different slots */
|
||||
|
||||
attrs[0].elements |= MALI_ATTR_NPOT_DIVIDE;
|
||||
attrs[0].shift = shift;
|
||||
attrs[0].extra_flags = extra_flags;
|
||||
|
||||
attrs[1].unk = 0x20;
|
||||
attrs[1].magic_divisor = magic_divisor;
|
||||
attrs[1].zero = 0;
|
||||
attrs[1].divisor = divisor;
|
||||
|
||||
return 2;
|
||||
}
|
||||
}
|
|
@ -86,4 +86,24 @@ unsigned panfrost_query_thread_tls_alloc(int fd);
|
|||
|
||||
const char * panfrost_model_name(unsigned gpu_id);
|
||||
|
||||
/* Attributes / instancing */

/* A padded count in factored form: the count is (2 * odd + 1) << shift */
struct pan_shift_odd {
        /* log2 of the power-of-two factor */
        unsigned shift;

        /* k, where the odd factor is 2k + 1 */
        unsigned odd;
};

/* Padded vertex count for the given vertex count, in shift/odd form */
struct pan_shift_odd
panfrost_padded_vertex_count(unsigned vertex_count);

/* Write the instancing attribute record(s) for the given divisor to attrs and
 * return the number of records written */
unsigned
panfrost_vertex_instanced(
        unsigned padded_count,
        unsigned instance_shift, unsigned instance_odd,
        unsigned divisor,
        union mali_attr *attrs);

/* Expand a shift/odd pair back into the integer it encodes */
unsigned
pan_expand_shift_odd(struct pan_shift_odd o);
|
||||
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue