mesa/src/amd/llvm/ac_llvm_cull.c

360 lines
16 KiB
C

/*
* Copyright 2019 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
*/
#include "ac_llvm_cull.h"
#include <llvm-c/Core.h>
/* Facts about the W components of a primitive's vertex positions.
 * Filled in by ac_analyze_position_w() and read by the culling helpers in
 * this file. Every member is an i1 value.
 */
struct ac_position_w_info {
/* If a primitive intersects the W=0 plane, it causes a reflection
 * of the determinant used for face culling. Every vertex behind
 * the W=0 plane negates the determinant, so having 2 vertices behind
 * the plane has no effect. This is i1 true if the determinant should be
 * negated, i.e. when an odd number of vertices have W < 0.
 */
LLVMValueRef w_reflection;
/* If we simplify the "-w <= p <= w" view culling equation, we get
 * "-w <= w", which can't be satisfied when w is negative.
 * In perspective projection, a negative W means that the primitive
 * is behind the viewer, but the equation is independent of the type
 * of projection.
 *
 * w_accepted is false when all W are negative and therefore
 * the primitive is invisible. It is true in every other case.
 */
LLVMValueRef w_accepted;
/* True when at least one vertex has W < 0. The bounding box culling
 * doesn't work in that case and its result is overridden (the primitive
 * is accepted) when this is true.
 */
LLVMValueRef any_w_negative;
};
/* Classify the W components of the first num_vertices vertices of a primitive.
 *
 * One "W < 0" comparison is emitted per vertex and the results are folded
 * into *w:
 *   - w_reflection:   XOR of the per-vertex results (odd count of negative W),
 *   - any_w_negative: OR of the per-vertex results,
 *   - w_accepted:     NOT of the AND of the per-vertex results.
 *
 * The comparison is an ordered less-than, so a NaN W is not counted as
 * negative.
 */
static void ac_analyze_position_w(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
                                  struct ac_position_w_info *w, unsigned num_vertices)
{
   LLVMBuilderRef b = ctx->builder;
   LLVMValueRef odd_behind = ctx->i1false;
   LLVMValueRef some_behind = ctx->i1false;
   LLVMValueRef all_behind = ctx->i1true;

   for (unsigned vtx = 0; vtx != num_vertices; ++vtx) {
      LLVMValueRef behind = LLVMBuildFCmp(b, LLVMRealOLT, pos[vtx][3], ctx->f32_0, "");

      /* Each vertex behind the W=0 plane toggles the reflection flag. */
      odd_behind = LLVMBuildXor(b, odd_behind, behind, "");
      some_behind = LLVMBuildOr(b, some_behind, behind, "");
      all_behind = LLVMBuildAnd(b, all_behind, behind, "");
   }

   w->w_reflection = odd_behind;
   w->any_w_negative = some_behind;
   /* The primitive is only rejected by W when every vertex has W < 0. */
   w->w_accepted = LLVMBuildNot(b, all_behind, "");
}
/* Build the face culling / zero-area test for a triangle.
 *
 * Returns an i1 that is true if the primitive is accepted:
 *   - cull_front && cull_back: constant false, nothing else is emitted.
 *   - no option enabled:       constant true, nothing else is emitted.
 *   - otherwise the sign of a 2D determinant of the XY positions decides.
 *     With cull_front the determinant must be > 0 (>= 0 if zero-area
 *     primitives are kept), with cull_back it must be < 0 (<= 0), and with
 *     only cull_zero_area it must be != 0.
 *
 * The determinant is negated first when w->w_reflection is true.
 */
static LLVMValueRef ac_cull_face(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
                                 struct ac_position_w_info *w, bool cull_front, bool cull_back,
                                 bool cull_zero_area)
{
   /* Culling both faces rejects every primitive. */
   if (cull_front && cull_back) {
      return ctx->i1false;
   }

   /* Nothing to test: every primitive passes. */
   if (!(cull_front || cull_back || cull_zero_area)) {
      return ctx->i1true;
   }

   LLVMBuilderRef b = ctx->builder;

   /* Edge deltas that make up the determinant. A determinant of 0 means
    * the triangle area is 0.
    */
   LLVMValueRef dx_20 = LLVMBuildFSub(b, pos[2][0], pos[0][0], "");
   LLVMValueRef dy_10 = LLVMBuildFSub(b, pos[1][1], pos[0][1], "");
   LLVMValueRef dx_01 = LLVMBuildFSub(b, pos[0][0], pos[1][0], "");
   LLVMValueRef dy_02 = LLVMBuildFSub(b, pos[0][1], pos[2][1], "");

   /* dx_20 * dy_10 - dx_01 * dy_02 = fma(dx_01, -dy_02, dx_20 * dy_10) */
   LLVMValueRef neg_dy_02 = LLVMBuildFNeg(b, dy_02, "");
   LLVMValueRef product = LLVMBuildFMul(b, dx_20, dy_10, "");
   LLVMValueRef det = ac_build_fmad(ctx, dx_01, neg_dy_02, product);

   /* An odd number of vertices with negative W flips the sign. */
   LLVMValueRef flipped = LLVMBuildFNeg(b, det, "");
   det = LLVMBuildSelect(b, w->w_reflection, flipped, det, "");

   LLVMRealPredicate pred;
   if (cull_front) {
      pred = cull_zero_area ? LLVMRealOGT : LLVMRealOGE;
   } else if (cull_back) {
      pred = cull_zero_area ? LLVMRealOLT : LLVMRealOLE;
   } else {
      /* Only cull_zero_area can be set here because of the early return. */
      pred = LLVMRealONE;
   }

   LLVMValueRef accepted = LLVMBuildFCmp(b, pred, det, ctx->f32_0, "");

   /* NaN and +/-infinity determinants are tricky, so they are never
    * rejected here; fixed-function HW is trusted to handle those cases.
    */
   return LLVMBuildOr(b, accepted, ac_build_is_inf_or_nan(ctx, det), "");
}
/* Rotate the 2D vector v = (x, y) by 45 degrees in place:
 *
 *    x' = x*cos45 - y*sin45
 *    y' = x*sin45 + y*cos45
 *
 * sin(45) == cos(45), so a single constant is used for both.
 */
static void rotate_45degrees(struct ac_llvm_context *ctx, LLVMValueRef v[2])
{
   LLVMBuilderRef b = ctx->builder;
   LLVMValueRef k = LLVMConstReal(ctx->f32, 0.707106781);
   LLVMValueRef x = v[0];
   LLVMValueRef y = v[1];

   /* x*k is shared by both results. Doing 2x ffma while duplicating the
    * multiplication is 33% faster than fmul+fadd+fadd.
    */
   LLVMValueRef x_scaled = LLVMBuildFMul(b, x, k, "");
   LLVMValueRef neg_y = LLVMBuildFNeg(b, y, "");

   v[0] = ac_build_fmad(ctx, neg_y, k, x_scaled);
   v[1] = ac_build_fmad(ctx, y, k, x_scaled);
}
/* Perform view culling and small primitive elimination.
 *
 * Nothing is returned. The i1 result is handed to accept_func (when it is
 * non-NULL):
 *   - If none of the view / small primitive options is enabled, accept_func
 *     receives initially_accepted unchanged and no other code is emitted.
 *   - Otherwise the tests are emitted inside a branch that is only taken
 *     when initially_accepted is true, and accept_func is invoked inside
 *     that branch with the result of the tests.
 *
 * \param pos                   Vertex positions. X/Y (and Z when Z view
 *                              culling is enabled) of the first
 *                              options->num_vertices (2 or 3) vertices are read.
 * \param initially_accepted    i1 condition guarding the bounding box code.
 * \param w                     Result of ac_analyze_position_w. When
 *                              any_w_negative is true, the bounding box tests
 *                              are overridden and the primitive is accepted.
 * \param vp_scale              Viewport scale XY, used to convert positions
 *                              to screen space for small primitive culling.
 * \param vp_translate          Viewport translation XY, used likewise.
 * \param small_prim_precision  Amount by which the screen-space bounding box
 *                              is enlarged before rounding.
 * \param clip_half_line_width  Two entries (X, Y). A non-NULL entry enlarges
 *                              the bounding box on that axis by that amount
 *                              in both directions. Never indexed for Z.
 * \param options               See ac_cull_options.
 * \param accept_func           Callback receiving the result; may be NULL.
 * \param userdata              Passed through to accept_func.
 */
static void cull_bbox(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
                      LLVMValueRef initially_accepted, struct ac_position_w_info *w,
                      LLVMValueRef vp_scale[2], LLVMValueRef vp_translate[2],
                      LLVMValueRef small_prim_precision,
                      LLVMValueRef clip_half_line_width[2],
                      struct ac_cull_options *options,
                      ac_cull_accept_func accept_func, void *userdata)
{
   LLVMBuilderRef builder = ctx->builder;

   if (!options->cull_view_xy && !options->cull_view_near_z && !options->cull_view_far_z &&
       !options->cull_small_prims) {
      if (accept_func)
         accept_func(ctx, initially_accepted, userdata);
      return;
   }

   ac_build_ifcc(ctx, initially_accepted, 10000000);
   {
      LLVMValueRef bbox_min[3], bbox_max[3];
      LLVMValueRef accepted = ctx->i1true;
      /* Z is only computed when near or far Z view culling needs it. */
      const unsigned num_bbox_chans =
         (options->cull_view_near_z || options->cull_view_far_z) ? 3 : 2;

      assert(options->num_vertices >= 2);

      /* Compute the primitive bounding box for easy culling. */
      for (unsigned chan = 0; chan < num_bbox_chans; chan++) {
         bbox_min[chan] = ac_build_fmin(ctx, pos[0][chan], pos[1][chan]);
         bbox_max[chan] = ac_build_fmax(ctx, pos[0][chan], pos[1][chan]);

         if (options->num_vertices == 3) {
            bbox_min[chan] = ac_build_fmin(ctx, bbox_min[chan], pos[2][chan]);
            bbox_max[chan] = ac_build_fmax(ctx, bbox_max[chan], pos[2][chan]);
         }

         /* clip_half_line_width only has X and Y entries. Reading it for
          * chan == 2 would be an out-of-bounds access, so Z is skipped.
          */
         if (chan < 2 && clip_half_line_width[chan]) {
            bbox_min[chan] = LLVMBuildFSub(builder, bbox_min[chan], clip_half_line_width[chan], "");
            bbox_max[chan] = LLVMBuildFAdd(builder, bbox_max[chan], clip_half_line_width[chan], "");
         }
      }

      /* View culling. */
      if (options->cull_view_xy || options->cull_view_near_z || options->cull_view_far_z) {
         for (unsigned chan = 0; chan < 3; chan++) {
            LLVMValueRef visible;

            /* Lower bound: -1 for X/Y, and for Z either 0 (half-Z clip
             * space) or -1.
             */
            if ((options->cull_view_xy && chan <= 1) || (options->cull_view_near_z && chan == 2)) {
               float t = chan == 2 && options->use_halfz_clip_space ? 0 : -1;

               visible = LLVMBuildFCmp(builder, LLVMRealOGE, bbox_max[chan],
                                       LLVMConstReal(ctx->f32, t), "");
               accepted = LLVMBuildAnd(builder, accepted, visible, "");
            }

            /* Upper bound: 1 for all channels. */
            if ((options->cull_view_xy && chan <= 1) || (options->cull_view_far_z && chan == 2)) {
               visible = LLVMBuildFCmp(builder, LLVMRealOLE, bbox_min[chan], ctx->f32_1, "");
               accepted = LLVMBuildAnd(builder, accepted, visible, "");
            }
         }
      }

      /* Small primitive culling - triangles. */
      if (options->cull_small_prims && options->num_vertices == 3) {
         /* Assuming a sample position at (0.5, 0.5), if we round
          * the bounding box min/max extents and the results of
          * the rounding are equal in either the X or Y direction,
          * the bounding box does not intersect the sample.
          *
          * See these GDC slides for pictures:
          * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf
          */
         LLVMValueRef min, max, not_equal[2], visible;

         for (unsigned chan = 0; chan < 2; chan++) {
            /* Convert the position to screen-space coordinates. */
            min = ac_build_fmad(ctx, bbox_min[chan], vp_scale[chan], vp_translate[chan]);
            max = ac_build_fmad(ctx, bbox_max[chan], vp_scale[chan], vp_translate[chan]);
            /* Scale the bounding box according to the precision of
             * the rasterizer and the number of MSAA samples. */
            min = LLVMBuildFSub(builder, min, small_prim_precision, "");
            max = LLVMBuildFAdd(builder, max, small_prim_precision, "");

            /* Determine if the bbox intersects the sample point.
             * It also works for MSAA, but vp_scale, vp_translate,
             * and small_prim_precision are computed differently.
             */
            min = ac_build_round(ctx, min);
            max = ac_build_round(ctx, max);
            not_equal[chan] = LLVMBuildFCmp(builder, LLVMRealONE, min, max, "");
         }
         /* The triangle is kept only if the extents differ on both axes. */
         visible = LLVMBuildAnd(builder, not_equal[0], not_equal[1], "");
         accepted = LLVMBuildAnd(builder, accepted, visible, "");
      }

      /* Small primitive culling - lines. */
      if (options->cull_small_prims && options->num_vertices == 2) {
         /* This only works with lines without perpendicular end caps (lines with perpendicular
          * end caps are rasterized as quads and thus can't be culled as small prims in 99% of
          * cases because line_width >= 1).
          *
          * This takes advantage of the diamond exit rule, which says that every pixel
          * has a diamond inside it touching the pixel boundary and only if a line exits
          * the diamond, that pixel is filled. If a line enters the diamond or stays
          * outside the diamond, the pixel isn't filled.
          *
          * This algorithm is a little simpler than that. The space outside all diamonds also
          * has the same diamond shape, which we'll call corner diamonds.
          *
          * The idea is to cull all lines that are entirely inside a diamond, including
          * corner diamonds. If a line is entirely inside a diamond, it can be culled because
          * it doesn't exit it. If a line is entirely inside a corner diamond, it can be culled
          * because it doesn't enter any diamond and thus can't exit any diamond.
          *
          * The viewport is rotated by 45 degrees to turn diamonds into squares, and a bounding
          * box test is used to determine whether a line is entirely inside any square (diamond).
          *
          * The line width doesn't matter. Wide lines only duplicate filled pixels in either X or
          * Y direction from the filled pixels. MSAA also doesn't matter. MSAA should ideally use
          * perpendicular end caps that enable quad rasterization for lines. Thus, this should
          * always use non-MSAA viewport transformation and non-MSAA small prim precision.
          *
          * A good test is piglit/lineloop because it draws 10k subpixel lines in a circle.
          * It should contain no holes if this matches hw behavior.
          */
         LLVMValueRef v0[2], v1[2];

         /* Get vertex positions in pixels. */
         for (unsigned chan = 0; chan < 2; chan++) {
            v0[chan] = ac_build_fmad(ctx, pos[0][chan], vp_scale[chan], vp_translate[chan]);
            v1[chan] = ac_build_fmad(ctx, pos[1][chan], vp_scale[chan], vp_translate[chan]);
         }

         /* Rotate the viewport by 45 degrees, so that diamonds become squares. */
         rotate_45degrees(ctx, v0);
         rotate_45degrees(ctx, v1);

         LLVMValueRef not_equal[2];

         for (unsigned chan = 0; chan < 2; chan++) {
            /* The width of each square is sqrt(0.5), so scale it to 1 because we want
             * round() to give us the position of the closest center of a square (diamond).
             */
            v0[chan] = LLVMBuildFMul(builder, v0[chan], LLVMConstReal(ctx->f32, 1.414213562), "");
            v1[chan] = LLVMBuildFMul(builder, v1[chan], LLVMConstReal(ctx->f32, 1.414213562), "");

            /* Compute the bounding box around both vertices. We do this because we must
             * enlarge the line area by the precision of the rasterizer.
             */
            LLVMValueRef min = ac_build_fmin(ctx, v0[chan], v1[chan]);
            LLVMValueRef max = ac_build_fmax(ctx, v0[chan], v1[chan]);

            /* Enlarge the bounding box by the precision of the rasterizer. */
            min = LLVMBuildFSub(builder, min, small_prim_precision, "");
            max = LLVMBuildFAdd(builder, max, small_prim_precision, "");

            /* Round the bounding box corners. If both rounded corners are equal,
             * the bounding box is entirely inside a square (diamond).
             */
            min = ac_build_round(ctx, min);
            max = ac_build_round(ctx, max);
            not_equal[chan] = LLVMBuildFCmp(builder, LLVMRealONE, min, max, "");
         }

         /* The line is kept if the rounded corners differ on at least one axis. */
         accepted = LLVMBuildAnd(builder, accepted,
                                 LLVMBuildOr(builder, not_equal[0], not_equal[1], ""), "");
      }

      /* Disregard the bounding box culling if any W is negative because the code
       * doesn't work with that.
       */
      accepted = LLVMBuildOr(builder, accepted, w->any_w_negative, "");

      if (accept_func)
         accept_func(ctx, accepted, userdata);
   }
   ac_build_endif(ctx, 10000000);
}
/**
 * Emit the culling tests for one primitive.
 *
 * Nothing is returned; the i1 "accepted" result is delivered through
 * accept_func. W culling (if options->cull_w), initially_accepted and face
 * culling are AND'ed together first, and that value then guards the view
 * culling and small primitive elimination done by cull_bbox.
 *
 * \param ctx                   LLVM context whose builder receives the code.
 * \param pos                   Vertex positions 3x vec4
 * \param initially_accepted    AND'ed with the result. Some computations can be
 *                              skipped if this is false.
 * \param vp_scale              Viewport scale XY.
 *                              For MSAA, multiply them by the number of samples.
 * \param vp_translate          Viewport translation XY.
 *                              For MSAA, multiply them by the number of samples.
 * \param small_prim_precision  Precision of small primitive culling. This should
 *                              be the same as or greater than the precision of
 *                              the rasterizer. Set to num_samples / 2^subpixel_bits.
 *                              subpixel_bits are defined by the quantization mode.
 * \param clip_half_line_width  Per-axis (X, Y) values forwarded to cull_bbox,
 *                              where a non-NULL entry enlarges the bounding box
 *                              on that axis.
 * \param options               See ac_cull_options.
 * \param accept_func           Callback that receives the i1 result. When bounding
 *                              box culling is enabled it is invoked inside the
 *                              branch guarded by the preceding tests. May be NULL.
 * \param userdata              Opaque pointer passed through to accept_func.
 */
void ac_cull_primitive(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
                       LLVMValueRef initially_accepted, LLVMValueRef vp_scale[2],
                       LLVMValueRef vp_translate[2], LLVMValueRef small_prim_precision,
                       LLVMValueRef clip_half_line_width[2], struct ac_cull_options *options,
                       ac_cull_accept_func accept_func, void *userdata)
{
   LLVMBuilderRef b = ctx->builder;
   struct ac_position_w_info w_info;

   ac_analyze_position_w(ctx, pos, &w_info, options->num_vertices);

   /* W culling is optional; without it this stage accepts everything. */
   LLVMValueRef w_pass = ctx->i1true;
   if (options->cull_w) {
      w_pass = w_info.w_accepted;
   }
   LLVMValueRef accepted = LLVMBuildAnd(b, w_pass, initially_accepted, "");

   /* Face culling. */
   LLVMValueRef face_pass = ac_cull_face(ctx, pos, &w_info, options->cull_front,
                                         options->cull_back, options->cull_zero_area);
   accepted = LLVMBuildAnd(b, accepted, face_pass, "");

   /* View culling and small primitive elimination. */
   cull_bbox(ctx, pos, accepted, &w_info, vp_scale, vp_translate, small_prim_precision,
             clip_half_line_width, options, accept_func, userdata);
}