mesa/src/gallium/drivers/crocus/crocus_state.c

/*
* Copyright © 2017 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/**
* @file crocus_state.c
*
* ============================= GENXML CODE =============================
* [This file is compiled once per generation.]
* =======================================================================
*
* This is the main state upload code.
*
* Gallium uses Constant State Objects, or CSOs, for most state. Large,
* complex, or highly reusable state can be created once, and bound and
* rebound multiple times. This is modeled with the pipe->create_*_state()
* and pipe->bind_*_state() hooks. Highly dynamic or inexpensive state is
* streamed out on the fly, via pipe->set_*_state() hooks.
*
* OpenGL involves frequently mutating context state, which is mirrored in
* core Mesa by highly mutable data structures. However, most applications
* typically draw the same things over and over - from frame to frame, most
* of the same objects are still visible and need to be redrawn. So, rather
* than inventing new state all the time, applications usually mutate to swap
* between known states that we've seen before.
*
* Gallium isolates us from this mutation by tracking API state, and
* distilling it into a set of Constant State Objects, or CSOs. Large,
* complex, or typically reusable state can be created once, then reused
* multiple times. Drivers can create and store their own associated data.
* This create/bind model corresponds to the pipe->create_*_state() and
* pipe->bind_*_state() driver hooks.
*
* Some state is cheap to create, or expected to be highly dynamic. Rather
* than creating and caching piles of CSOs for these, Gallium simply streams
* them out, via the pipe->set_*_state() driver hooks.
*
* To reduce draw time overhead, we try to compute as much state at create
* time as possible. Wherever possible, we translate the Gallium pipe state
* to 3DSTATE commands, and store those commands in the CSO. At draw time,
* we can simply memcpy them into a batch buffer.
*
* No hardware matches the abstraction perfectly, so some commands require
* information from multiple CSOs. In this case, we can store two copies
* of the packet (one in each CSO), and simply | together their DWords at
* draw time. Sometimes the second set is trivial (one or two fields), so
* we simply pack it at draw time.
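* (A minimal illustrative sketch of this DWord merge appears just after the
* #includes below.)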
*
* There are two main components in the file below. First, the CSO hooks
* create/bind/track state. The second are the draw-time upload functions,
* crocus_upload_render_state() and crocus_upload_compute_state(), which read
* the context state and emit the commands into the actual batch.
*/
#include <errno.h>
#include <stdio.h>
#if HAVE_VALGRIND
#include <memcheck.h>
#include <valgrind.h>
#define VG(x) x
#ifdef DEBUG
#define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x))
#endif
#else
#define VG(x)
#endif
#include "drm-uapi/i915_drm.h"
#include "intel/common/intel_l3_config.h"
#include "intel/common/intel_sample_positions.h"
#include "intel/compiler/brw_compiler.h"
#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "util/format/u_format.h"
#include "util/half_float.h"
#include "util/u_dual_blend.h"
#include "util/u_framebuffer.h"
#include "util/u_helpers.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_prim.h"
#include "util/u_transfer.h"
#include "util/u_upload_mgr.h"
#include "util/u_viewport.h"
#include "crocus_batch.h"
#include "crocus_context.h"
#include "crocus_defines.h"
#include "crocus_pipe.h"
#include "crocus_resource.h"
#include "crocus_genx_macros.h"
#include "intel/common/intel_guardband.h"
/**
* Statically assert that PIPE_* enums match the hardware packets.
* (As long as they match, we don't need to translate them.)
*/
UNUSED static void pipe_asserts()
{
#define PIPE_ASSERT(x) STATIC_ASSERT((int)x)
/* pipe_logicop happens to match the hardware. */
PIPE_ASSERT(PIPE_LOGICOP_CLEAR == LOGICOP_CLEAR);
PIPE_ASSERT(PIPE_LOGICOP_NOR == LOGICOP_NOR);
PIPE_ASSERT(PIPE_LOGICOP_AND_INVERTED == LOGICOP_AND_INVERTED);
PIPE_ASSERT(PIPE_LOGICOP_COPY_INVERTED == LOGICOP_COPY_INVERTED);
PIPE_ASSERT(PIPE_LOGICOP_AND_REVERSE == LOGICOP_AND_REVERSE);
PIPE_ASSERT(PIPE_LOGICOP_INVERT == LOGICOP_INVERT);
PIPE_ASSERT(PIPE_LOGICOP_XOR == LOGICOP_XOR);
PIPE_ASSERT(PIPE_LOGICOP_NAND == LOGICOP_NAND);
PIPE_ASSERT(PIPE_LOGICOP_AND == LOGICOP_AND);
PIPE_ASSERT(PIPE_LOGICOP_EQUIV == LOGICOP_EQUIV);
PIPE_ASSERT(PIPE_LOGICOP_NOOP == LOGICOP_NOOP);
PIPE_ASSERT(PIPE_LOGICOP_OR_INVERTED == LOGICOP_OR_INVERTED);
PIPE_ASSERT(PIPE_LOGICOP_COPY == LOGICOP_COPY);
PIPE_ASSERT(PIPE_LOGICOP_OR_REVERSE == LOGICOP_OR_REVERSE);
PIPE_ASSERT(PIPE_LOGICOP_OR == LOGICOP_OR);
PIPE_ASSERT(PIPE_LOGICOP_SET == LOGICOP_SET);
/* pipe_blend_func happens to match the hardware. */
PIPE_ASSERT(PIPE_BLENDFACTOR_ONE == BLENDFACTOR_ONE);
PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_COLOR == BLENDFACTOR_SRC_COLOR);
PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA == BLENDFACTOR_SRC_ALPHA);
PIPE_ASSERT(PIPE_BLENDFACTOR_DST_ALPHA == BLENDFACTOR_DST_ALPHA);
PIPE_ASSERT(PIPE_BLENDFACTOR_DST_COLOR == BLENDFACTOR_DST_COLOR);
PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE == BLENDFACTOR_SRC_ALPHA_SATURATE);
PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_COLOR == BLENDFACTOR_CONST_COLOR);
PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_ALPHA == BLENDFACTOR_CONST_ALPHA);
PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_COLOR == BLENDFACTOR_SRC1_COLOR);
PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_ALPHA == BLENDFACTOR_SRC1_ALPHA);
PIPE_ASSERT(PIPE_BLENDFACTOR_ZERO == BLENDFACTOR_ZERO);
PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_COLOR == BLENDFACTOR_INV_SRC_COLOR);
PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_ALPHA == BLENDFACTOR_INV_SRC_ALPHA);
PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_ALPHA == BLENDFACTOR_INV_DST_ALPHA);
PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_COLOR == BLENDFACTOR_INV_DST_COLOR);
PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_COLOR == BLENDFACTOR_INV_CONST_COLOR);
PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_ALPHA == BLENDFACTOR_INV_CONST_ALPHA);
PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_COLOR == BLENDFACTOR_INV_SRC1_COLOR);
PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_ALPHA == BLENDFACTOR_INV_SRC1_ALPHA);
/* pipe_blend_func happens to match the hardware. */
PIPE_ASSERT(PIPE_BLEND_ADD == BLENDFUNCTION_ADD);
PIPE_ASSERT(PIPE_BLEND_SUBTRACT == BLENDFUNCTION_SUBTRACT);
PIPE_ASSERT(PIPE_BLEND_REVERSE_SUBTRACT == BLENDFUNCTION_REVERSE_SUBTRACT);
PIPE_ASSERT(PIPE_BLEND_MIN == BLENDFUNCTION_MIN);
PIPE_ASSERT(PIPE_BLEND_MAX == BLENDFUNCTION_MAX);
/* pipe_stencil_op happens to match the hardware. */
PIPE_ASSERT(PIPE_STENCIL_OP_KEEP == STENCILOP_KEEP);
PIPE_ASSERT(PIPE_STENCIL_OP_ZERO == STENCILOP_ZERO);
PIPE_ASSERT(PIPE_STENCIL_OP_REPLACE == STENCILOP_REPLACE);
PIPE_ASSERT(PIPE_STENCIL_OP_INCR == STENCILOP_INCRSAT);
PIPE_ASSERT(PIPE_STENCIL_OP_DECR == STENCILOP_DECRSAT);
PIPE_ASSERT(PIPE_STENCIL_OP_INCR_WRAP == STENCILOP_INCR);
PIPE_ASSERT(PIPE_STENCIL_OP_DECR_WRAP == STENCILOP_DECR);
PIPE_ASSERT(PIPE_STENCIL_OP_INVERT == STENCILOP_INVERT);
#if GFX_VER >= 6
/* pipe_sprite_coord_mode happens to match 3DSTATE_SBE */
PIPE_ASSERT(PIPE_SPRITE_COORD_UPPER_LEFT == UPPERLEFT);
PIPE_ASSERT(PIPE_SPRITE_COORD_LOWER_LEFT == LOWERLEFT);
#endif
#undef PIPE_ASSERT
}
static unsigned
translate_prim_type(enum pipe_prim_type prim, uint8_t verts_per_patch)
{
static const unsigned map[] = {
[PIPE_PRIM_POINTS] = _3DPRIM_POINTLIST,
[PIPE_PRIM_LINES] = _3DPRIM_LINELIST,
[PIPE_PRIM_LINE_LOOP] = _3DPRIM_LINELOOP,
[PIPE_PRIM_LINE_STRIP] = _3DPRIM_LINESTRIP,
[PIPE_PRIM_TRIANGLES] = _3DPRIM_TRILIST,
[PIPE_PRIM_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
[PIPE_PRIM_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
[PIPE_PRIM_QUADS] = _3DPRIM_QUADLIST,
[PIPE_PRIM_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
[PIPE_PRIM_POLYGON] = _3DPRIM_POLYGON,
#if GFX_VER >= 6
[PIPE_PRIM_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
[PIPE_PRIM_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
[PIPE_PRIM_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
#endif
#if GFX_VER >= 7
[PIPE_PRIM_PATCHES] = _3DPRIM_PATCHLIST_1 - 1,
#endif
};
return map[prim] + (prim == PIPE_PRIM_PATCHES ? verts_per_patch : 0);
}
static unsigned
translate_compare_func(enum pipe_compare_func pipe_func)
{
static const unsigned map[] = {
[PIPE_FUNC_NEVER] = COMPAREFUNCTION_NEVER,
[PIPE_FUNC_LESS] = COMPAREFUNCTION_LESS,
[PIPE_FUNC_EQUAL] = COMPAREFUNCTION_EQUAL,
[PIPE_FUNC_LEQUAL] = COMPAREFUNCTION_LEQUAL,
[PIPE_FUNC_GREATER] = COMPAREFUNCTION_GREATER,
[PIPE_FUNC_NOTEQUAL] = COMPAREFUNCTION_NOTEQUAL,
[PIPE_FUNC_GEQUAL] = COMPAREFUNCTION_GEQUAL,
[PIPE_FUNC_ALWAYS] = COMPAREFUNCTION_ALWAYS,
};
return map[pipe_func];
}
static unsigned
translate_shadow_func(enum pipe_compare_func pipe_func)
{
/* Gallium specifies the result of shadow comparisons as:
*
* 1 if ref <op> texel,
* 0 otherwise.
*
* The hardware does:
*
* 0 if texel <op> ref,
* 1 otherwise.
*
* So we need to flip the operator and also negate.
*/
static const unsigned map[] = {
[PIPE_FUNC_NEVER] = PREFILTEROP_ALWAYS,
[PIPE_FUNC_LESS] = PREFILTEROP_LEQUAL,
[PIPE_FUNC_EQUAL] = PREFILTEROP_NOTEQUAL,
[PIPE_FUNC_LEQUAL] = PREFILTEROP_LESS,
[PIPE_FUNC_GREATER] = PREFILTEROP_GEQUAL,
[PIPE_FUNC_NOTEQUAL] = PREFILTEROP_EQUAL,
[PIPE_FUNC_GEQUAL] = PREFILTEROP_GREATER,
[PIPE_FUNC_ALWAYS] = PREFILTEROP_NEVER,
};
return map[pipe_func];
}
static unsigned
translate_cull_mode(unsigned pipe_face)
{
static const unsigned map[4] = {
[PIPE_FACE_NONE] = CULLMODE_NONE,
[PIPE_FACE_FRONT] = CULLMODE_FRONT,
[PIPE_FACE_BACK] = CULLMODE_BACK,
[PIPE_FACE_FRONT_AND_BACK] = CULLMODE_BOTH,
};
return map[pipe_face];
}
#if GFX_VER >= 6
static unsigned
translate_fill_mode(unsigned pipe_polymode)
{
static const unsigned map[4] = {
[PIPE_POLYGON_MODE_FILL] = FILL_MODE_SOLID,
[PIPE_POLYGON_MODE_LINE] = FILL_MODE_WIREFRAME,
[PIPE_POLYGON_MODE_POINT] = FILL_MODE_POINT,
[PIPE_POLYGON_MODE_FILL_RECTANGLE] = FILL_MODE_SOLID,
};
return map[pipe_polymode];
}
#endif
static unsigned
translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)
{
static const unsigned map[] = {
[PIPE_TEX_MIPFILTER_NEAREST] = MIPFILTER_NEAREST,
[PIPE_TEX_MIPFILTER_LINEAR] = MIPFILTER_LINEAR,
[PIPE_TEX_MIPFILTER_NONE] = MIPFILTER_NONE,
};
return map[pipe_mip];
}
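/**
 * Translate pipe_tex_wrap to hardware texture coordinate modes.
 *
 * Note the GL_CLAMP handling: Gfx8 has a native TCM_HALF_BORDER mode,
 * while older parts use TCM_CLAMP_BORDER, except that with nearest
 * min/mag filtering (where CLAMP and CLAMP_TO_EDGE sample identically)
 * plain TCM_CLAMP suffices.
 */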
static uint32_t
translate_wrap(unsigned pipe_wrap, bool either_nearest)
{
static const unsigned map[] = {
[PIPE_TEX_WRAP_REPEAT] = TCM_WRAP,
#if GFX_VER == 8
[PIPE_TEX_WRAP_CLAMP] = TCM_HALF_BORDER,
#else
[PIPE_TEX_WRAP_CLAMP] = TCM_CLAMP_BORDER,
#endif
[PIPE_TEX_WRAP_CLAMP_TO_EDGE] = TCM_CLAMP,
[PIPE_TEX_WRAP_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,
[PIPE_TEX_WRAP_MIRROR_REPEAT] = TCM_MIRROR,
[PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,
/* These are unsupported. */
[PIPE_TEX_WRAP_MIRROR_CLAMP] = -1,
[PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1,
};
#if GFX_VER < 8
if (pipe_wrap == PIPE_TEX_WRAP_CLAMP && either_nearest)
return TCM_CLAMP;
#endif
return map[pipe_wrap];
}
/**
* Equivalent of brw_state_batch() from the i965 driver.
*/
static uint32_t *
stream_state(struct crocus_batch *batch,
unsigned size,
unsigned alignment,
uint32_t *out_offset)
{
uint32_t offset = ALIGN(batch->state.used, alignment);
if (offset + size >= STATE_SZ && !batch->no_wrap) {
crocus_batch_flush(batch);
offset = ALIGN(batch->state.used, alignment);
} else if (offset + size >= batch->state.bo->size) {
const unsigned new_size =
MIN2(batch->state.bo->size + batch->state.bo->size / 2,
MAX_STATE_SIZE);
crocus_grow_buffer(batch, true, batch->state.used, new_size);
assert(offset + size < batch->state.bo->size);
}
crocus_record_state_size(batch->state_sizes, offset, size);
batch->state.used = offset + size;
*out_offset = offset;
return (uint32_t *)batch->state.map + (offset >> 2);
}
/**
* stream_state() + memcpy.
*/
static uint32_t
emit_state(struct crocus_batch *batch, const void *data, unsigned size,
unsigned alignment)
{
unsigned offset = 0;
uint32_t *map = stream_state(batch, size, alignment, &offset);
if (map)
memcpy(map, data, size);
return offset;
}
#if GFX_VER <= 5
static void
upload_pipelined_state_pointers(struct crocus_batch *batch,
bool gs_active, uint32_t gs_offset,
uint32_t vs_offset, uint32_t sf_offset,
uint32_t clip_offset, uint32_t wm_offset, uint32_t cc_offset)
{
#if GFX_VER == 5
/* Need to flush before changing clip max threads for errata. */
crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
#endif
crocus_emit_cmd(batch, GENX(3DSTATE_PIPELINED_POINTERS), pp) {
pp.PointertoVSState = ro_bo(batch->state.bo, vs_offset);
pp.GSEnable = gs_active;
if (gs_active)
pp.PointertoGSState = ro_bo(batch->state.bo, gs_offset);
pp.ClipEnable = true;
pp.PointertoCLIPState = ro_bo(batch->state.bo, clip_offset);
pp.PointertoSFState = ro_bo(batch->state.bo, sf_offset);
pp.PointertoWMState = ro_bo(batch->state.bo, wm_offset);
pp.PointertoColorCalcState = ro_bo(batch->state.bo, cc_offset);
}
}
#endif
/**
* Did field 'x' change between 'old_cso' and 'new_cso'?
*
* (If so, we may want to set some dirty flags.)
*/
#define cso_changed(x) (!old_cso || (old_cso->x != new_cso->x))
#define cso_changed_memcmp(x) \
(!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0)
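/* Typical usage in a bind hook, mirroring crocus_bind_zsa_state() below:
 *
 *    if (cso_changed(cso.alpha_func))
 *       ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
 */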
static void
flush_before_state_base_change(struct crocus_batch *batch)
{
#if GFX_VER >= 6
/* Flush before emitting STATE_BASE_ADDRESS.
*
* This isn't documented anywhere in the PRM. However, it seems to be
* necessary prior to changing the surface state base address. We've
* seen issues in Vulkan where we get GPU hangs when using multi-level
* command buffers which clear depth, reset state base address, and then
* go render stuff.
*
* Normally, in GL, we would trust the kernel to do sufficient stalls
* and flushes prior to executing our batch. However, it doesn't seem
* as if the kernel's flushing is always sufficient and we don't want to
* rely on it.
*
* We make this an end-of-pipe sync instead of a normal flush because we
* do not know the current status of the GPU. On Haswell at least,
* having a fast-clear operation in flight at the same time as a normal
* rendering operation can cause hangs. Since the kernel's flushing is
* insufficient, we need to ensure that any rendering operations from
* other processes are definitely complete before we try to do our own
* rendering. It's a bit of a big hammer but it appears to work.
*/
const unsigned dc_flush =
batch->screen->devinfo.ver >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
crocus_emit_end_of_pipe_sync(batch,
"change STATE_BASE_ADDRESS (flushes)",
PIPE_CONTROL_RENDER_TARGET_FLUSH |
dc_flush |
PIPE_CONTROL_DEPTH_CACHE_FLUSH);
#endif
}
static void
flush_after_state_base_change(struct crocus_batch *batch)
{
/* After re-setting the surface state base address, we have to do some
* cache flushing so that the sampler engine will pick up the new
* SURFACE_STATE objects and binding tables. From the Broadwell PRM,
* Shared Function > 3D Sampler > State > State Caching (page 96):
*
* Coherency with system memory in the state cache, like the texture
* cache is handled partially by software. It is expected that the
* command stream or shader will issue Cache Flush operation or
* Cache_Flush sampler message to ensure that the L1 cache remains
* coherent with system memory.
*
* [...]
*
* Whenever the value of the Dynamic_State_Base_Addr,
* Surface_State_Base_Addr are altered, the L1 state cache must be
* invalidated to ensure the new surface or sampler state is fetched
* from system memory.
*
* The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
* which, according the PIPE_CONTROL instruction documentation in the
* Broadwell PRM:
*
* Setting this bit is independent of any other bit in this packet.
* This bit controls the invalidation of the L1 and L2 state caches
* at the top of the pipe i.e. at the parsing time.
*
* Unfortunately, experimentation seems to indicate that state cache
* invalidation through a PIPE_CONTROL does nothing whatsoever with
* regard to surface state and binding tables. Instead, it seems that
* invalidating the texture cache is what is actually needed.
*
* XXX: As far as we have been able to determine through
* experimentation, flushing the texture cache appears to be
* sufficient. The theory here is that all of the sampling/rendering
* units cache the binding table in the texture cache. However, we have
* yet to be able to actually confirm this.
*/
#if GFX_VER >= 6
crocus_emit_end_of_pipe_sync(batch,
"change STATE_BASE_ADDRESS (invalidates)",
PIPE_CONTROL_INSTRUCTION_INVALIDATE |
PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
PIPE_CONTROL_CONST_CACHE_INVALIDATE |
PIPE_CONTROL_STATE_CACHE_INVALIDATE);
#endif
}
#if GFX_VER >= 6
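/**
 * Emit MI_STORE_REGISTER_MEM to snapshot a 32-bit MMIO register's current
 * value into a buffer. Predication is only supported on Haswell and later.
 */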
static void
crocus_store_register_mem32(struct crocus_batch *batch, uint32_t reg,
struct crocus_bo *bo, uint32_t offset,
bool predicated)
{
crocus_emit_cmd(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
srm.RegisterAddress = reg;
srm.MemoryAddress = ggtt_bo(bo, offset);
#if GFX_VERx10 >= 75
srm.PredicateEnable = predicated;
#else
if (predicated)
unreachable("unsupported predication");
#endif
}
}
static void
crocus_store_register_mem64(struct crocus_batch *batch, uint32_t reg,
struct crocus_bo *bo, uint32_t offset,
bool predicated)
{
crocus_store_register_mem32(batch, reg + 0, bo, offset + 0, predicated);
crocus_store_register_mem32(batch, reg + 4, bo, offset + 4, predicated);
}
#endif
#if GFX_VER >= 7
static void
_crocus_emit_lri(struct crocus_batch *batch, uint32_t reg, uint32_t val)
{
crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
lri.RegisterOffset = reg;
lri.DataDWord = val;
}
}
#define crocus_emit_lri(b, r, v) _crocus_emit_lri(b, GENX(r##_num), v)
#if GFX_VERx10 >= 75
static void
_crocus_emit_lrr(struct crocus_batch *batch, uint32_t dst, uint32_t src)
{
crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
lrr.SourceRegisterAddress = src;
lrr.DestinationRegisterAddress = dst;
}
}
static void
crocus_load_register_reg32(struct crocus_batch *batch, uint32_t dst,
uint32_t src)
{
_crocus_emit_lrr(batch, dst, src);
}
static void
crocus_load_register_reg64(struct crocus_batch *batch, uint32_t dst,
uint32_t src)
{
_crocus_emit_lrr(batch, dst, src);
_crocus_emit_lrr(batch, dst + 4, src + 4);
}
#endif
static void
crocus_load_register_imm32(struct crocus_batch *batch, uint32_t reg,
uint32_t val)
{
_crocus_emit_lri(batch, reg, val);
}
static void
crocus_load_register_imm64(struct crocus_batch *batch, uint32_t reg,
uint64_t val)
{
_crocus_emit_lri(batch, reg + 0, val & 0xffffffff);
_crocus_emit_lri(batch, reg + 4, val >> 32);
}
/**
* Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer.
*/
static void
crocus_load_register_mem32(struct crocus_batch *batch, uint32_t reg,
struct crocus_bo *bo, uint32_t offset)
{
crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
lrm.RegisterAddress = reg;
lrm.MemoryAddress = ro_bo(bo, offset);
}
}
/**
* Load a 64-bit value from a buffer into a MMIO register via
* two MI_LOAD_REGISTER_MEM commands.
*/
static void
crocus_load_register_mem64(struct crocus_batch *batch, uint32_t reg,
struct crocus_bo *bo, uint32_t offset)
{
crocus_load_register_mem32(batch, reg + 0, bo, offset + 0);
crocus_load_register_mem32(batch, reg + 4, bo, offset + 4);
}
#if GFX_VERx10 >= 75
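/**
 * Write an immediate 32-bit value into a buffer via MI_STORE_DATA_IMM.
 */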
static void
crocus_store_data_imm32(struct crocus_batch *batch,
struct crocus_bo *bo, uint32_t offset,
uint32_t imm)
{
crocus_emit_cmd(batch, GENX(MI_STORE_DATA_IMM), sdi) {
sdi.Address = rw_bo(bo, offset);
#if GFX_VER >= 6
sdi.ImmediateData = imm;
#endif
}
}
static void
crocus_store_data_imm64(struct crocus_batch *batch,
struct crocus_bo *bo, uint32_t offset,
uint64_t imm)
{
/* Can't use crocus_emit_cmd because MI_STORE_DATA_IMM has a length of
* 2 in genxml but it's actually variable length and we need 5 DWords.
*/
void *map = crocus_get_command_space(batch, 4 * 5);
_crocus_pack_command(batch, GENX(MI_STORE_DATA_IMM), map, sdi) {
sdi.DWordLength = 5 - 2;
sdi.Address = rw_bo(bo, offset);
#if GFX_VER >= 6
sdi.ImmediateData = imm;
#endif
}
}
#endif
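/**
 * Copy a DWord-aligned range between buffers by staging each DWord
 * through a scratch MMIO register, using MI_LOAD_REGISTER_MEM /
 * MI_STORE_REGISTER_MEM pairs.
 */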
static void
crocus_copy_mem_mem(struct crocus_batch *batch,
struct crocus_bo *dst_bo, uint32_t dst_offset,
struct crocus_bo *src_bo, uint32_t src_offset,
unsigned bytes)
{
assert(bytes % 4 == 0);
assert(dst_offset % 4 == 0);
assert(src_offset % 4 == 0);
#define CROCUS_TEMP_REG 0x2440 /* GEN7_3DPRIM_BASE_VERTEX */
for (unsigned i = 0; i < bytes; i += 4) {
crocus_load_register_mem32(batch, CROCUS_TEMP_REG,
src_bo, src_offset + i);
crocus_store_register_mem32(batch, CROCUS_TEMP_REG,
dst_bo, dst_offset + i, false);
}
}
#endif
/**
* Gallium CSO for rasterizer state.
*/
struct crocus_rasterizer_state {
struct pipe_rasterizer_state cso;
#if GFX_VER >= 6
uint32_t sf[GENX(3DSTATE_SF_length)];
uint32_t clip[GENX(3DSTATE_CLIP_length)];
#endif
#if GFX_VER >= 8
uint32_t raster[GENX(3DSTATE_RASTER_length)];
#endif
uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)];
uint8_t num_clip_plane_consts;
bool fill_mode_point_or_line;
};
#if GFX_VER <= 5
#define URB_VS 0
#define URB_GS 1
#define URB_CLP 2
#define URB_SF 3
#define URB_CS 4
static const struct {
uint32_t min_nr_entries;
uint32_t preferred_nr_entries;
uint32_t min_entry_size;
uint32_t max_entry_size;
} limits[URB_CS+1] = {
{ 16, 32, 1, 5 }, /* vs */
{ 4, 8, 1, 5 }, /* gs */
{ 5, 10, 1, 5 }, /* clp */
{ 1, 8, 1, 12 }, /* sf */
{ 1, 4, 1, 32 } /* cs */
};
static bool check_urb_layout(struct crocus_context *ice)
{
ice->urb.vs_start = 0;
ice->urb.gs_start = ice->urb.nr_vs_entries * ice->urb.vsize;
ice->urb.clip_start = ice->urb.gs_start + ice->urb.nr_gs_entries * ice->urb.vsize;
ice->urb.sf_start = ice->urb.clip_start + ice->urb.nr_clip_entries * ice->urb.vsize;
ice->urb.cs_start = ice->urb.sf_start + ice->urb.nr_sf_entries * ice->urb.sfsize;
return ice->urb.cs_start + ice->urb.nr_cs_entries *
ice->urb.csize <= ice->urb.size;
}
static bool
crocus_calculate_urb_fence(struct crocus_batch *batch, unsigned csize,
unsigned vsize, unsigned sfsize)
{
const struct intel_device_info *devinfo = &batch->screen->devinfo;
struct crocus_context *ice = batch->ice;
if (csize < limits[URB_CS].min_entry_size)
csize = limits[URB_CS].min_entry_size;
if (vsize < limits[URB_VS].min_entry_size)
vsize = limits[URB_VS].min_entry_size;
if (sfsize < limits[URB_SF].min_entry_size)
sfsize = limits[URB_SF].min_entry_size;
if (ice->urb.vsize < vsize ||
ice->urb.sfsize < sfsize ||
ice->urb.csize < csize ||
(ice->urb.constrained && (ice->urb.vsize > vsize ||
ice->urb.sfsize > sfsize ||
ice->urb.csize > csize))) {
ice->urb.csize = csize;
ice->urb.sfsize = sfsize;
ice->urb.vsize = vsize;
ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
ice->urb.nr_gs_entries = limits[URB_GS].preferred_nr_entries;
ice->urb.nr_clip_entries = limits[URB_CLP].preferred_nr_entries;
ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
ice->urb.nr_cs_entries = limits[URB_CS].preferred_nr_entries;
ice->urb.constrained = 0;
if (devinfo->ver == 5) {
ice->urb.nr_vs_entries = 128;
ice->urb.nr_sf_entries = 48;
if (check_urb_layout(ice)) {
goto done;
} else {
ice->urb.constrained = 1;
ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
}
} else if (devinfo->is_g4x) {
ice->urb.nr_vs_entries = 64;
if (check_urb_layout(ice)) {
goto done;
} else {
ice->urb.constrained = 1;
ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
}
}
if (!check_urb_layout(ice)) {
ice->urb.nr_vs_entries = limits[URB_VS].min_nr_entries;
ice->urb.nr_gs_entries = limits[URB_GS].min_nr_entries;
ice->urb.nr_clip_entries = limits[URB_CLP].min_nr_entries;
ice->urb.nr_sf_entries = limits[URB_SF].min_nr_entries;
ice->urb.nr_cs_entries = limits[URB_CS].min_nr_entries;
/* Mark us as operating with constrained nr_entries, so that next
* time we recalculate we'll resize the fences in the hope of
* escaping constrained mode and getting back to normal performance.
*/
ice->urb.constrained = 1;
if (!check_urb_layout(ice)) {
/* This is impossible, given the maximal sizes of urb
* entries and the values for minimum nr of entries
* provided above.
*/
fprintf(stderr, "couldn't calculate URB layout!\n");
exit(1);
}
if (unlikely(INTEL_DEBUG & (DEBUG_URB|DEBUG_PERF)))
fprintf(stderr, "URB CONSTRAINED\n");
}
done:
if (unlikely(INTEL_DEBUG & DEBUG_URB))
fprintf(stderr,
"URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
ice->urb.vs_start,
ice->urb.gs_start,
ice->urb.clip_start,
ice->urb.sf_start,
ice->urb.cs_start,
ice->urb.size);
return true;
}
return false;
}
static void
crocus_upload_urb_fence(struct crocus_batch *batch)
{
uint32_t urb_fence[3];
_crocus_pack_command(batch, GENX(URB_FENCE), urb_fence, urb) {
urb.VSUnitURBReallocationRequest = 1;
urb.GSUnitURBReallocationRequest = 1;
urb.CLIPUnitURBReallocationRequest = 1;
urb.SFUnitURBReallocationRequest = 1;
urb.VFEUnitURBReallocationRequest = 1;
urb.CSUnitURBReallocationRequest = 1;
urb.VSFence = batch->ice->urb.gs_start;
urb.GSFence = batch->ice->urb.clip_start;
urb.CLIPFence = batch->ice->urb.sf_start;
urb.SFFence = batch->ice->urb.cs_start;
urb.CSFence = batch->ice->urb.size;
}
/* erratum: URB_FENCE must not cross a 64byte cacheline */
if ((crocus_batch_bytes_used(batch) & 15) > 12) {
int pad = 16 - (crocus_batch_bytes_used(batch) & 15);
do {
*(uint32_t *)batch->command.map_next = 0;
batch->command.map_next += sizeof(uint32_t);
} while (--pad);
}
crocus_batch_emit(batch, urb_fence, sizeof(uint32_t) * 3);
}
static bool
calculate_curbe_offsets(struct crocus_batch *batch)
{
struct crocus_context *ice = batch->ice;
unsigned nr_fp_regs, nr_vp_regs, nr_clip_regs = 0;
unsigned total_regs;
nr_fp_regs = 0;
for (int i = 0; i < 4; i++) {
const struct brw_ubo_range *range = &ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data->ubo_ranges[i];
if (range->length == 0)
continue;
/* ubo range tracks at 256-bit, we need 512-bit */
nr_fp_regs += (range->length + 1) / 2;
}
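/* e.g. a range->length of 3 (three 256-bit units) rounds up to
 * (3 + 1) / 2 = 2 of the 512-bit CURBE registers counted here.
 */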
if (ice->state.cso_rast->cso.clip_plane_enable) {
unsigned nr_planes = 6 + util_bitcount(ice->state.cso_rast->cso.clip_plane_enable);
nr_clip_regs = (nr_planes * 4 + 15) / 16;
}
nr_vp_regs = 0;
for (int i = 0; i < 4; i++) {
const struct brw_ubo_range *range = &ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data->ubo_ranges[i];
if (range->length == 0)
continue;
/* ubo range tracks at 256-bit, we need 512-bit */
nr_vp_regs += (range->length + 1) / 2;
}
if (nr_vp_regs == 0) {
/* The pre-gen6 VS requires that some push constants get loaded no
* matter what, or the GPU would hang.
*/
nr_vp_regs = 1;
}
total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs;
/* The CURBE allocation size is limited to 32 512-bit units (128 EU
* registers, or 1024 floats). See CS_URB_STATE in the gen4 or gen5
* (volume 1, part 1) PRMs.
*
* Note that in brw_fs.cpp we're only loading up to 16 EU registers of
* values as push constants before spilling to pull constants, and in
* brw_vec4.cpp we're loading up to 32 registers of push constants. An EU
* register is 1/2 of one of these URB entry units, so that leaves us 16 EU
* regs for clip.
*/
assert(total_regs <= 32);
/* Lazy resize:
*/
if (nr_fp_regs > ice->curbe.wm_size ||
nr_vp_regs > ice->curbe.vs_size ||
nr_clip_regs != ice->curbe.clip_size ||
(total_regs < ice->curbe.total_size / 4 &&
ice->curbe.total_size > 16)) {
GLuint reg = 0;
/* Calculate a new layout:
*/
reg = 0;
ice->curbe.wm_start = reg;
ice->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs;
ice->curbe.clip_start = reg;
ice->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs;
ice->curbe.vs_start = reg;
ice->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs;
ice->curbe.total_size = reg;
if (0)
fprintf(stderr, "curbe wm %d+%d clip %d+%d vs %d+%d\n",
ice->curbe.wm_start,
ice->curbe.wm_size,
ice->curbe.clip_start,
ice->curbe.clip_size,
ice->curbe.vs_start,
ice->curbe.vs_size );
return true;
}
return false;
}
static void
upload_shader_consts(struct crocus_context *ice,
gl_shader_stage stage,
uint32_t *map,
unsigned start)
{
struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;
uint32_t *cmap;
bool found = false;
unsigned offset = start * 16;
int total = 0;
for (int i = 0; i < 4; i++) {
const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
if (range->length == 0)
continue;
unsigned block_index = crocus_bti_to_group_index(
&shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
unsigned len = range->length * 8 * sizeof(float);
unsigned start = range->start * 8 * sizeof(float);
struct pipe_transfer *transfer;
cmap = pipe_buffer_map_range(&ice->ctx, ice->state.shaders[stage].constbufs[block_index].buffer,
ice->state.shaders[stage].constbufs[block_index].buffer_offset + start, len,
PIPE_MAP_READ | PIPE_MAP_UNSYNCHRONIZED, &transfer);
if (cmap)
memcpy(&map[offset + (total * 8)], cmap, len);
pipe_buffer_unmap(&ice->ctx, transfer);
total += range->length;
found = true;
}
if (stage == MESA_SHADER_VERTEX && !found) {
/* The pre-gen6 VS requires that some push constants get loaded no
* matter what, or the GPU would hang.
*/
unsigned len = 16;
memset(&map[offset], 0, len);
}
}
static const float fixed_plane[6][4] = {
{ 0, 0, -1, 1 },
{ 0, 0, 1, 1 },
{ 0, -1, 0, 1 },
{ 0, 1, 0, 1 },
{-1, 0, 0, 1 },
{ 1, 0, 0, 1 }
};
static void
gen4_upload_curbe(struct crocus_batch *batch)
{
struct crocus_context *ice = batch->ice;
const unsigned sz = ice->curbe.total_size;
const unsigned buf_sz = sz * 16 * sizeof(float);
if (sz == 0)
goto emit;
uint32_t *map;
u_upload_alloc(ice->ctx.const_uploader, 0, buf_sz, 64,
&ice->curbe.curbe_offset, (struct pipe_resource **)&ice->curbe.curbe_res, (void **) &map);
/* fragment shader constants */
if (ice->curbe.wm_size) {
upload_shader_consts(ice, MESA_SHADER_FRAGMENT, map, ice->curbe.wm_start);
}
/* clipper constants */
if (ice->curbe.clip_size) {
unsigned offset = ice->curbe.clip_start * 16;
float *fmap = (float *)map;
unsigned i;
/* If any planes are going this way, send them all this way:
*/
for (i = 0; i < 6; i++) {
fmap[offset + i * 4 + 0] = fixed_plane[i][0];
fmap[offset + i * 4 + 1] = fixed_plane[i][1];
fmap[offset + i * 4 + 2] = fixed_plane[i][2];
fmap[offset + i * 4 + 3] = fixed_plane[i][3];
}
unsigned mask = ice->state.cso_rast->cso.clip_plane_enable;
struct pipe_clip_state *cp = &ice->state.clip_planes;
while (mask) {
const int j = u_bit_scan(&mask);
fmap[offset + i * 4 + 0] = cp->ucp[j][0];
fmap[offset + i * 4 + 1] = cp->ucp[j][1];
fmap[offset + i * 4 + 2] = cp->ucp[j][2];
fmap[offset + i * 4 + 3] = cp->ucp[j][3];
i++;
}
}
/* vertex shader constants */
if (ice->curbe.vs_size) {
upload_shader_consts(ice, MESA_SHADER_VERTEX, map, ice->curbe.vs_start);
}
if (0) {
for (int i = 0; i < sz*16; i+=4) {
float *f = (float *)map;
fprintf(stderr, "curbe %d.%d: %f %f %f %f\n", i/8, i&4,
f[i+0], f[i+1], f[i+2], f[i+3]);
}
}
emit:
crocus_emit_cmd(batch, GENX(CONSTANT_BUFFER), cb) {
if (ice->curbe.curbe_res) {
cb.BufferLength = ice->curbe.total_size - 1;
cb.Valid = 1;
cb.BufferStartingAddress = ro_bo(ice->curbe.curbe_res->bo, ice->curbe.curbe_offset);
}
}
#if GFX_VER == 4 && GFX_VERx10 != 45
/* Work around a Broadwater/Crestline depth interpolator bug. The
* following sequence will cause GPU hangs:
*
* 1. Change state so that all depth related fields in CC_STATE are
* disabled, and in WM_STATE, only "PS Use Source Depth" is enabled.
* 2. Emit a CONSTANT_BUFFER packet.
* 3. Draw via 3DPRIMITIVE.
*
* The recommended workaround is to emit a non-pipelined state change after
* emitting CONSTANT_BUFFER, in order to drain the windowizer pipeline.
*
* We arbitrarily choose 3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP (as it's small),
* and always emit it when "PS Use Source Depth" is set. We could be more
* precise, but the additional complexity is probably not worth it.
*/
const struct shader_info *fs_info =
crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
if (BITSET_TEST(fs_info->system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
ice->state.global_depth_offset_clamp = 0;
crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp);
}
#endif
}
#endif
#if GFX_VER >= 7
#define IVB_L3SQCREG1_SQGHPCI_DEFAULT 0x00730000
#define VLV_L3SQCREG1_SQGHPCI_DEFAULT 0x00d30000
#define HSW_L3SQCREG1_SQGHPCI_DEFAULT 0x00610000
static void
setup_l3_config(struct crocus_batch *batch, const struct intel_l3_config *cfg)
{
#if GFX_VER == 7
const struct intel_device_info *devinfo = &batch->screen->devinfo;
const bool has_dc = cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_ALL];
const bool has_is = cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_RO] ||
cfg->n[INTEL_L3P_ALL];
const bool has_c = cfg->n[INTEL_L3P_C] || cfg->n[INTEL_L3P_RO] ||
cfg->n[INTEL_L3P_ALL];
const bool has_t = cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_RO] ||
cfg->n[INTEL_L3P_ALL];
const bool has_slm = cfg->n[INTEL_L3P_SLM];
#endif
/* According to the hardware docs, the L3 partitioning can only be changed
* while the pipeline is completely drained and the caches are flushed,
* which involves a first PIPE_CONTROL flush which stalls the pipeline...
*/
crocus_emit_pipe_control_flush(batch, "l3_config",
PIPE_CONTROL_DATA_CACHE_FLUSH |
PIPE_CONTROL_CS_STALL);
/* ...followed by a second pipelined PIPE_CONTROL that initiates
* invalidation of the relevant caches. Note that because RO invalidation
* happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
* command is processed by the CS) we cannot combine it with the previous
* stalling flush as the hardware documentation suggests, because that
* would cause the CS to stall on previous rendering *after* RO
* invalidation and wouldn't prevent the RO caches from being polluted by
* concurrent rendering before the stall completes. This intentionally
* doesn't implement the SKL+ hardware workaround suggesting to enable CS
* stall on PIPE_CONTROLs with the texture cache invalidation bit set for
* GPGPU workloads because the previous and subsequent PIPE_CONTROLs
* already guarantee that there is no concurrent GPGPU kernel execution
* (see SKL HSD 2132585).
*/
crocus_emit_pipe_control_flush(batch, "l3 config",
PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
PIPE_CONTROL_CONST_CACHE_INVALIDATE |
PIPE_CONTROL_INSTRUCTION_INVALIDATE |
PIPE_CONTROL_STATE_CACHE_INVALIDATE);
/* Now send a third stalling flush to make sure that invalidation is
* complete when the L3 configuration registers are modified.
*/
crocus_emit_pipe_control_flush(batch, "l3 config",
PIPE_CONTROL_DATA_CACHE_FLUSH |
PIPE_CONTROL_CS_STALL);
#if GFX_VER == 8
assert(!cfg->n[INTEL_L3P_IS] && !cfg->n[INTEL_L3P_C] && !cfg->n[INTEL_L3P_T]);
crocus_emit_reg(batch, GENX(L3CNTLREG), reg) {
reg.SLMEnable = cfg->n[INTEL_L3P_SLM] > 0;
reg.URBAllocation = cfg->n[INTEL_L3P_URB];
reg.ROAllocation = cfg->n[INTEL_L3P_RO];
reg.DCAllocation = cfg->n[INTEL_L3P_DC];
reg.AllAllocation = cfg->n[INTEL_L3P_ALL];
}
#else
assert(!cfg->n[INTEL_L3P_ALL]);
/* When enabled, SLM uses only a portion of the L3 on half of the banks;
* the matching space on the remaining banks has to be allocated to a
* client (URB for all validated configurations) set to the
* lower-bandwidth 2-bank address hashing mode.
*/
const bool urb_low_bw = has_slm && !devinfo->is_baytrail;
assert(!urb_low_bw || cfg->n[INTEL_L3P_URB] == cfg->n[INTEL_L3P_SLM]);
/* Minimum number of ways that can be allocated to the URB. */
const unsigned n0_urb = (devinfo->is_baytrail ? 32 : 0);
assert(cfg->n[INTEL_L3P_URB] >= n0_urb);
uint32_t l3sqcr1, l3cr2, l3cr3;
crocus_pack_state(GENX(L3SQCREG1), &l3sqcr1, reg) {
reg.ConvertDC_UC = !has_dc;
reg.ConvertIS_UC = !has_is;
reg.ConvertC_UC = !has_c;
reg.ConvertT_UC = !has_t;
#if GFX_VERx10 == 75
reg.L3SQGeneralPriorityCreditInitialization = SQGPCI_DEFAULT;
#else
reg.L3SQGeneralPriorityCreditInitialization =
devinfo->is_baytrail ? BYT_SQGPCI_DEFAULT : SQGPCI_DEFAULT;
#endif
reg.L3SQHighPriorityCreditInitialization = SQHPCI_DEFAULT;
};
crocus_pack_state(GENX(L3CNTLREG2), &l3cr2, reg) {
reg.SLMEnable = has_slm;
reg.URBLowBandwidth = urb_low_bw;
reg.URBAllocation = cfg->n[INTEL_L3P_URB] - n0_urb;
#if !(GFX_VERx10 == 75)
reg.ALLAllocation = cfg->n[INTEL_L3P_ALL];
#endif
reg.ROAllocation = cfg->n[INTEL_L3P_RO];
reg.DCAllocation = cfg->n[INTEL_L3P_DC];
};
crocus_pack_state(GENX(L3CNTLREG3), &l3cr3, reg) {
reg.ISAllocation = cfg->n[INTEL_L3P_IS];
reg.ISLowBandwidth = 0;
reg.CAllocation = cfg->n[INTEL_L3P_C];
reg.CLowBandwidth = 0;
reg.TAllocation = cfg->n[INTEL_L3P_T];
reg.TLowBandwidth = 0;
};
/* Set up the L3 partitioning. */
crocus_emit_lri(batch, L3SQCREG1, l3sqcr1);
crocus_emit_lri(batch, L3CNTLREG2, l3cr2);
crocus_emit_lri(batch, L3CNTLREG3, l3cr3);
#if GFX_VERx10 == 75
/* TODO: Fail screen creation if command parser version < 4 */
uint32_t scratch1, chicken3;
crocus_pack_state(GENX(SCRATCH1), &scratch1, reg) {
reg.L3AtomicDisable = !has_dc;
}
crocus_pack_state(GENX(CHICKEN3), &chicken3, reg) {
reg.L3AtomicDisableMask = true;
reg.L3AtomicDisable = !has_dc;
}
crocus_emit_lri(batch, SCRATCH1, scratch1);
crocus_emit_lri(batch, CHICKEN3, chicken3);
#endif
#endif
}
static void
emit_l3_state(struct crocus_batch *batch, bool compute)
{
const struct intel_l3_config *const cfg =
compute ? batch->screen->l3_config_cs : batch->screen->l3_config_3d;
setup_l3_config(batch, cfg);
if (unlikely(INTEL_DEBUG & DEBUG_L3)) {
intel_dump_l3_config(cfg, stderr);
}
}
/**
* Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set.
*/
static void
gen7_emit_cs_stall_flush(struct crocus_batch *batch)
{
crocus_emit_pipe_control_write(batch,
"workaround",
PIPE_CONTROL_CS_STALL
| PIPE_CONTROL_WRITE_IMMEDIATE,
batch->ice->workaround_bo,
batch->ice->workaround_offset, 0);
}
#endif
static void
emit_pipeline_select(struct crocus_batch *batch, uint32_t pipeline)
{
#if GFX_VER == 8
/* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
*
* Software must clear the COLOR_CALC_STATE Valid field in
* 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
* with Pipeline Select set to GPGPU.
*
* The internal hardware docs recommend the same workaround for Gfx9
* hardware too.
*/
if (pipeline == GPGPU)
crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
#endif
#if GFX_VER >= 6
/* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
* PIPELINE_SELECT [DevBWR+]":
*
* "Project: DEVSNB+
*
* Software must ensure all the write caches are flushed through a
* stalling PIPE_CONTROL command followed by another PIPE_CONTROL
* command to invalidate read only caches prior to programming
* MI_PIPELINE_SELECT command to change the Pipeline Select Mode."
*/
const unsigned dc_flush =
batch->screen->devinfo.ver >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
crocus_emit_pipe_control_flush(batch,
"workaround: PIPELINE_SELECT flushes (1/2)",
PIPE_CONTROL_RENDER_TARGET_FLUSH |
PIPE_CONTROL_DEPTH_CACHE_FLUSH |
dc_flush |
PIPE_CONTROL_CS_STALL);
crocus_emit_pipe_control_flush(batch,
"workaround: PIPELINE_SELECT flushes (2/2)",
PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
PIPE_CONTROL_CONST_CACHE_INVALIDATE |
PIPE_CONTROL_STATE_CACHE_INVALIDATE |
PIPE_CONTROL_INSTRUCTION_INVALIDATE);
#else
/* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
* PIPELINE_SELECT [DevBWR+]":
*
* Project: PRE-DEVSNB
*
* Software must ensure the current pipeline is flushed via an
* MI_FLUSH or PIPE_CONTROL prior to the execution of PIPELINE_SELECT.
*/
crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
#endif
crocus_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) {
sel.PipelineSelection = pipeline;
}
#if GFX_VER == 7 && !(GFX_VERx10 == 75)
if (pipeline == _3D) {
gen7_emit_cs_stall_flush(batch);
crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
prim.PrimitiveTopologyType = _3DPRIM_POINTLIST;
};
}
#endif
}
/**
* The following diagram shows how we partition the URB:
*
*     16kB or 32kB               Rest of the URB space
*  __________-__________   _________________-_________________
* /                     \ /                                   \
* +-------------------------------------------------------------+
* |  VS/HS/DS/GS/FS Push  |           VS/HS/DS/GS URB           |
* |       Constants       |               Entries               |
* +-------------------------------------------------------------+
*
* Notably, push constants must be stored at the beginning of the URB
* space, while entries can be stored anywhere. Ivybridge and Haswell
* GT1/GT2 have a maximum constant buffer size of 16kB, while Haswell GT3
* doubles this (32kB).
*
* Ivybridge and Haswell GT1/GT2 allow push constants to be located (and
* sized) in increments of 1kB. Haswell GT3 requires them to be located and
* sized in increments of 2kB.
*
* Currently we split the constant buffer space evenly among whatever stages
* are active. This is probably not ideal, but simple.
*
* Ivybridge GT1 and Haswell GT1 have 128kB of URB space.
* Ivybridge GT2 and Haswell GT2 have 256kB of URB space.
* Haswell GT3 has 512kB of URB space.
*
* See "Volume 2a: 3D Pipeline," section 1.8, "Volume 1b: Configurations",
* and the documentation for 3DSTATE_PUSH_CONSTANT_ALLOC_xS.
*/
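/*
 * For example, with 16kB of space and a full five-stage pipeline, each of
 * VS/HS/DS/GS gets 16/5 = 3kB and the FS receives the remainder,
 * 16 - 4*3 = 4kB, as implemented in crocus_alloc_push_constants() below.
 */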
#if GFX_VER >= 7
static void
crocus_alloc_push_constants(struct crocus_batch *batch)
{
#if GFX_VERx10 == 75
const unsigned push_constant_kb = batch->screen->devinfo.gt == 3 ? 32 : 16;
#elif GFX_VER == 8
const unsigned push_constant_kb = 32;
#else
const unsigned push_constant_kb = 16;
#endif
unsigned size_per_stage = push_constant_kb / 5;
/* For now, we set a static partitioning of the push constant area,
* assuming that all stages could be in use.
*
* TODO: Try lazily allocating the HS/DS/GS sections as needed, and
* see if that improves performance by offering more space to
* the VS/FS when those aren't in use. Also, try dynamically
* enabling/disabling it like i965 does. This would be more
* stalls and may not actually help; we don't know yet.
*/
for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) {
crocus_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
alloc._3DCommandSubOpcode = 18 + i;
alloc.ConstantBufferOffset = size_per_stage * i;
alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? (push_constant_kb - 4 * size_per_stage) : size_per_stage;
}
}
/* From p292 of the Ivy Bridge PRM (11.2.4 3DSTATE_PUSH_CONSTANT_ALLOC_PS):
*
* A PIPE_CONTROL command with the CS Stall bit set must be programmed
* in the ring after this instruction.
*
* No such restriction exists for Haswell or Baytrail.
*/
if (!(GFX_VERx10 == 75) && !batch->screen->devinfo.is_baytrail)
gen7_emit_cs_stall_flush(batch);
}
#endif
/**
* Upload the initial GPU state for a render context.
*
* This sets some invariant state that needs to be programmed a particular
* way, but which we never actually change.
*/
static void
crocus_init_render_context(struct crocus_batch *batch)
{
UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
emit_pipeline_select(batch, _3D);
crocus_emit_cmd(batch, GENX(STATE_SIP), foo);
#if GFX_VER >= 7
emit_l3_state(batch, false);
#endif
#if (GFX_VERx10 == 70 || GFX_VERx10 == 80)
crocus_emit_reg(batch, GENX(INSTPM), reg) {
reg.CONSTANT_BUFFERAddressOffsetDisable = true;
reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
}
#endif
#if GFX_VER >= 5 || GFX_VERx10 == 45
/* Use the legacy AA line coverage computation. */
crocus_emit_cmd(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), foo);
#endif
/* No polygon stippling offsets are necessary. */
/* TODO: may need to set an offset for origin-UL framebuffers */
crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo);
#if GFX_VER >= 7
crocus_alloc_push_constants(batch);
#endif
#if GFX_VER == 8
/* Set the initial MSAA sample positions. */
crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_PATTERN), pat) {
INTEL_SAMPLE_POS_1X(pat._1xSample);
INTEL_SAMPLE_POS_2X(pat._2xSample);
INTEL_SAMPLE_POS_4X(pat._4xSample);
INTEL_SAMPLE_POS_8X(pat._8xSample);
}
/* Disable chromakeying (it's for media) */
crocus_emit_cmd(batch, GENX(3DSTATE_WM_CHROMAKEY), foo);
/* We want regular rendering, not special HiZ operations. */
crocus_emit_cmd(batch, GENX(3DSTATE_WM_HZ_OP), foo);
#endif
}
#if GFX_VER >= 7
static void
crocus_init_compute_context(struct crocus_batch *batch)
{
UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
emit_pipeline_select(batch, GPGPU);
#if GFX_VER >= 7
emit_l3_state(batch, true);
#endif
}
#endif
/**
* Generation-specific context state (ice->state.genx->...).
*
* Most state can go in crocus_context directly, but these encode hardware
* packets which vary by generation.
*/
struct crocus_genx_state {
struct {
#if GFX_VER >= 7
struct brw_image_param image_param[PIPE_MAX_SHADER_IMAGES];
#endif
} shaders[MESA_SHADER_STAGES];
#if GFX_VER == 8
bool pma_fix_enabled;
#endif
};
/**
* The pipe->set_blend_color() driver hook.
*
* This corresponds to our COLOR_CALC_STATE.
*/
static void
crocus_set_blend_color(struct pipe_context *ctx,
const struct pipe_blend_color *state)
{
struct crocus_context *ice = (struct crocus_context *) ctx;
/* Our COLOR_CALC_STATE is exactly pipe_blend_color, so just memcpy */
memcpy(&ice->state.blend_color, state, sizeof(struct pipe_blend_color));
#if GFX_VER <= 5
ice->state.dirty |= CROCUS_DIRTY_GEN4_CONSTANT_COLOR;
#else
ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
#endif
}
/**
* Gallium CSO for blend state (see pipe_blend_state).
*/
struct crocus_blend_state {
#if GFX_VER == 8
/** Partial 3DSTATE_PS_BLEND */
uint32_t ps_blend[GENX(3DSTATE_PS_BLEND_length)];
#endif
/** copy of BLEND_STATE */
struct pipe_blend_state cso;
/** Bitfield of whether blending is enabled for RT[i] - for aux resolves */
uint8_t blend_enables;
/** Bitfield of whether color writes are enabled for RT[i] */
uint8_t color_write_enables;
/** Does RT[0] use dual color blending? */
bool dual_color_blending;
};
static enum pipe_blendfactor
fix_blendfactor(enum pipe_blendfactor f, bool alpha_to_one)
{
if (alpha_to_one) {
if (f == PIPE_BLENDFACTOR_SRC1_ALPHA)
return PIPE_BLENDFACTOR_ONE;
if (f == PIPE_BLENDFACTOR_INV_SRC1_ALPHA)
return PIPE_BLENDFACTOR_ZERO;
}
return f;
}
#if GFX_VER >= 6
typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
#endif
static bool
can_emit_logic_op(struct crocus_context *ice)
{
/* All pre-Gen8 hardware restricts logic ops to UNORM formats. */
enum pipe_format pformat = PIPE_FORMAT_NONE;
for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
if (ice->state.framebuffer.cbufs[i]) {
pformat = ice->state.framebuffer.cbufs[i]->format;
break;
}
}
return (pformat == PIPE_FORMAT_NONE || util_format_is_unorm(pformat));
}
static bool
set_blend_entry_bits(struct crocus_batch *batch, BLEND_ENTRY_GENXML *entry,
struct crocus_blend_state *cso_blend,
int idx)
{
struct crocus_context *ice = batch->ice;
bool independent_alpha_blend = false;
const struct pipe_rt_blend_state *rt =
&cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? idx : 0];
const unsigned blend_enabled = rt->blend_enable;
enum pipe_blendfactor src_rgb =
fix_blendfactor(rt->rgb_src_factor, cso_blend->cso.alpha_to_one);
enum pipe_blendfactor src_alpha =
fix_blendfactor(rt->alpha_src_factor, cso_blend->cso.alpha_to_one);
enum pipe_blendfactor dst_rgb =
fix_blendfactor(rt->rgb_dst_factor, cso_blend->cso.alpha_to_one);
enum pipe_blendfactor dst_alpha =
fix_blendfactor(rt->alpha_dst_factor, cso_blend->cso.alpha_to_one);
if (rt->rgb_func != rt->alpha_func ||
src_rgb != src_alpha || dst_rgb != dst_alpha)
independent_alpha_blend = true;
if (cso_blend->cso.logicop_enable) {
if (GFX_VER >= 8 || can_emit_logic_op(ice)) {
entry->LogicOpEnable = cso_blend->cso.logicop_enable;
entry->LogicOpFunction = cso_blend->cso.logicop_func;
}
} else if (blend_enabled) {
if (idx == 0) {
struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
entry->ColorBufferBlendEnable =
(!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
} else
entry->ColorBufferBlendEnable = 1;
entry->ColorBlendFunction = rt->rgb_func;
entry->AlphaBlendFunction = rt->alpha_func;
entry->SourceBlendFactor = (int) src_rgb;
entry->SourceAlphaBlendFactor = (int) src_alpha;
entry->DestinationBlendFactor = (int) dst_rgb;
entry->DestinationAlphaBlendFactor = (int) dst_alpha;
}
#if GFX_VER <= 5
/*
* Gen4/GM45/ILK can't handle ColorBufferBlendEnable == 0 when a
* dual-source blend shader is in use. Set up dummy blending.
*/
struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
if (idx == 0 && !blend_enabled && wm_prog_data->dual_src_blend) {
entry->ColorBufferBlendEnable = 1;
entry->ColorBlendFunction = PIPE_BLEND_ADD;
entry->AlphaBlendFunction = PIPE_BLEND_ADD;
entry->SourceBlendFactor = PIPE_BLENDFACTOR_ONE;
entry->SourceAlphaBlendFactor = PIPE_BLENDFACTOR_ONE;
entry->DestinationBlendFactor = PIPE_BLENDFACTOR_ZERO;
entry->DestinationAlphaBlendFactor = PIPE_BLENDFACTOR_ZERO;
}
#endif
return independent_alpha_blend;
}
/**
* The pipe->create_blend_state() driver hook.
*
* Translates a pipe_blend_state into crocus_blend_state.
*/
static void *
crocus_create_blend_state(struct pipe_context *ctx,
const struct pipe_blend_state *state)
{
struct crocus_blend_state *cso = malloc(sizeof(struct crocus_blend_state));
cso->blend_enables = 0;
cso->color_write_enables = 0;
STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS <= 8);
cso->cso = *state;
cso->dual_color_blending = util_blend_state_is_dual(state, 0);
#if GFX_VER == 8
bool indep_alpha_blend = false;
#endif
for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
const struct pipe_rt_blend_state *rt =
&state->rt[state->independent_blend_enable ? i : 0];
if (rt->blend_enable)
cso->blend_enables |= 1u << i;
if (rt->colormask)
cso->color_write_enables |= 1u << i;
#if GFX_VER == 8
enum pipe_blendfactor src_rgb =
fix_blendfactor(rt->rgb_src_factor, state->alpha_to_one);
enum pipe_blendfactor src_alpha =
fix_blendfactor(rt->alpha_src_factor, state->alpha_to_one);
enum pipe_blendfactor dst_rgb =
fix_blendfactor(rt->rgb_dst_factor, state->alpha_to_one);
enum pipe_blendfactor dst_alpha =
fix_blendfactor(rt->alpha_dst_factor, state->alpha_to_one);
if (rt->rgb_func != rt->alpha_func ||
src_rgb != src_alpha || dst_rgb != dst_alpha)
indep_alpha_blend = true;
#endif
}
#if GFX_VER == 8
crocus_pack_command(GENX(3DSTATE_PS_BLEND), cso->ps_blend, pb) {
/* pb.HasWriteableRT is filled in at draw time.
* pb.AlphaTestEnable is filled in at draw time.
*
* pb.ColorBufferBlendEnable is filled in at draw time so we can avoid
* setting it when dual color blending without an appropriate shader.
*/
pb.AlphaToCoverageEnable = state->alpha_to_coverage;
pb.IndependentAlphaBlendEnable = indep_alpha_blend;
/* The casts prevent warnings about implicit enum type conversions. */
pb.SourceBlendFactor =
(int) fix_blendfactor(state->rt[0].rgb_src_factor, state->alpha_to_one);
pb.SourceAlphaBlendFactor =
(int) fix_blendfactor(state->rt[0].alpha_src_factor, state->alpha_to_one);
pb.DestinationBlendFactor =
(int) fix_blendfactor(state->rt[0].rgb_dst_factor, state->alpha_to_one);
pb.DestinationAlphaBlendFactor =
(int) fix_blendfactor(state->rt[0].alpha_dst_factor, state->alpha_to_one);
}
#endif
return cso;
}
/**
* The pipe->bind_blend_state() driver hook.
*
* Bind a blending CSO and flag related dirty bits.
*/
static void
crocus_bind_blend_state(struct pipe_context *ctx, void *state)
{
struct crocus_context *ice = (struct crocus_context *) ctx;
struct crocus_blend_state *cso = state;
ice->state.cso_blend = cso;
ice->state.blend_enables = cso ? cso->blend_enables : 0;
ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;
ice->state.dirty |= CROCUS_DIRTY_WM;
#if GFX_VER >= 6
ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
#endif
#if GFX_VER >= 7
ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
#endif
#if GFX_VER == 8
ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
#endif
ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_BLEND];
}
/**
* Return true if the FS writes to any color outputs which are not disabled
* via color masking.
*/
static bool
has_writeable_rt(const struct crocus_blend_state *cso_blend,
const struct shader_info *fs_info)
{
if (!fs_info)
return false;
unsigned rt_outputs = fs_info->outputs_written >> FRAG_RESULT_DATA0;
if (fs_info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR))
rt_outputs = (1 << BRW_MAX_DRAW_BUFFERS) - 1;
return cso_blend->color_write_enables & rt_outputs;
}
/**
* Gallium CSO for depth, stencil, and alpha testing state.
*/
struct crocus_depth_stencil_alpha_state {
struct pipe_depth_stencil_alpha_state cso;
bool depth_writes_enabled;
bool stencil_writes_enabled;
};
/**
* The pipe->create_depth_stencil_alpha_state() driver hook.
*
* We encode most of 3DSTATE_WM_DEPTH_STENCIL, and just save off the alpha
* testing state since we need pieces of it in a variety of places.
*/
static void *
crocus_create_zsa_state(struct pipe_context *ctx,
const struct pipe_depth_stencil_alpha_state *state)
{
struct crocus_depth_stencil_alpha_state *cso =
malloc(sizeof(struct crocus_depth_stencil_alpha_state));
bool two_sided_stencil = state->stencil[1].enabled;
cso->cso = *state;
cso->depth_writes_enabled = state->depth_writemask;
cso->stencil_writes_enabled =
state->stencil[0].writemask != 0 ||
(two_sided_stencil && state->stencil[1].writemask != 0);
/* The state tracker needs to optimize away EQUAL writes for us. */
assert(!(state->depth_func == PIPE_FUNC_EQUAL && state->depth_writemask));
return cso;
}
/**
* The pipe->bind_depth_stencil_alpha_state() driver hook.
*
* Bind a depth/stencil/alpha CSO and flag related dirty bits.
*/
static void
crocus_bind_zsa_state(struct pipe_context *ctx, void *state)
{
struct crocus_context *ice = (struct crocus_context *) ctx;
struct crocus_depth_stencil_alpha_state *old_cso = ice->state.cso_zsa;
struct crocus_depth_stencil_alpha_state *new_cso = state;
if (new_cso) {
if (cso_changed(cso.alpha_ref_value))
ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
if (cso_changed(cso.alpha_enabled))
ice->state.dirty |= CROCUS_DIRTY_WM;
#if GFX_VER >= 6
if (cso_changed(cso.alpha_enabled))
ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
if (cso_changed(cso.alpha_func))
ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
#endif
#if GFX_VER == 8
if (cso_changed(cso.alpha_enabled))
ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
#endif
if (cso_changed(depth_writes_enabled))
ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
ice->state.depth_writes_enabled = new_cso->depth_writes_enabled;
ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled;
#if GFX_VER <= 5
ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
#endif
}
ice->state.cso_zsa = new_cso;
ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
#if GFX_VER >= 6
ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
#endif
#if GFX_VER == 8
ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
#endif
ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_DEPTH_STENCIL_ALPHA];
}
#if GFX_VER == 8
static bool
want_pma_fix(struct crocus_context *ice)
{
UNUSED struct crocus_screen *screen = (void *) ice->ctx.screen;
UNUSED const struct intel_device_info *devinfo = &screen->devinfo;
const struct brw_wm_prog_data *wm_prog_data = (void *)
ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
const struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
const struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
const struct crocus_blend_state *cso_blend = ice->state.cso_blend;
/* In very specific combinations of state, we can instruct Gfx8-9 hardware
* to avoid stalling at the pixel mask array. The state equations are
* documented in these places:
*
* - Gfx8 Depth PMA Fix: CACHE_MODE_1::NP_PMA_FIX_ENABLE
* - Gfx9 Stencil PMA Fix: CACHE_MODE_0::STC PMA Optimization Enable
*
* Both equations share some common elements:
*
* no_hiz_op =
* !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
* 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
* 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
* 3DSTATE_WM_HZ_OP::StencilBufferClear) &&
*
* killpixels =
* 3DSTATE_WM::ForceKillPix != ForceOff &&
* (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
* 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
* 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
* 3DSTATE_PS_BLEND::AlphaTestEnable ||
* 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
*
* (Technically the stencil PMA treats ForceKillPix differently,
* but I think this is a documentation oversight, and we don't
* ever use it in this way, so it doesn't matter).
*
* common_pma_fix =
* 3DSTATE_WM::ForceThreadDispatch != 1 &&
* 3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 &&
* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
* 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS &&
* 3DSTATE_PS_EXTRA::PixelShaderValid &&
* no_hiz_op
*
* These are always true:
*
* 3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0
* 3DSTATE_PS_EXTRA::PixelShaderValid
*
* Also, we never use the normal drawing path for HiZ ops; these are true:
*
* !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
* 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
* 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
* 3DSTATE_WM_HZ_OP::StencilBufferClear)
*
* This happens sometimes:
*
* 3DSTATE_WM::ForceThreadDispatch != 1
*
* However, we choose to ignore it as it either agrees with the signal
* (dispatch was already enabled, so nothing out of the ordinary), or
* there are no framebuffer attachments (so no depth or HiZ anyway,
* meaning the PMA signal will already be disabled).
*/
if (!cso_fb->zsbuf)
return false;
struct crocus_resource *zres, *sres;
crocus_get_depth_stencil_resources(devinfo,
cso_fb->zsbuf->texture, &zres, &sres);
/* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
* 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
*/
if (!zres || !crocus_resource_level_has_hiz(zres, cso_fb->zsbuf->u.tex.level))
return false;
/* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS */
if (wm_prog_data->early_fragment_tests)
return false;
/* 3DSTATE_WM::ForceKillPix != ForceOff &&
* (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
* 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
* 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
* 3DSTATE_PS_BLEND::AlphaTestEnable ||
* 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
*/
bool killpixels = wm_prog_data->uses_kill || wm_prog_data->uses_omask ||
cso_blend->cso.alpha_to_coverage || cso_zsa->cso.alpha_enabled;
/* The Gfx8 depth PMA equation becomes:
*
* depth_writes =
* 3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
* 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE
*
* stencil_writes =
* 3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
* 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE
*
* Z_PMA_OPT =
* common_pma_fix &&
* 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable &&
* ((killpixels && (depth_writes || stencil_writes)) ||
* 3DSTATE_PS_EXTRA::PixelShaderComputedDepthMode != PSCDEPTH_OFF)
*
*/
if (!cso_zsa->cso.depth_enabled)
return false;
return wm_prog_data->computed_depth_mode != PSCDEPTH_OFF ||
(killpixels && (cso_zsa->depth_writes_enabled ||
(sres && cso_zsa->stencil_writes_enabled)));
}
#endif
void
genX(crocus_update_pma_fix)(struct crocus_context *ice,
struct crocus_batch *batch,
bool enable)
{
#if GFX_VER == 8
struct crocus_genx_state *genx = ice->state.genx;
if (genx->pma_fix_enabled == enable)
return;
genx->pma_fix_enabled = enable;
/* According to the Broadwell PIPE_CONTROL documentation, software should
* emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
    * prior to the LRI.  If stencil buffer writes are enabled, then a Render
    * Cache Flush is also necessary.
*
* The Gfx9 docs say to use a depth stall rather than a command streamer
* stall. However, the hardware seems to violently disagree. A full
* command streamer stall seems to be needed in both cases.
*/
crocus_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
PIPE_CONTROL_CS_STALL |
PIPE_CONTROL_DEPTH_CACHE_FLUSH |
PIPE_CONTROL_RENDER_TARGET_FLUSH);
crocus_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
reg.NPPMAFixEnable = enable;
reg.NPEarlyZFailsDisable = enable;
reg.NPPMAFixEnableMask = true;
reg.NPEarlyZFailsDisableMask = true;
}
/* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
* Flush bits is often necessary. We do it regardless because it's easier.
* The render cache flush is also necessary if stencil writes are enabled.
*
* Again, the Gfx9 docs give a different set of flushes but the Broadwell
* flushes seem to work just as well.
*/
crocus_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
PIPE_CONTROL_DEPTH_STALL |
PIPE_CONTROL_DEPTH_CACHE_FLUSH |
PIPE_CONTROL_RENDER_TARGET_FLUSH);
#endif
}
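
/* Illustrative only: a caller (e.g. the dirty-state upload path) would
 * typically recompute the desired PMA state with want_pma_fix() and let
 * the function above early-out when nothing changed.  A minimal sketch,
 * assuming `ice` and `batch` are the current context and render batch:
 *
 *    #if GFX_VER == 8
 *       genX(crocus_update_pma_fix)(ice, batch, want_pma_fix(ice));
 *    #endif
 */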
static float
get_line_width(const struct pipe_rasterizer_state *state)
{
float line_width = state->line_width;
/* From the OpenGL 4.4 spec:
*
* "The actual width of non-antialiased lines is determined by rounding
* the supplied width to the nearest integer, then clamping it to the
* implementation-dependent maximum non-antialiased line width."
*/
if (!state->multisample && !state->line_smooth)
line_width = roundf(state->line_width);
if (!state->multisample && state->line_smooth && line_width < 1.5f) {
/* For 1 pixel line thickness or less, the general anti-aliasing
* algorithm gives up, and a garbage line is generated. Setting a
* Line Width of 0.0 specifies the rasterization of the "thinnest"
* (one-pixel-wide), non-antialiased lines.
*
* Lines rendered with zero Line Width are rasterized using the
* "Grid Intersection Quantization" rules as specified by the
* "Zero-Width (Cosmetic) Line Rasterization" section of the docs.
*/
line_width = 0.0f;
}
return line_width;
}
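
/* Worked examples of the rules above (illustrative only):
 *
 *   - width 1.4, no MSAA, no smoothing: roundf(1.4) = 1.0 (integer rule)
 *   - width 1.4, smoothing enabled, no MSAA: 1.4 < 1.5, so 0.0 is used
 *     and the line falls back to GIQ ("cosmetic") rasterization
 *   - width 1.4 with multisampling: passed through unmodified
 */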
/**
* The pipe->create_rasterizer_state() driver hook.
*/
static void *
crocus_create_rasterizer_state(struct pipe_context *ctx,
const struct pipe_rasterizer_state *state)
{
struct crocus_rasterizer_state *cso =
malloc(sizeof(struct crocus_rasterizer_state));
cso->fill_mode_point_or_line =
state->fill_front == PIPE_POLYGON_MODE_LINE ||
state->fill_front == PIPE_POLYGON_MODE_POINT ||
state->fill_back == PIPE_POLYGON_MODE_LINE ||
state->fill_back == PIPE_POLYGON_MODE_POINT;
if (state->clip_plane_enable != 0)
cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1;
else
cso->num_clip_plane_consts = 0;
cso->cso = *state;
#if GFX_VER >= 6
float line_width = get_line_width(state);
crocus_pack_command(GENX(3DSTATE_SF), cso->sf, sf) {
sf.StatisticsEnable = true;
sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
sf.LineEndCapAntialiasingRegionWidth =
state->line_smooth ? _10pixels : _05pixels;
sf.LastPixelEnable = state->line_last_pixel;
#if GFX_VER == 8
struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
if (screen->devinfo.is_cherryview)
sf.CHVLineWidth = line_width;
else
sf.LineWidth = line_width;
#else
sf.LineWidth = line_width;
#endif
sf.PointWidthSource = state->point_size_per_vertex ? Vertex : State;
sf.PointWidth = state->point_size;
if (state->flatshade_first) {
sf.TriangleFanProvokingVertexSelect = 1;
} else {
sf.TriangleStripListProvokingVertexSelect = 2;
sf.TriangleFanProvokingVertexSelect = 2;
sf.LineStripListProvokingVertexSelect = 1;
}
#if GFX_VER == 6
sf.AttributeSwizzleEnable = true;
if (state->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
else
sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
#endif
#if GFX_VER <= 7
sf.FrontWinding = state->front_ccw ? 1 : 0; // Or the other way...
#if GFX_VER >= 6
sf.GlobalDepthOffsetEnableSolid = state->offset_tri;
sf.GlobalDepthOffsetEnableWireframe = state->offset_line;
sf.GlobalDepthOffsetEnablePoint = state->offset_point;
sf.GlobalDepthOffsetConstant = state->offset_units * 2;
sf.GlobalDepthOffsetScale = state->offset_scale;
sf.GlobalDepthOffsetClamp = state->offset_clamp;
sf.FrontFaceFillMode = translate_fill_mode(state->fill_front);
sf.BackFaceFillMode = translate_fill_mode(state->fill_back);
#endif
sf.CullMode = translate_cull_mode(state->cull_face);
sf.ScissorRectangleEnable = true;
#if GFX_VERx10 == 75
sf.LineStippleEnable = state->line_stipple_enable;
#endif
#endif
}
#endif
#if GFX_VER == 8
crocus_pack_command(GENX(3DSTATE_RASTER), cso->raster, rr) {
rr.FrontWinding = state->front_ccw ? CounterClockwise : Clockwise;
rr.CullMode = translate_cull_mode(state->cull_face);
rr.FrontFaceFillMode = translate_fill_mode(state->fill_front);
rr.BackFaceFillMode = translate_fill_mode(state->fill_back);
rr.DXMultisampleRasterizationEnable = state->multisample;
rr.GlobalDepthOffsetEnableSolid = state->offset_tri;
rr.GlobalDepthOffsetEnableWireframe = state->offset_line;
rr.GlobalDepthOffsetEnablePoint = state->offset_point;
rr.GlobalDepthOffsetConstant = state->offset_units * 2;
rr.GlobalDepthOffsetScale = state->offset_scale;
rr.GlobalDepthOffsetClamp = state->offset_clamp;
rr.SmoothPointEnable = state->point_smooth;
rr.AntialiasingEnable = state->line_smooth;
rr.ScissorRectangleEnable = state->scissor;
rr.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
}
#endif
#if GFX_VER >= 6
crocus_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) {
/* cl.NonPerspectiveBarycentricEnable is filled in at draw time from
* the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB.
*/
#if GFX_VER >= 7
cl.EarlyCullEnable = true;
#endif
#if GFX_VER == 7
cl.FrontWinding = state->front_ccw ? 1 : 0;
cl.CullMode = translate_cull_mode(state->cull_face);
#endif
cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable;
#if GFX_VER < 8
cl.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
#endif
cl.APIMode = state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
cl.GuardbandClipTestEnable = true;
cl.ClipEnable = true;
cl.MinimumPointWidth = 0.125;
cl.MaximumPointWidth = 255.875;
#if GFX_VER == 8
cl.ForceUserClipDistanceClipTestEnableBitmask = true;
#endif
if (state->flatshade_first) {
cl.TriangleFanProvokingVertexSelect = 1;
} else {
cl.TriangleStripListProvokingVertexSelect = 2;
cl.TriangleFanProvokingVertexSelect = 2;
cl.LineStripListProvokingVertexSelect = 1;
}
}
#endif
/* Remap from 0..255 back to 1..256 */
const unsigned line_stipple_factor = state->line_stipple_factor + 1;
crocus_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) {
if (state->line_stipple_enable) {
line.LineStipplePattern = state->line_stipple_pattern;
line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor;
line.LineStippleRepeatCount = line_stipple_factor;
}
}
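
   /* Illustrative math: a pipe factor of 15 becomes a repeat count of 16,
    * with LineStippleInverseRepeatCount = 1.0f / 16 = 0.0625; the hardware
    * uses the reciprocal to step through the 16-bit stipple pattern.
    */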
return cso;
}
/**
* The pipe->bind_rasterizer_state() driver hook.
*
* Bind a rasterizer CSO and flag related dirty bits.
*/
static void
crocus_bind_rasterizer_state(struct pipe_context *ctx, void *state)
{
struct crocus_context *ice = (struct crocus_context *) ctx;
struct crocus_rasterizer_state *old_cso = ice->state.cso_rast;
struct crocus_rasterizer_state *new_cso = state;
if (new_cso) {
/* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */
if (cso_changed_memcmp(line_stipple))
ice->state.dirty |= CROCUS_DIRTY_LINE_STIPPLE;
#if GFX_VER >= 6
if (cso_changed(cso.half_pixel_center))
ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
if (cso_changed(cso.scissor))
ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
if (cso_changed(cso.multisample))
ice->state.dirty |= CROCUS_DIRTY_WM;
#else
if (cso_changed(cso.scissor))
ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
#endif
if (cso_changed(cso.line_stipple_enable) || cso_changed(cso.poly_stipple_enable))
ice->state.dirty |= CROCUS_DIRTY_WM;
#if GFX_VER >= 6
if (cso_changed(cso.rasterizer_discard))
ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP;
if (cso_changed(cso.flatshade_first))
ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
#endif
if (cso_changed(cso.depth_clip_near) || cso_changed(cso.depth_clip_far) ||
cso_changed(cso.clip_halfz))
ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
#if GFX_VER >= 7
if (cso_changed(cso.sprite_coord_enable) ||
cso_changed(cso.sprite_coord_mode) ||
cso_changed(cso.light_twoside))
ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
#endif
#if GFX_VER <= 5
if (cso_changed(cso.clip_plane_enable))
ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
#endif
}
ice->state.cso_rast = new_cso;
ice->state.dirty |= CROCUS_DIRTY_RASTER;
ice->state.dirty |= CROCUS_DIRTY_CLIP;
#if GFX_VER <= 5
ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
ice->state.dirty |= CROCUS_DIRTY_WM;
#endif
#if GFX_VER <= 6
ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
#endif
ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_RASTERIZER];
}
/**
* Return true if the given wrap mode requires the border color to exist.
*
* (We can skip uploading it if the sampler isn't going to use it.)
*/
static bool
wrap_mode_needs_border_color(unsigned wrap_mode)
{
#if GFX_VER == 8
return wrap_mode == TCM_CLAMP_BORDER || wrap_mode == TCM_HALF_BORDER;
#else
return wrap_mode == TCM_CLAMP_BORDER;
#endif
}
/**
* Gallium CSO for sampler state.
*/
struct crocus_sampler_state {
struct pipe_sampler_state pstate;
union pipe_color_union border_color;
bool needs_border_color;
unsigned wrap_s;
unsigned wrap_t;
unsigned wrap_r;
unsigned mag_img_filter;
float min_lod;
};
/**
* The pipe->create_sampler_state() driver hook.
*
* We fill out SAMPLER_STATE (except for the border color pointer), and
* store that on the CPU. It doesn't make sense to upload it to a GPU
* buffer object yet, because 3DSTATE_SAMPLER_STATE_POINTERS requires
 * all bound sampler states to be in contiguous memory.
*/
static void *
crocus_create_sampler_state(struct pipe_context *ctx,
const struct pipe_sampler_state *state)
{
struct crocus_sampler_state *cso = CALLOC_STRUCT(crocus_sampler_state);
if (!cso)
return NULL;
STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST);
STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR);
bool either_nearest = state->min_img_filter == PIPE_TEX_FILTER_NEAREST ||
state->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
cso->wrap_s = translate_wrap(state->wrap_s, either_nearest);
cso->wrap_t = translate_wrap(state->wrap_t, either_nearest);
cso->wrap_r = translate_wrap(state->wrap_r, either_nearest);
cso->pstate = *state;
memcpy(&cso->border_color, &state->border_color, sizeof(cso->border_color));
cso->needs_border_color = wrap_mode_needs_border_color(cso->wrap_s) ||
wrap_mode_needs_border_color(cso->wrap_t) ||
wrap_mode_needs_border_color(cso->wrap_r);
cso->min_lod = state->min_lod;
cso->mag_img_filter = state->mag_img_filter;
// XXX: explain this code ported from ilo...I don't get it at all...
if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
state->min_lod > 0.0f) {
cso->min_lod = 0.0f;
cso->mag_img_filter = state->min_img_filter;
}
return cso;
}
/**
* The pipe->bind_sampler_states() driver hook.
*/
static void
crocus_bind_sampler_states(struct pipe_context *ctx,
enum pipe_shader_type p_stage,
unsigned start, unsigned count,
void **states)
{
struct crocus_context *ice = (struct crocus_context *) ctx;
gl_shader_stage stage = stage_from_pipe(p_stage);
struct crocus_shader_state *shs = &ice->state.shaders[stage];
assert(start + count <= CROCUS_MAX_TEXTURE_SAMPLERS);
bool dirty = false;
for (int i = 0; i < count; i++) {
if (shs->samplers[start + i] != states[i]) {
shs->samplers[start + i] = states[i];
dirty = true;
}
}
if (dirty) {
#if GFX_VER <= 5
if (p_stage == PIPE_SHADER_FRAGMENT)
ice->state.dirty |= CROCUS_DIRTY_WM;
else if (p_stage == PIPE_SHADER_VERTEX)
ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
#endif
ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
}
}
enum samp_workaround {
SAMP_NORMAL,
SAMP_CUBE_CLAMP,
SAMP_CUBE_CUBE,
SAMP_T_WRAP,
};
static void
crocus_upload_sampler_state(struct crocus_batch *batch,
struct crocus_sampler_state *cso,
uint32_t border_color_offset,
enum samp_workaround samp_workaround,
uint32_t first_level,
void *map)
{
struct pipe_sampler_state *state = &cso->pstate;
uint32_t wrap_s, wrap_t, wrap_r;
wrap_s = cso->wrap_s;
wrap_t = cso->wrap_t;
wrap_r = cso->wrap_r;
switch (samp_workaround) {
case SAMP_CUBE_CLAMP:
wrap_s = TCM_CLAMP;
wrap_t = TCM_CLAMP;
wrap_r = TCM_CLAMP;
break;
case SAMP_CUBE_CUBE:
wrap_s = TCM_CUBE;
wrap_t = TCM_CUBE;
wrap_r = TCM_CUBE;
break;
case SAMP_T_WRAP:
wrap_t = TCM_WRAP;
break;
default:
break;
}
_crocus_pack_state(batch, GENX(SAMPLER_STATE), map, samp) {
samp.TCXAddressControlMode = wrap_s;
samp.TCYAddressControlMode = wrap_t;
samp.TCZAddressControlMode = wrap_r;
#if GFX_VER >= 6
samp.NonnormalizedCoordinateEnable = !state->normalized_coords;
#endif
samp.MinModeFilter = state->min_img_filter;
samp.MagModeFilter = cso->mag_img_filter;
samp.MipModeFilter = translate_mip_filter(state->min_mip_filter);
samp.MaximumAnisotropy = RATIO21;
if (state->max_anisotropy >= 2) {
if (state->min_img_filter == PIPE_TEX_FILTER_LINEAR) {
samp.MinModeFilter = MAPFILTER_ANISOTROPIC;
#if GFX_VER >= 7
samp.AnisotropicAlgorithm = EWAApproximation;
#endif
}
if (state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)
samp.MagModeFilter = MAPFILTER_ANISOTROPIC;
samp.MaximumAnisotropy =
MIN2((state->max_anisotropy - 2) / 2, RATIO161);
}
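
      /* The MaximumAnisotropy field encodes the ratio as (ratio - 2) / 2,
       * so e.g. state->max_anisotropy == 4 yields 1 (RATIO41) and 16
       * yields 7 (RATIO161); the MIN2 above clamps anything larger to 16:1.
       */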
/* Set address rounding bits if not using nearest filtering. */
if (state->min_img_filter != PIPE_TEX_FILTER_NEAREST) {
samp.UAddressMinFilterRoundingEnable = true;
samp.VAddressMinFilterRoundingEnable = true;
samp.RAddressMinFilterRoundingEnable = true;
}
if (state->mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
samp.UAddressMagFilterRoundingEnable = true;
samp.VAddressMagFilterRoundingEnable = true;
samp.RAddressMagFilterRoundingEnable = true;
}
if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
samp.ShadowFunction = translate_shadow_func(state->compare_func);
const float hw_max_lod = GFX_VER >= 7 ? 14 : 13;
#if GFX_VER == 8
samp.LODPreClampMode = CLAMP_MODE_OGL;
#else
samp.LODPreClampEnable = true;
#endif
samp.MinLOD = CLAMP(cso->min_lod, 0, hw_max_lod);
samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod);
samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15);
#if GFX_VER == 6
samp.BaseMipLevel = CLAMP(first_level, 0, hw_max_lod);
samp.MinandMagStateNotEqual = samp.MinModeFilter != samp.MagModeFilter;
#endif
#if GFX_VER < 6
samp.BorderColorPointer =
ro_bo(batch->state.bo, border_color_offset);
#else
samp.BorderColorPointer = border_color_offset;
#endif
}
}
static void
crocus_upload_border_color(struct crocus_batch *batch,
struct crocus_sampler_state *cso,
struct crocus_sampler_view *tex,
uint32_t *bc_offset)
{
/* We may need to swizzle the border color for format faking.
* A/LA formats are faked as R/RG with 000R or R00G swizzles.
* This means we need to move the border color's A channel into
* the R or G channels so that those read swizzles will move it
* back into A.
*/
enum pipe_format internal_format = PIPE_FORMAT_NONE;
union pipe_color_union *color = &cso->border_color;
union pipe_color_union tmp;
if (tex) {
internal_format = tex->res->internal_format;
if (util_format_is_alpha(internal_format)) {
unsigned char swz[4] = {
PIPE_SWIZZLE_0, PIPE_SWIZZLE_0,
PIPE_SWIZZLE_0, PIPE_SWIZZLE_W,
};
util_format_apply_color_swizzle(&tmp, color, swz, true);
color = &tmp;
} else if (util_format_is_luminance_alpha(internal_format) &&
internal_format != PIPE_FORMAT_L8A8_SRGB) {
unsigned char swz[4] = {
PIPE_SWIZZLE_X, PIPE_SWIZZLE_X,
PIPE_SWIZZLE_X, PIPE_SWIZZLE_W
};
util_format_apply_color_swizzle(&tmp, color, swz, true);
color = &tmp;
}
}
bool is_integer_format = util_format_is_pure_integer(internal_format);
unsigned sbc_size = GENX(SAMPLER_BORDER_COLOR_STATE_length) * 4;
const int sbc_align = (GFX_VER == 8 ? 64 : ((GFX_VERx10 == 75 && is_integer_format) ? 512 : 32));
uint32_t *sbc = stream_state(batch, sbc_size, sbc_align, bc_offset);
struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };
#define ASSIGN(dst, src) \
do { \
dst = src; \
} while (0)
#define ASSIGNu16(dst, src) \
do { \
dst = (uint16_t)src; \
} while (0)
#define ASSIGNu8(dst, src) \
do { \
dst = (uint8_t)src; \
} while (0)
#define BORDER_COLOR_ATTR(macro, _color_type, src) \
macro(state.BorderColor ## _color_type ## Red, src[0]); \
macro(state.BorderColor ## _color_type ## Green, src[1]); \
macro(state.BorderColor ## _color_type ## Blue, src[2]); \
macro(state.BorderColor ## _color_type ## Alpha, src[3]);
#if GFX_VER >= 8
/* On Broadwell, the border color is represented as four 32-bit floats,
* integers, or unsigned values, interpreted according to the surface
* format. This matches the sampler->BorderColor union exactly; just
* memcpy the values.
*/
BORDER_COLOR_ATTR(ASSIGN, 32bit, color->ui);
#elif GFX_VERx10 == 75
if (is_integer_format) {
const struct util_format_description *format_desc =
util_format_description(internal_format);
/* From the Haswell PRM, "Command Reference: Structures", Page 36:
* "If any color channel is missing from the surface format,
* corresponding border color should be programmed as zero and if
* alpha channel is missing, corresponding Alpha border color should
* be programmed as 1."
*/
unsigned c[4] = { 0, 0, 0, 1 };
for (int i = 0; i < 4; i++) {
if (format_desc->channel[i].size)
c[i] = color->ui[i];
}
switch (format_desc->channel[0].size) {
case 8:
/* Copy RGBA in order. */
BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
break;
case 10:
/* R10G10B10A2_UINT is treated like a 16-bit format. */
case 16:
BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
break;
case 32:
if (format_desc->channel[1].size && !format_desc->channel[2].size) {
/* Careful inspection of the tables reveals that for RG32 formats,
* the green channel needs to go where blue normally belongs.
*/
state.BorderColor32bitRed = c[0];
state.BorderColor32bitBlue = c[1];
state.BorderColor32bitAlpha = 1;
} else {
/* Copy RGBA in order. */
BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
}
break;
default:
assert(!"Invalid number of bits per channel in integer format.");
break;
}
} else {
BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
}
#elif GFX_VER == 5 || GFX_VER == 6
BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color->f);
BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color->f);
BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color->f);
#define MESA_FLOAT_TO_HALF(dst, src) \
dst = _mesa_float_to_half(src);
BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color->f);
#undef MESA_FLOAT_TO_HALF
state.BorderColorSnorm8Red = state.BorderColorSnorm16Red >> 8;
state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
state.BorderColorSnorm8Blue = state.BorderColorSnorm16Blue >> 8;
state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;
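   /* The Snorm8 values are derived by truncating the Snorm16 ones, e.g. a
    * border red of 1.0f becomes 32767 as Snorm16, and 32767 >> 8 == 127,
    * the Snorm8 maximum.
    */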
BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
#elif GFX_VER == 4
BORDER_COLOR_ATTR(ASSIGN, , color->f);
#else
BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
#endif
#undef ASSIGN
#undef BORDER_COLOR_ATTR
GENX(SAMPLER_BORDER_COLOR_STATE_pack)(batch, sbc, &state);
}
/**
 * Upload the sampler states into a contiguous area of GPU memory, for
 * use by 3DSTATE_SAMPLER_STATE_POINTERS_*.
*
* Also fill out the border color state pointers.
*/
static void
crocus_upload_sampler_states(struct crocus_context *ice,
struct crocus_batch *batch, gl_shader_stage stage)
{
struct crocus_shader_state *shs = &ice->state.shaders[stage];
const struct shader_info *info = crocus_get_shader_info(ice, stage);
/* We assume the state tracker will call pipe->bind_sampler_states()
* if the program's number of textures changes.
*/
unsigned count = info ? BITSET_LAST_BIT(info->textures_used) : 0;
if (!count)
return;
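   /* Example: if the shader uses textures 0 and 2, textures_used has bits
    * 0 and 2 set, BITSET_LAST_BIT() returns 3, and the loop below emits
    * three SAMPLER_STATEs, zeroing out the unused slot 1.
    */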
/* Assemble the SAMPLER_STATEs into a contiguous table that lives
* in the dynamic state memory zone, so we can point to it via the
* 3DSTATE_SAMPLER_STATE_POINTERS_* commands.
*/
unsigned size = count * 4 * GENX(SAMPLER_STATE_length);
uint32_t *map = stream_state(batch, size, 32, &shs->sampler_offset);
if (unlikely(!map))
return;
for (int i = 0; i < count; i++) {
struct crocus_sampler_state *state = shs->samplers[i];
struct crocus_sampler_view *tex = shs->textures[i];
if (!state || !tex) {
memset(map, 0, 4 * GENX(SAMPLER_STATE_length));
} else {
unsigned border_color_offset = 0;
if (state->needs_border_color) {
crocus_upload_border_color(batch, state, tex, &border_color_offset);
}
enum samp_workaround wa = SAMP_NORMAL;
/* There's a bug in 1D texture sampling - it actually pays
* attention to the wrap_t value, though it should not.
* Override the wrap_t value here to GL_REPEAT to keep
* any nonexistent border pixels from floating in.
*/
if (tex->base.target == PIPE_TEXTURE_1D)
wa = SAMP_T_WRAP;
else if (tex->base.target == PIPE_TEXTURE_CUBE ||
tex->base.target == PIPE_TEXTURE_CUBE_ARRAY) {
/* Cube maps must use the same wrap mode for all three coordinate
* dimensions. Prior to Haswell, only CUBE and CLAMP are valid.
*
* Ivybridge and Baytrail seem to have problems with CUBE mode and
* integer formats. Fall back to CLAMP for now.
*/
if (state->pstate.seamless_cube_map &&
!(GFX_VERx10 == 70 && util_format_is_pure_integer(tex->base.format)))
wa = SAMP_CUBE_CUBE;
else
wa = SAMP_CUBE_CLAMP;
}
uint32_t first_level = 0;
if (tex->base.target != PIPE_BUFFER)
first_level = tex->base.u.tex.first_level;
crocus_upload_sampler_state(batch, state, border_color_offset, wa, first_level, map);
}
map += GENX(SAMPLER_STATE_length);
}
}
/**
* The pipe->create_sampler_view() driver hook.
*/
static struct pipe_sampler_view *
crocus_create_sampler_view(struct pipe_context *ctx,
struct pipe_resource *tex,
const struct pipe_sampler_view *tmpl)
{
struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
const struct intel_device_info *devinfo = &screen->devinfo;
struct crocus_sampler_view *isv = calloc(1, sizeof(struct crocus_sampler_view));
if (!isv)
return NULL;
/* initialize base object */
isv->base = *tmpl;
isv->base.context = ctx;
isv->base.texture = NULL;
pipe_reference_init(&isv->base.reference, 1);
pipe_resource_reference(&isv->base.texture, tex);
if (util_format_is_depth_or_stencil(tmpl->format)) {
struct crocus_resource *zres, *sres;
const struct util_format_description *desc =
util_format_description(tmpl->format);
crocus_get_depth_stencil_resources(devinfo, tex, &zres, &sres);
tex = util_format_has_depth(desc) ? &zres->base.b : &sres->base.b;
if (tex->format == PIPE_FORMAT_S8_UINT)
if (devinfo->ver == 7 && sres->shadow)
tex = &sres->shadow->base.b;
}
isv->res = (struct crocus_resource *) tex;
isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT;
if (isv->base.target == PIPE_TEXTURE_CUBE ||
isv->base.target == PIPE_TEXTURE_CUBE_ARRAY)
usage |= ISL_SURF_USAGE_CUBE_BIT;
const struct crocus_format_info fmt =
crocus_format_for_usage(devinfo, tmpl->format, usage);
enum pipe_swizzle vswz[4] = { tmpl->swizzle_r, tmpl->swizzle_g, tmpl->swizzle_b, tmpl->swizzle_a };
crocus_combine_swizzle(isv->swizzle, fmt.swizzles, vswz);
/* hardcode stencil swizzles - hw returns 0G01, we want GGGG */
if (devinfo->ver < 6 &&
(tmpl->format == PIPE_FORMAT_X32_S8X24_UINT ||
tmpl->format == PIPE_FORMAT_X24S8_UINT)) {
isv->swizzle[0] = tmpl->swizzle_g;
isv->swizzle[1] = tmpl->swizzle_g;
isv->swizzle[2] = tmpl->swizzle_g;
isv->swizzle[3] = tmpl->swizzle_g;
}
isv->clear_color = isv->res->aux.clear_color;
isv->view = (struct isl_view) {
.format = fmt.fmt,
#if GFX_VERx10 >= 75
.swizzle = (struct isl_swizzle) {
.r = pipe_to_isl_swizzle(isv->swizzle[0], false),
.g = pipe_to_isl_swizzle(isv->swizzle[1], false),
.b = pipe_to_isl_swizzle(isv->swizzle[2], false),
.a = pipe_to_isl_swizzle(isv->swizzle[3], false),
},
#else
/* swizzling handled in shader code */
.swizzle = ISL_SWIZZLE_IDENTITY,
#endif
.usage = usage,
};
/* Fill out SURFACE_STATE for this view. */
if (tmpl->target != PIPE_BUFFER) {
isv->view.base_level = tmpl->u.tex.first_level;
isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1;
// XXX: do I need to port f9fd0cf4790cb2a530e75d1a2206dbb9d8af7cb2?
isv->view.base_array_layer = tmpl->u.tex.first_layer;
isv->view.array_len =
tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
}
#if GFX_VER >= 6
   /* Also create a second view struct for texture gather, in case it's needed. */
isv->gather_view = isv->view;
#if GFX_VER == 7
if (fmt.fmt == ISL_FORMAT_R32G32_FLOAT ||
fmt.fmt == ISL_FORMAT_R32G32_SINT ||
fmt.fmt == ISL_FORMAT_R32G32_UINT) {
isv->gather_view.format = ISL_FORMAT_R32G32_FLOAT_LD;
#if GFX_VERx10 >= 75
isv->gather_view.swizzle = (struct isl_swizzle) {
.r = pipe_to_isl_swizzle(isv->swizzle[0], GFX_VERx10 == 75),
.g = pipe_to_isl_swizzle(isv->swizzle[1], GFX_VERx10 == 75),
.b = pipe_to_isl_swizzle(isv->swizzle[2], GFX_VERx10 == 75),
.a = pipe_to_isl_swizzle(isv->swizzle[3], GFX_VERx10 == 75),
};
#endif
}
#endif
#if GFX_VER == 6
/* Sandybridge's gather4 message is broken for integer formats.
* To work around this, we pretend the surface is UNORM for
* 8 or 16-bit formats, and emit shader instructions to recover
* the real INT/UINT value. For 32-bit formats, we pretend
* the surface is FLOAT, and simply reinterpret the resulting
* bits.
*/
switch (fmt.fmt) {
case ISL_FORMAT_R8_SINT:
case ISL_FORMAT_R8_UINT:
isv->gather_view.format = ISL_FORMAT_R8_UNORM;
break;
case ISL_FORMAT_R16_SINT:
case ISL_FORMAT_R16_UINT:
isv->gather_view.format = ISL_FORMAT_R16_UNORM;
break;
case ISL_FORMAT_R32_SINT:
case ISL_FORMAT_R32_UINT:
isv->gather_view.format = ISL_FORMAT_R32_FLOAT;
break;
default:
break;
}
#endif
#endif
/* Fill out SURFACE_STATE for this view. */
if (tmpl->target != PIPE_BUFFER) {
if (crocus_resource_unfinished_aux_import(isv->res))
crocus_resource_finish_aux_import(&screen->base, isv->res);
}
return &isv->base;
}
static void
crocus_sampler_view_destroy(struct pipe_context *ctx,
struct pipe_sampler_view *state)
{
struct crocus_sampler_view *isv = (void *) state;
pipe_resource_reference(&state->texture, NULL);
free(isv);
}
/**
* The pipe->create_surface() driver hook.
*
* In Gallium nomenclature, "surfaces" are a view of a resource that
* can be bound as a render target or depth/stencil buffer.
*/
static struct pipe_surface *
crocus_create_surface(struct pipe_context *ctx,
struct pipe_resource *tex,
const struct pipe_surface *tmpl)
{
struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
const struct intel_device_info *devinfo = &screen->devinfo;
isl_surf_usage_flags_t usage = 0;
if (tmpl->writable)
usage = ISL_SURF_USAGE_STORAGE_BIT;
else if (util_format_is_depth_or_stencil(tmpl->format))
usage = ISL_SURF_USAGE_DEPTH_BIT;
else
usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
const struct crocus_format_info fmt =
crocus_format_for_usage(devinfo, tmpl->format, usage);
if ((usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&
!isl_format_supports_rendering(devinfo, fmt.fmt)) {
/* Framebuffer validation will reject this invalid case, but it
* hasn't had the opportunity yet. In the meantime, we need to
* avoid hitting ISL asserts about unsupported formats below.
*/
return NULL;
}
struct crocus_surface *surf = calloc(1, sizeof(struct crocus_surface));
struct pipe_surface *psurf = &surf->base;
struct crocus_resource *res = (struct crocus_resource *) tex;
if (!surf)
return NULL;
pipe_reference_init(&psurf->reference, 1);
pipe_resource_reference(&psurf->texture, tex);
psurf->context = ctx;
psurf->format = tmpl->format;
psurf->width = tex->width0;
psurf->height = tex->height0;
psurf->texture = tex;
psurf->u.tex.first_layer = tmpl->u.tex.first_layer;
psurf->u.tex.last_layer = tmpl->u.tex.last_layer;
psurf->u.tex.level = tmpl->u.tex.level;
uint32_t array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
struct isl_view *view = &surf->view;
*view = (struct isl_view) {
.format = fmt.fmt,
.base_level = tmpl->u.tex.level,
.levels = 1,
.base_array_layer = tmpl->u.tex.first_layer,
.array_len = array_len,
.swizzle = ISL_SWIZZLE_IDENTITY,
.usage = usage,
};
#if GFX_VER >= 6
struct isl_view *read_view = &surf->read_view;
*read_view = (struct isl_view) {
.format = fmt.fmt,
.base_level = tmpl->u.tex.level,
.levels = 1,
.base_array_layer = tmpl->u.tex.first_layer,
.array_len = array_len,
.swizzle = ISL_SWIZZLE_IDENTITY,
.usage = ISL_SURF_USAGE_TEXTURE_BIT,
};
#endif
surf->clear_color = res->aux.clear_color;
/* Bail early for depth/stencil - we don't want SURFACE_STATE for them. */
if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT |
ISL_SURF_USAGE_STENCIL_BIT))
return psurf;
if (!isl_format_is_compressed(res->surf.format)) {
if (crocus_resource_unfinished_aux_import(res))
crocus_resource_finish_aux_import(&screen->base, res);
memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
uint64_t temp_offset;
uint32_t temp_x, temp_y;
isl_surf_get_image_offset_B_tile_sa(&res->surf, tmpl->u.tex.level,
res->base.b.target == PIPE_TEXTURE_3D ? 0 : tmpl->u.tex.first_layer,
res->base.b.target == PIPE_TEXTURE_3D ? tmpl->u.tex.first_layer : 0,
&temp_offset, &temp_x, &temp_y);
if (!devinfo->has_surface_tile_offset &&
(temp_x || temp_y)) {
/* Original gfx4 hardware couldn't draw to a non-tile-aligned
* destination.
*/
/* move to temp */
struct pipe_resource wa_templ = (struct pipe_resource) {
.width0 = u_minify(res->base.b.width0, tmpl->u.tex.level),
.height0 = u_minify(res->base.b.height0, tmpl->u.tex.level),
.depth0 = 1,
.array_size = 1,
.format = res->base.b.format,
.target = PIPE_TEXTURE_2D,
.bind = (usage & ISL_SURF_USAGE_DEPTH_BIT ? PIPE_BIND_DEPTH_STENCIL : PIPE_BIND_RENDER_TARGET) | PIPE_BIND_SAMPLER_VIEW,
};
surf->align_res = screen->base.resource_create(&screen->base, &wa_templ);
view->base_level = 0;
view->base_array_layer = 0;
view->array_len = 1;
struct crocus_resource *align_res = (struct crocus_resource *)surf->align_res;
memcpy(&surf->surf, &align_res->surf, sizeof(surf->surf));
}
return psurf;
}
/* The resource has a compressed format, which is not renderable, but we
* have a renderable view format. We must be attempting to upload blocks
* of compressed data via an uncompressed view.
*
* In this case, we can assume there are no auxiliary buffers, a single
* miplevel, and that the resource is single-sampled. Gallium may try
* and create an uncompressed view with multiple layers, however.
*/
assert(!isl_format_is_compressed(fmt.fmt));
assert(res->surf.samples == 1);
assert(view->levels == 1);
   /* TODO: compressed pbo uploads aren't working here, so bail out early;
    * the code below is kept (currently unreachable) until that's fixed.
    */
return NULL;
uint64_t offset_B = 0;
uint32_t tile_x_sa = 0, tile_y_sa = 0;
if (view->base_level > 0) {
/* We can't rely on the hardware's miplevel selection with such
* a substantial lie about the format, so we select a single image
* using the Tile X/Y Offset fields. In this case, we can't handle
* multiple array slices.
*
* On Broadwell, HALIGN and VALIGN are specified in pixels and are
* hard-coded to align to exactly the block size of the compressed
* texture. This means that, when reinterpreted as a non-compressed
* texture, the tile offsets may be anything and we can't rely on
* X/Y Offset.
*
* Return NULL to force the state tracker to take fallback paths.
*/
// TODO: check if the gen7 check is right, originally gen8
if (view->array_len > 1 || GFX_VER == 7)
return NULL;
const bool is_3d = res->surf.dim == ISL_SURF_DIM_3D;
isl_surf_get_image_surf(&screen->isl_dev, &res->surf,
view->base_level,
is_3d ? 0 : view->base_array_layer,
is_3d ? view->base_array_layer : 0,
&surf->surf,
&offset_B, &tile_x_sa, &tile_y_sa);
/* We use address and tile offsets to access a single level/layer
* as a subimage, so reset level/layer so it doesn't offset again.
*/
view->base_array_layer = 0;
view->base_level = 0;
} else {
/* Level 0 doesn't require tile offsets, and the hardware can find
* array slices using QPitch even with the format override, so we
* can allow layers in this case. Copy the original ISL surface.
*/
memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
}
/* Scale down the image dimensions by the block size. */
const struct isl_format_layout *fmtl =
isl_format_get_layout(res->surf.format);
surf->surf.format = fmt.fmt;
surf->surf.logical_level0_px = isl_surf_get_logical_level0_el(&surf->surf);
surf->surf.phys_level0_sa = isl_surf_get_phys_level0_el(&surf->surf);
tile_x_sa /= fmtl->bw;
tile_y_sa /= fmtl->bh;
psurf->width = surf->surf.logical_level0_px.width;
psurf->height = surf->surf.logical_level0_px.height;
return psurf;
}
#if GFX_VER >= 7
static void
fill_default_image_param(struct brw_image_param *param)
{
memset(param, 0, sizeof(*param));
/* Set the swizzling shifts to all-ones to effectively disable swizzling --
* See emit_address_calculation() in brw_fs_surface_builder.cpp for a more
* detailed explanation of these parameters.
*/
param->swizzling[0] = 0xff;
param->swizzling[1] = 0xff;
}
static void
fill_buffer_image_param(struct brw_image_param *param,
enum pipe_format pfmt,
unsigned size)
{
const unsigned cpp = util_format_get_blocksize(pfmt);
fill_default_image_param(param);
param->size[0] = size / cpp;
param->stride[0] = cpp;
}
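
/* For example, a 256-byte buffer image with format
 * PIPE_FORMAT_R32G32B32A32_FLOAT (cpp == 16) is described as
 * size[0] == 16 texels with stride[0] == 16 bytes.
 */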
#endif
/**
* The pipe->set_shader_images() driver hook.
*/
static void
crocus_set_shader_images(struct pipe_context *ctx,
enum pipe_shader_type p_stage,
unsigned start_slot, unsigned count,
unsigned unbind_num_trailing_slots,
const struct pipe_image_view *p_images)
{
#if GFX_VER >= 7
struct crocus_context *ice = (struct crocus_context *) ctx;
struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
const struct intel_device_info *devinfo = &screen->devinfo;
gl_shader_stage stage = stage_from_pipe(p_stage);
struct crocus_shader_state *shs = &ice->state.shaders[stage];
struct crocus_genx_state *genx = ice->state.genx;
struct brw_image_param *image_params = genx->shaders[stage].image_param;
shs->bound_image_views &= ~u_bit_consecutive(start_slot, count);
for (unsigned i = 0; i < count; i++) {
struct crocus_image_view *iv = &shs->image[start_slot + i];
if (p_images && p_images[i].resource) {
const struct pipe_image_view *img = &p_images[i];
struct crocus_resource *res = (void *) img->resource;
util_copy_image_view(&iv->base, img);
shs->bound_image_views |= 1 << (start_slot + i);
res->bind_history |= PIPE_BIND_SHADER_IMAGE;
res->bind_stages |= 1 << stage;
isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
struct crocus_format_info fmt =
crocus_format_for_usage(devinfo, img->format, usage);
struct isl_swizzle swiz = pipe_to_isl_swizzles(fmt.swizzles);
if (img->shader_access & PIPE_IMAGE_ACCESS_READ) {
         /* On Gen8, try to use typed surface reads (which support a
* limited number of formats), and if not possible, fall back
* to untyped reads.
*/
if (!isl_has_matching_typed_storage_image_format(devinfo, fmt.fmt))
fmt.fmt = ISL_FORMAT_RAW;
else
fmt.fmt = isl_lower_storage_image_format(devinfo, fmt.fmt);
}
if (res->base.b.target != PIPE_BUFFER) {
struct isl_view view = {
.format = fmt.fmt,
.base_level = img->u.tex.level,
.levels = 1,
.base_array_layer = img->u.tex.first_layer,
.array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1,
.swizzle = swiz,
.usage = usage,
};
iv->view = view;
isl_surf_fill_image_param(&screen->isl_dev,
&image_params[start_slot + i],
&res->surf, &view);
} else {
struct isl_view view = {
.format = fmt.fmt,
.swizzle = swiz,
.usage = usage,
};
iv->view = view;
util_range_add(&res->base.b, &res->valid_buffer_range, img->u.buf.offset,
img->u.buf.offset + img->u.buf.size);
fill_buffer_image_param(&image_params[start_slot + i],
img->format, img->u.buf.size);
}
} else {
pipe_resource_reference(&iv->base.resource, NULL);
fill_default_image_param(&image_params[start_slot + i]);
}
}
ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
ice->state.dirty |=
stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
: CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
/* Broadwell also needs brw_image_params re-uploaded */
ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
shs->sysvals_need_upload = true;
#endif
}
/**
* The pipe->set_sampler_views() driver hook.
*/
static void
crocus_set_sampler_views(struct pipe_context *ctx,
enum pipe_shader_type p_stage,
unsigned start, unsigned count,
unsigned unbind_num_trailing_slots,
bool take_ownership,
struct pipe_sampler_view **views)
{
struct crocus_context *ice = (struct crocus_context *) ctx;
gl_shader_stage stage = stage_from_pipe(p_stage);
struct crocus_shader_state *shs = &ice->state.shaders[stage];
shs->bound_sampler_views &= ~u_bit_consecutive(start, count);
for (unsigned i = 0; i < count; i++) {
struct pipe_sampler_view *pview = views ? views[i] : NULL;
if (take_ownership) {
pipe_sampler_view_reference((struct pipe_sampler_view **)
&shs->textures[start + i], NULL);
shs->textures[start + i] = (struct crocus_sampler_view *)pview;
} else {
pipe_sampler_view_reference((struct pipe_sampler_view **)
&shs->textures[start + i], pview);
}
struct crocus_sampler_view *view = (void *) pview;
if (view) {
view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW;
view->res->bind_stages |= 1 << stage;
shs->bound_sampler_views |= 1 << (start + i);
}
}
#if GFX_VER == 6
   /* The first_level parameter to crocus_upload_sampler_state is gfx6-only. */
ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
#endif
ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage);
ice->state.dirty |=
stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
: CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
}
/**
* The pipe->set_tess_state() driver hook.
*/
static void
crocus_set_tess_state(struct pipe_context *ctx,
const float default_outer_level[4],
const float default_inner_level[2])
{
struct crocus_context *ice = (struct crocus_context *) ctx;
struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];
memcpy(&ice->state.default_outer_level[0], &default_outer_level[0], 4 * sizeof(float));
memcpy(&ice->state.default_inner_level[0], &default_inner_level[0], 2 * sizeof(float));
ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
shs->sysvals_need_upload = true;
}
static void
crocus_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices)
{
struct crocus_context *ice = (struct crocus_context *) ctx;
ice->state.patch_vertices = patch_vertices;
}
static void
crocus_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)
{
struct crocus_surface *surf = (void *) p_surf;
pipe_resource_reference(&p_surf->texture, NULL);
pipe_resource_reference(&surf->align_res, NULL);
free(surf);
}
static void
crocus_set_clip_state(struct pipe_context *ctx,
const struct pipe_clip_state *state)
{
struct crocus_context *ice = (struct crocus_context *) ctx;
struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
struct crocus_shader_state *gshs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
struct crocus_shader_state *tshs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];
memcpy(&ice->state.clip_planes, state, sizeof(*state));
#if GFX_VER <= 5
ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
#endif
ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS | CROCUS_STAGE_DIRTY_CONSTANTS_GS |
CROCUS_STAGE_DIRTY_CONSTANTS_TES;
shs->sysvals_need_upload = true;
gshs->sysvals_need_upload = true;
tshs->sysvals_need_upload = true;
}
/**
* The pipe->set_polygon_stipple() driver hook.
*/
static void
crocus_set_polygon_stipple(struct pipe_context *ctx,
const struct pipe_poly_stipple *state)
{
struct crocus_context *ice = (struct crocus_context *) ctx;
memcpy(&ice->state.poly_stipple, state, sizeof(*state));
ice->state.dirty |= CROCUS_DIRTY_POLYGON_STIPPLE;
}
/**
* The pipe->set_sample_mask() driver hook.
*/
static void
crocus_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
{
struct crocus_context *ice = (struct crocus_context *) ctx;
   /* We only support 8x MSAA, so we have 8 bits of sample mask.
* st/mesa may pass us 0xffffffff though, meaning "enable all samples".
*/
ice->state.sample_mask = sample_mask & 0xff;
ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
}
static void
crocus_fill_scissor_rect(struct crocus_context *ice,
int idx,
struct pipe_scissor_state *ss)
{
struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
const struct pipe_viewport_state *vp = &ice->state.viewports[idx];
struct pipe_scissor_state scissor = (struct pipe_scissor_state) {
.minx = MAX2(-fabsf(vp->scale[0]) + vp->translate[0], 0),
.maxx = MIN2( fabsf(vp->scale[0]) + vp->translate[0], cso_fb->width) - 1,
.miny = MAX2(-fabsf(vp->scale[1]) + vp->translate[1], 0),
.maxy = MIN2( fabsf(vp->scale[1]) + vp->translate[1], cso_fb->height) - 1,
};
if (cso_state->scissor) {
struct pipe_scissor_state *s = &ice->state.scissors[idx];
scissor.minx = MAX2(scissor.minx, s->minx);
scissor.miny = MAX2(scissor.miny, s->miny);
scissor.maxx = MIN2(scissor.maxx, s->maxx);
scissor.maxy = MIN2(scissor.maxy, s->maxy);
}
*ss = scissor;
}
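
/* Worked example: a full 640x480 viewport has scale = (320, -240, ...) and
 * translate = (320, 240, ...), so the viewport-derived rectangle above is
 *
 *    minx = max(-320 + 320, 0) = 0,  maxx = min(320 + 320, 640) - 1 = 639,
 *    miny = max(-240 + 240, 0) = 0,  maxy = min(240 + 240, 480) - 1 = 479,
 *
 * which an enabled pipe scissor can then only shrink further.
 */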
/**
* The pipe->set_scissor_states() driver hook.
*
* This corresponds to our SCISSOR_RECT state structures. It's an
* exact match, so we just store them, and memcpy them out later.
*/
static void
crocus_set_scissor_states(struct pipe_context *ctx,
unsigned start_slot,
unsigned num_scissors,
const struct pipe_scissor_state *rects)
{
struct crocus_context *ice = (struct crocus_context *) ctx;
for (unsigned i = 0; i < num_scissors; i++) {
if (rects[i].minx == rects[i].maxx || rects[i].miny == rects[i].maxy) {
/* If the scissor was out of bounds and got clamped to 0 width/height
* at the bounds, the subtraction of 1 from maximums could produce a
* negative number and thus not clip anything. Instead, just provide
* a min > max scissor inside the bounds, which produces the expected
* no rendering.
*/
ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
.minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
};
} else {
ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
.minx = rects[i].minx, .miny = rects[i].miny,
.maxx = rects[i].maxx - 1, .maxy = rects[i].maxy - 1,
};
}
}
#if GFX_VER < 6
ice->state.dirty |= CROCUS_DIRTY_RASTER; /* SF state */
#else
ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
#endif
ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
}
/**
* The pipe->set_stencil_ref() driver hook.
*
* This is added to 3DSTATE_WM_DEPTH_STENCIL dynamically at draw time.
*/
static void
crocus_set_stencil_ref(struct pipe_context *ctx,
const struct pipe_stencil_ref ref)
{
struct crocus_context *ice = (struct crocus_context *) ctx;
ice->state.stencil_ref = ref;
ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
}
#if GFX_VER == 8
static float
viewport_extent(const struct pipe_viewport_state *state, int axis, float sign)
{
return copysignf(state->scale[axis], sign) + state->translate[axis];
}
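
/* e.g. with scale[0] = 320 and translate[0] = 320 (a 640-wide viewport),
 * viewport_extent(state, 0, -1.0f) = 0.0f and
 * viewport_extent(state, 0,  1.0f) = 640.0f -- the two X edges.
 */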
#endif
/**
* The pipe->set_viewport_states() driver hook.
*
* This corresponds to our SF_CLIP_VIEWPORT states. We can't calculate
* the guardband yet, as we need the framebuffer dimensions, but we can
* at least fill out the rest.
*/
static void
crocus_set_viewport_states(struct pipe_context *ctx,
unsigned start_slot,
unsigned count,
const struct pipe_viewport_state *states)
{
struct crocus_context *ice = (struct crocus_context *) ctx;
memcpy(&ice->state.viewports[start_slot], states, sizeof(*states) * count);
ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
ice->state.dirty |= CROCUS_DIRTY_RASTER;
#if GFX_VER >= 6
ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
#endif
if (ice->state.cso_rast && (!ice->state.cso_rast->cso.depth_clip_near ||
!ice->state.cso_rast->cso.depth_clip_far))
ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
}
/**
* The pipe->set_framebuffer_state() driver hook.
*
* Sets the current draw FBO, including color render targets, depth,
* and stencil buffers.
*/
static void
crocus_set_framebuffer_state(struct pipe_context *ctx,
const struct pipe_framebuffer_state *state)
{
struct crocus_context *ice = (struct crocus_context *) ctx;
struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
const struct intel_device_info *devinfo = &screen->devinfo;
#if 0
struct isl_device *isl_dev = &screen->isl_dev;
struct crocus_resource *zres;
struct crocus_resource *stencil_res;
#endif
unsigned samples = util_framebuffer_get_num_samples(state);
unsigned layers = util_framebuffer_get_num_layers(state);
#if GFX_VER >= 6
if (cso->samples != samples) {
ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
ice->state.dirty |= CROCUS_DIRTY_RASTER;
#if GFX_VERx10 == 75
ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
#endif
}
#endif
#if GFX_VER >= 6 && GFX_VER < 8
ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
#endif
if ((cso->layers == 0) != (layers == 0)) {
ice->state.dirty |= CROCUS_DIRTY_CLIP;
}
if (cso->width != state->width || cso->height != state->height) {
ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
ice->state.dirty |= CROCUS_DIRTY_RASTER;
ice->state.dirty |= CROCUS_DIRTY_DRAWING_RECTANGLE;
#if GFX_VER >= 6
ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
#endif
}
if (cso->zsbuf || state->zsbuf) {
ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER;
/* update SF's depth buffer format */
if (GFX_VER == 7 && cso->zsbuf)
ice->state.dirty |= CROCUS_DIRTY_RASTER;
}
/* wm thread dispatch enable */
ice->state.dirty |= CROCUS_DIRTY_WM;
util_copy_framebuffer_state(cso, state);
cso->samples = samples;
cso->layers = layers;
if (cso->zsbuf) {
struct crocus_resource *zres;
struct crocus_resource *stencil_res;
enum isl_aux_usage aux_usage = ISL_AUX_USAGE_NONE;
crocus_get_depth_stencil_resources(devinfo, cso->zsbuf->texture, &zres,
&stencil_res);
if (zres && crocus_resource_level_has_hiz(zres, cso->zsbuf->u.tex.level)) {
aux_usage = zres->aux.usage;
}
ice->state.hiz_usage = aux_usage;
}
/* Render target change */
ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;
ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_FRAMEBUFFER];
}
/**
* The pipe->set_constant_buffer() driver hook.
*
* This uploads any constant data in user buffers, and references
* any UBO resources containing constant data.
*/
static void
crocus_set_constant_buffer(struct pipe_context *ctx,
enum pipe_shader_type p_stage, unsigned index,
bool take_ownership,
const struct pipe_constant_buffer *input)
{
struct crocus_context *ice = (struct crocus_context *) ctx;
gl_shader_stage stage = stage_from_pipe(p_stage);
struct crocus_shader_state *shs = &ice->state.shaders[stage];
struct pipe_constant_buffer *cbuf = &shs->constbufs[index];
util_copy_constant_buffer(&shs->constbufs[index], input, take_ownership);
if (input && input->buffer_size && (input->buffer || input->user_buffer)) {
shs->bound_cbufs |= 1u << index;
if (input->user_buffer) {
void *map = NULL;
pipe_resource_reference(&cbuf->buffer, NULL);
u_upload_alloc(ice->ctx.const_uploader, 0, input->buffer_size, 64,
&cbuf->buffer_offset, &cbuf->buffer, (void **) &map);
if (!cbuf->buffer) {
/* Allocation was unsuccessful - just unbind */
crocus_set_constant_buffer(ctx, p_stage, index, false, NULL);
return;
}
assert(map);
memcpy(map, input->user_buffer, input->buffer_size);
}
cbuf->buffer_size =
MIN2(input->buffer_size,
crocus_resource_bo(cbuf->buffer)->size - cbuf->buffer_offset);
struct crocus_resource *res = (void *) cbuf->buffer;
res->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
res->bind_stages |= 1 << stage;
} else {
shs->bound_cbufs &= ~(1u << index);
}
ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
}
static void
upload_sysvals(struct crocus_context *ice,
gl_shader_stage stage)
{
UNUSED struct crocus_genx_state *genx = ice->state.genx;
struct crocus_shader_state *shs = &ice->state.shaders[stage];
struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
if (!shader || shader->num_system_values == 0)
return;
assert(shader->num_cbufs > 0);
unsigned sysval_cbuf_index = shader->num_cbufs - 1;
struct pipe_constant_buffer *cbuf = &shs->constbufs[sysval_cbuf_index];
unsigned upload_size = shader->num_system_values * sizeof(uint32_t);
uint32_t *map = NULL;
assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS);
u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64,
&cbuf->buffer_offset, &cbuf->buffer, (void **) &map);
for (int i = 0; i < shader->num_system_values; i++) {
uint32_t sysval = shader->system_values[i];
uint32_t value = 0;
if (BRW_PARAM_DOMAIN(sysval) == BRW_PARAM_DOMAIN_IMAGE) {
#if GFX_VER >= 7
unsigned img = BRW_PARAM_IMAGE_IDX(sysval);
unsigned offset = BRW_PARAM_IMAGE_OFFSET(sysval);
struct brw_image_param *param =
&genx->shaders[stage].image_param[img];
assert(offset < sizeof(struct brw_image_param));
value = ((uint32_t *) param)[offset];
#endif
} else if (sysval == BRW_PARAM_BUILTIN_ZERO) {
value = 0;
} else if (BRW_PARAM_BUILTIN_IS_CLIP_PLANE(sysval)) {
int plane = BRW_PARAM_BUILTIN_CLIP_PLANE_IDX(sysval);
int comp = BRW_PARAM_BUILTIN_CLIP_PLANE_COMP(sysval);
value = fui(ice->state.clip_planes.ucp[plane][comp]);
} else if (sysval == BRW_PARAM_BUILTIN_PATCH_VERTICES_IN) {
if (stage == MESA_SHADER_TESS_CTRL) {
            value = ice->state.patch_vertices;
} else {
assert(stage == MESA_SHADER_TESS_EVAL);
const struct shader_info *tcs_info =
crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
if (tcs_info)
value = tcs_info->tess.tcs_vertices_out;
else
               value = ice->state.patch_vertices;
}
} else if (sysval >= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X &&
sysval <= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_W) {
unsigned i = sysval - BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X;
value = fui(ice->state.default_outer_level[i]);
} else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X) {
value = fui(ice->state.default_inner_level[0]);
} else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) {
value = fui(ice->state.default_inner_level[1]);
} else if (sysval >= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X &&
sysval <= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) {
unsigned i = sysval - BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X;
value = ice->state.last_block[i];
} else {
assert(!"unhandled system value");
}
*map++ = value;
}
cbuf->buffer_size = upload_size;
shs->sysvals_need_upload = false;
}
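
/* Illustrative layout: the compiler appends the sysval buffer as the last
 * constant buffer, so a shader with num_cbufs == 3 receives the values
 * uploaded above through cbuf slot 2 (sysval_cbuf_index == 3 - 1).
 */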
/**
* The pipe->set_shader_buffers() driver hook.
*
* This binds SSBOs and ABOs. Unfortunately, we need to stream out
* SURFACE_STATE here, as the buffer offset may change each time.
*/
static void
crocus_set_shader_buffers(struct pipe_context *ctx,
enum pipe_shader_type p_stage,
unsigned start_slot, unsigned count,
const struct pipe_shader_buffer *buffers,
unsigned writable_bitmask)
{
struct crocus_context *ice = (struct crocus_context *) ctx;
gl_shader_stage stage = stage_from_pipe(p_stage);
struct crocus_shader_state *shs = &ice->state.shaders[stage];
unsigned modified_bits = u_bit_consecutive(start_slot, count);
shs->bound_ssbos &= ~modified_bits;
shs->writable_ssbos &= ~modified_bits;
shs->writable_ssbos |= writable_bitmask << start_slot;
for (unsigned i = 0; i < count; i++) {
if (buffers && buffers[i].buffer) {
struct crocus_resource *res = (void *) buffers[i].buffer;
struct pipe_shader_buffer *ssbo = &shs->ssbo[start_slot + i];
pipe_resource_reference(&ssbo->buffer, &res->base.b);
ssbo->buffer_offset = buffers[i].buffer_offset;
ssbo->buffer_size =
MIN2(buffers[i].buffer_size, res->bo->size - ssbo->buffer_offset);
shs->bound_ssbos |= 1 << (start_slot + i);
res->bind_history |= PIPE_BIND_SHADER_BUFFER;
res->bind_stages |= 1 << stage;
util_range_add(&res->base.b, &res->valid_buffer_range, ssbo->buffer_offset,
ssbo->buffer_offset + ssbo->buffer_size);
} else {
pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL);
}
}
ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
}
static void
crocus_delete_state(struct pipe_context *ctx, void *state)
{
free(state);
}
/**
* The pipe->set_vertex_buffers() driver hook.
*
* This translates pipe_vertex_buffer to our 3DSTATE_VERTEX_BUFFERS packet.
*/
static void
crocus_set_vertex_buffers(struct pipe_context *ctx,
unsigned start_slot, unsigned count,
unsigned unbind_num_trailing_slots,
bool take_ownership,
const struct pipe_vertex_buffer *buffers)
{
struct crocus_context *ice = (struct crocus_context *) ctx;
struct crocus_screen *screen = (struct crocus_screen *) ctx->screen;
const unsigned padding =
(GFX_VERx10 < 75 && !screen->devinfo.is_baytrail) * 2;
ice->state.bound_vertex_buffers &=
~u_bit_consecutive64(start_slot, count + unbind_num_trailing_slots);
util_set_vertex_buffers_mask(ice->state.vertex_buffers, &ice->state.bound_vertex_buffers,
buffers, start_slot, count, unbind_num_trailing_slots,
take_ownership);
for (unsigned i = 0; i < count; i++) {
struct pipe_vertex_buffer *state =
&ice->state.vertex_buffers[start_slot + i];
if (!state->is_user_buffer && state->buffer.resource) {
struct crocus_resource *res = (void *)state->buffer.resource;
res->bind_history |= PIPE_BIND_VERTEX_BUFFER;
}
uint32_t end = 0;
if (state->buffer.resource)
end = state->buffer.resource->width0 + padding;
ice->state.vb_end[start_slot + i] = end;
}
ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
}
#if GFX_VERx10 < 75
static uint8_t get_wa_flags(enum isl_format format)
{
uint8_t wa_flags = 0;
switch (format) {
case ISL_FORMAT_R10G10B10A2_USCALED:
wa_flags = BRW_ATTRIB_WA_SCALE;
break;
case ISL_FORMAT_R10G10B10A2_SSCALED:
wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_SCALE;
break;
case ISL_FORMAT_R10G10B10A2_UNORM:
wa_flags = BRW_ATTRIB_WA_NORMALIZE;
break;
case ISL_FORMAT_R10G10B10A2_SNORM:
wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_NORMALIZE;
break;
case ISL_FORMAT_R10G10B10A2_SINT:
wa_flags = BRW_ATTRIB_WA_SIGN;
break;
case ISL_FORMAT_B10G10R10A2_USCALED:
wa_flags = BRW_ATTRIB_WA_SCALE | BRW_ATTRIB_WA_BGRA;
break;
case ISL_FORMAT_B10G10R10A2_SSCALED:
wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_SCALE | BRW_ATTRIB_WA_BGRA;
break;
case ISL_FORMAT_B10G10R10A2_UNORM:
wa_flags = BRW_ATTRIB_WA_NORMALIZE | BRW_ATTRIB_WA_BGRA;
break;
case ISL_FORMAT_B10G10R10A2_SNORM:
wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_NORMALIZE | BRW_ATTRIB_WA_BGRA;
break;
case ISL_FORMAT_B10G10R10A2_SINT:
wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_BGRA;
break;
case ISL_FORMAT_B10G10R10A2_UINT:
wa_flags = BRW_ATTRIB_WA_BGRA;
break;
default:
break;
}
return wa_flags;
}
#endif
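/* A note on the table above (pre-Haswell only): the vertex fetcher can't
* natively decode these 10_10_10_2 and SCALED formats, so the elements are
* fetched as plain integers (see the actual_fmt overrides below) and the
* wa_flags tell the VS compiler which fixups to apply in the shader:
* sign extension, normalization, scale-to-float conversion, and/or a BGRA
* channel swizzle. See crocus_populate_vs_key(), which copies these flags
* into the VS program key.
*/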
/**
* Gallium CSO for vertex elements.
*/
struct crocus_vertex_element_state {
uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
#if GFX_VER == 8
uint32_t vf_instancing[33 * GENX(3DSTATE_VF_INSTANCING_length)];
#endif
uint32_t edgeflag_ve[GENX(VERTEX_ELEMENT_STATE_length)];
#if GFX_VER == 8
uint32_t edgeflag_vfi[GENX(3DSTATE_VF_INSTANCING_length)];
#endif
uint32_t step_rate[16];
uint8_t wa_flags[33];
unsigned count;
};
/**
* The pipe->create_vertex_elements() driver hook.
*
* This translates pipe_vertex_element to our 3DSTATE_VERTEX_ELEMENTS
* and 3DSTATE_VF_INSTANCING commands. The vertex_elements and vf_instancing
* arrays are ready to be emitted at draw time as long as no EdgeFlag or
* SGVs are needed; when they are, we need information that's only
* available at draw time. We set up edgeflag_ve and edgeflag_vfi as
* alternative last 3DSTATE_VERTEX_ELEMENT and 3DSTATE_VF_INSTANCING
* entries to be used at draw time if we detect that the Vertex Shader
* needs EdgeFlag.
*/
static void *
crocus_create_vertex_elements(struct pipe_context *ctx,
unsigned count,
const struct pipe_vertex_element *state)
{
struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
const struct intel_device_info *devinfo = &screen->devinfo;
struct crocus_vertex_element_state *cso =
malloc(sizeof(struct crocus_vertex_element_state));
cso->count = count;
crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) {
ve.DWordLength =
1 + GENX(VERTEX_ELEMENT_STATE_length) * MAX2(count, 1) - 2;
}
uint32_t *ve_pack_dest = &cso->vertex_elements[1];
#if GFX_VER == 8
uint32_t *vfi_pack_dest = cso->vf_instancing;
#endif
if (count == 0) {
crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
ve.Valid = true;
ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
ve.Component0Control = VFCOMP_STORE_0;
ve.Component1Control = VFCOMP_STORE_0;
ve.Component2Control = VFCOMP_STORE_0;
ve.Component3Control = VFCOMP_STORE_1_FP;
}
#if GFX_VER == 8
crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
}
#endif
}
for (int i = 0; i < count; i++) {
const struct crocus_format_info fmt =
crocus_format_for_usage(devinfo, state[i].src_format, 0);
unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC,
VFCOMP_STORE_SRC, VFCOMP_STORE_SRC };
enum isl_format actual_fmt = fmt.fmt;
#if GFX_VERx10 < 75
cso->wa_flags[i] = get_wa_flags(fmt.fmt);
if (fmt.fmt == ISL_FORMAT_R10G10B10A2_USCALED ||
fmt.fmt == ISL_FORMAT_R10G10B10A2_SSCALED ||
fmt.fmt == ISL_FORMAT_R10G10B10A2_UNORM ||
fmt.fmt == ISL_FORMAT_R10G10B10A2_SNORM ||
fmt.fmt == ISL_FORMAT_R10G10B10A2_SINT ||
fmt.fmt == ISL_FORMAT_B10G10R10A2_USCALED ||
fmt.fmt == ISL_FORMAT_B10G10R10A2_SSCALED ||
fmt.fmt == ISL_FORMAT_B10G10R10A2_UNORM ||
fmt.fmt == ISL_FORMAT_B10G10R10A2_SNORM ||
fmt.fmt == ISL_FORMAT_B10G10R10A2_UINT ||
fmt.fmt == ISL_FORMAT_B10G10R10A2_SINT)
actual_fmt = ISL_FORMAT_R10G10B10A2_UINT;
if (fmt.fmt == ISL_FORMAT_R8G8B8_SINT)
actual_fmt = ISL_FORMAT_R8G8B8A8_SINT;
if (fmt.fmt == ISL_FORMAT_R8G8B8_UINT)
actual_fmt = ISL_FORMAT_R8G8B8A8_UINT;
if (fmt.fmt == ISL_FORMAT_R16G16B16_SINT)
actual_fmt = ISL_FORMAT_R16G16B16A16_SINT;
if (fmt.fmt == ISL_FORMAT_R16G16B16_UINT)
actual_fmt = ISL_FORMAT_R16G16B16A16_UINT;
#endif
cso->step_rate[state[i].vertex_buffer_index] = state[i].instance_divisor;
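/* The switch below fills the element's unused components with 0s and a
* final 1 in the type-appropriate encoding. E.g. a two-channel format
* such as R32G32_FLOAT enters at `case 2` and falls through, yielding
* comp = { STORE_SRC, STORE_SRC, STORE_0, STORE_1_FP }.
*/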
switch (isl_format_get_num_channels(fmt.fmt)) {
case 0: comp[0] = VFCOMP_STORE_0; FALLTHROUGH;
case 1: comp[1] = VFCOMP_STORE_0; FALLTHROUGH;
case 2: comp[2] = VFCOMP_STORE_0; FALLTHROUGH;
case 3:
comp[3] = isl_format_has_int_channel(fmt.fmt) ? VFCOMP_STORE_1_INT
: VFCOMP_STORE_1_FP;
break;
}
crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
#if GFX_VER >= 6
ve.EdgeFlagEnable = false;
#endif
ve.VertexBufferIndex = state[i].vertex_buffer_index;
ve.Valid = true;
ve.SourceElementOffset = state[i].src_offset;
ve.SourceElementFormat = actual_fmt;
ve.Component0Control = comp[0];
ve.Component1Control = comp[1];
ve.Component2Control = comp[2];
ve.Component3Control = comp[3];
#if GFX_VER < 5
ve.DestinationElementOffset = i * 4;
#endif
}
#if GFX_VER == 8
crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
vi.VertexElementIndex = i;
vi.InstancingEnable = state[i].instance_divisor > 0;
vi.InstanceDataStepRate = state[i].instance_divisor;
}
#endif
ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
#if GFX_VER == 8
vfi_pack_dest += GENX(3DSTATE_VF_INSTANCING_length);
#endif
}
/* An alternative version of the last VE and VFI is stored so it
* can be used at draw time if the Vertex Shader turns out to use EdgeFlag.
*/
if (count) {
const unsigned edgeflag_index = count - 1;
const struct crocus_format_info fmt =
crocus_format_for_usage(devinfo, state[edgeflag_index].src_format, 0);
crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), cso->edgeflag_ve, ve) {
#if GFX_VER >= 6
ve.EdgeFlagEnable = true;
#endif
ve.VertexBufferIndex = state[edgeflag_index].vertex_buffer_index;
ve.Valid = true;
ve.SourceElementOffset = state[edgeflag_index].src_offset;
ve.SourceElementFormat = fmt.fmt;
ve.Component0Control = VFCOMP_STORE_SRC;
ve.Component1Control = VFCOMP_STORE_0;
ve.Component2Control = VFCOMP_STORE_0;
ve.Component3Control = VFCOMP_STORE_0;
}
#if GFX_VER == 8
crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), cso->edgeflag_vfi, vi) {
/* The vi.VertexElementIndex of the EdgeFlag Vertex Element is filled
* at draw time, as it should change if SGVs are emitted.
*/
vi.InstancingEnable = state[edgeflag_index].instance_divisor > 0;
vi.InstanceDataStepRate = state[edgeflag_index].instance_divisor;
}
#endif
}
return cso;
}
/**
* The pipe->bind_vertex_elements_state() driver hook.
*/
static void
crocus_bind_vertex_elements_state(struct pipe_context *ctx, void *state)
{
struct crocus_context *ice = (struct crocus_context *) ctx;
#if GFX_VER == 8
struct crocus_vertex_element_state *old_cso = ice->state.cso_vertex_elements;
struct crocus_vertex_element_state *new_cso = state;
if (new_cso && cso_changed(count))
ice->state.dirty |= CROCUS_DIRTY_GEN8_VF_SGVS;
#endif
ice->state.cso_vertex_elements = state;
ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_VERTEX_ELEMENTS];
}
#if GFX_VER >= 6
struct crocus_streamout_counter {
uint32_t offset_start;
uint32_t offset_end;
uint64_t accum;
};
/**
* Gallium CSO for stream output (transform feedback) targets.
*/
struct crocus_stream_output_target {
struct pipe_stream_output_target base;
/** Stride (bytes-per-vertex) during this transform feedback operation */
uint16_t stride;
/** Has 3DSTATE_SO_BUFFER actually been emitted, zeroing the offsets? */
bool zeroed;
struct crocus_resource *offset_res;
uint32_t offset_offset;
#if GFX_VER == 6
void *prim_map;
struct crocus_streamout_counter prev_count;
struct crocus_streamout_counter count;
#endif
#if GFX_VER == 8
/** Does the next 3DSTATE_SO_BUFFER need to zero the offsets? */
bool zero_offset;
#endif
};
#if GFX_VER >= 7
static uint32_t
crocus_get_so_offset(struct pipe_stream_output_target *so)
{
struct crocus_stream_output_target *tgt = (void *)so;
struct pipe_transfer *transfer;
struct pipe_box box;
uint32_t result;
u_box_1d(tgt->offset_offset, 4, &box);
void *val = so->context->buffer_map(so->context, &tgt->offset_res->base.b,
0, PIPE_MAP_DIRECTLY,
&box, &transfer);
assert(val);
result = *(uint32_t *)val;
so->context->buffer_unmap(so->context, transfer);
return result / tgt->stride;
}
#endif
#if GFX_VER == 6
static void
compute_vertices_written_so_far(struct crocus_context *ice,
struct crocus_stream_output_target *tgt,
struct crocus_streamout_counter *count,
uint64_t *svbi);
static uint32_t
crocus_get_so_offset(struct pipe_stream_output_target *so)
{
struct crocus_stream_output_target *tgt = (void *)so;
struct crocus_context *ice = (void *)so->context;
uint64_t vert_written;
compute_vertices_written_so_far(ice, tgt, &tgt->prev_count, &vert_written);
return vert_written;
}
#endif
/**
* The pipe->create_stream_output_target() driver hook.
*
* "Target" here refers to a destination buffer. We translate this into
* a 3DSTATE_SO_BUFFER packet. We can handle most fields, but don't yet
* know which buffer this represents, or whether we ought to zero the
* write-offsets, or append. Those are handled in the set() hook.
*/
static struct pipe_stream_output_target *
crocus_create_stream_output_target(struct pipe_context *ctx,
struct pipe_resource *p_res,
unsigned buffer_offset,
unsigned buffer_size)
{
struct crocus_resource *res = (void *) p_res;
struct crocus_stream_output_target *cso = calloc(1, sizeof(*cso));
if (!cso)
return NULL;
res->bind_history |= PIPE_BIND_STREAM_OUTPUT;
pipe_reference_init(&cso->base.reference, 1);
pipe_resource_reference(&cso->base.buffer, p_res);
cso->base.buffer_offset = buffer_offset;
cso->base.buffer_size = buffer_size;
cso->base.context = ctx;
util_range_add(&res->base.b, &res->valid_buffer_range, buffer_offset,
buffer_offset + buffer_size);
#if GFX_VER >= 7
struct crocus_context *ice = (struct crocus_context *) ctx;
void *temp;
u_upload_alloc(ice->ctx.stream_uploader, 0, sizeof(uint32_t), 4,
&cso->offset_offset,
(struct pipe_resource **)&cso->offset_res,
&temp);
#endif
return &cso->base;
}
static void
crocus_stream_output_target_destroy(struct pipe_context *ctx,
struct pipe_stream_output_target *state)
{
struct crocus_stream_output_target *cso = (void *) state;
pipe_resource_reference((struct pipe_resource **)&cso->offset_res, NULL);
pipe_resource_reference(&cso->base.buffer, NULL);
free(cso);
}
#define GEN6_SO_NUM_PRIMS_WRITTEN 0x2288
#define GEN7_SO_WRITE_OFFSET(n) (0x5280 + (n) * 4)
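/* These are MMIO register addresses, accessed from the command streamer
* via the MI_STORE_REGISTER_MEM / MI_LOAD_REGISTER_* helpers below. On
* Gen7+, SO_WRITE_OFFSET(n) holds the hardware's current byte offset into
* streamout buffer n, which is why crocus_get_so_offset() above divides
* the saved value by the per-vertex stride to recover a vertex count.
*/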
#if GFX_VER == 6
static void
aggregate_stream_counter(struct crocus_batch *batch, struct crocus_stream_output_target *tgt,
struct crocus_streamout_counter *counter)
{
uint64_t *prim_counts = tgt->prim_map;
if (crocus_batch_references(batch, tgt->offset_res->bo)) {
struct pipe_fence_handle *out_fence = NULL;
batch->ice->ctx.flush(&batch->ice->ctx, &out_fence, 0);
batch->screen->base.fence_finish(&batch->screen->base, &batch->ice->ctx, out_fence, UINT64_MAX);
batch->screen->base.fence_reference(&batch->screen->base, &out_fence, NULL);
}
for (unsigned i = counter->offset_start / sizeof(uint64_t); i < counter->offset_end / sizeof(uint64_t); i += 2) {
counter->accum += prim_counts[i + 1] - prim_counts[i];
}
tgt->count.offset_start = tgt->count.offset_end = 0;
}
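/* A sketch of the scheme: crocus_stream_store_prims_written() below stores
* a snapshot of SO_NUM_PRIMS_WRITTEN into the prim_map buffer whenever
* streamout targets are bound or unbound, so the buffer holds (begin, end)
* pairs of 64-bit counts. aggregate_stream_counter() walks those pairs two
* entries at a time and accumulates end - begin, i.e. the primitives
* written while each interval was active.
*/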
static void
crocus_stream_store_prims_written(struct crocus_batch *batch,
struct crocus_stream_output_target *tgt)
{
if (!tgt->offset_res) {
u_upload_alloc(batch->ice->ctx.stream_uploader, 0, 4096, 4,
&tgt->offset_offset,
(struct pipe_resource **)&tgt->offset_res,
&tgt->prim_map);
tgt->count.offset_start = tgt->count.offset_end = 0;
}
if (tgt->count.offset_end + 16 >= 4096) {
aggregate_stream_counter(batch, tgt, &tgt->prev_count);
aggregate_stream_counter(batch, tgt, &tgt->count);
}
crocus_emit_mi_flush(batch);
crocus_store_register_mem64(batch, GEN6_SO_NUM_PRIMS_WRITTEN,
tgt->offset_res->bo,
tgt->count.offset_end + tgt->offset_offset, false);
tgt->count.offset_end += 8;
}
static void
compute_vertices_written_so_far(struct crocus_context *ice,
struct crocus_stream_output_target *tgt,
struct crocus_streamout_counter *counter,
uint64_t *svbi)
{
//TODO vertices per prim
aggregate_stream_counter(&ice->batches[0], tgt, counter);
*svbi = counter->accum * ice->state.last_xfb_verts_per_prim;
}
#endif
/**
* The pipe->set_stream_output_targets() driver hook.
*
* At this point, we know which targets are bound to a particular index,
* and also whether we want to append or start over. We can finish the
* 3DSTATE_SO_BUFFER packets we started earlier.
*/
static void
crocus_set_stream_output_targets(struct pipe_context *ctx,
unsigned num_targets,
struct pipe_stream_output_target **targets,
const unsigned *offsets)
{
struct crocus_context *ice = (struct crocus_context *) ctx;
struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
struct pipe_stream_output_target *old_tgt[4] = { NULL, NULL, NULL, NULL };
const bool active = num_targets > 0;
if (ice->state.streamout_active != active) {
ice->state.streamout_active = active;
#if GFX_VER >= 7
ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
#else
ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
#endif
/* We only emit 3DSTATE_SO_DECL_LIST when streamout is active, because
* it's a non-pipelined command. If we're switching streamout on, we
* may have missed emitting it earlier, so do so now. (We're already
* taking a stall to update 3DSTATE_SO_BUFFERS anyway...)
*/
if (active) {
#if GFX_VER >= 7
ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
#endif
} else {
uint32_t flush = 0;
for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
struct crocus_stream_output_target *tgt =
(void *) ice->state.so_target[i];
if (tgt) {
struct crocus_resource *res = (void *) tgt->base.buffer;
flush |= crocus_flush_bits_for_history(res);
crocus_dirty_for_history(ice, res);
}
}
crocus_emit_pipe_control_flush(&ice->batches[CROCUS_BATCH_RENDER],
"make streamout results visible", flush);
}
}
ice->state.so_targets = num_targets;
for (int i = 0; i < 4; i++) {
pipe_so_target_reference(&old_tgt[i], ice->state.so_target[i]);
pipe_so_target_reference(&ice->state.so_target[i],
i < num_targets ? targets[i] : NULL);
}
#if GFX_VER == 6
bool stored_num_prims = false;
for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
if (num_targets) {
struct crocus_stream_output_target *tgt =
(void *) ice->state.so_target[i];
if (!tgt)
continue;
if (offsets[i] == 0) {
// This means that we're supposed to ignore anything written to
// the buffer before. We can do this by just clearing out the
// count of writes to the prim count buffer.
tgt->count.offset_start = tgt->count.offset_end;
tgt->count.accum = 0;
ice->state.svbi = 0;
} else {
if (tgt->offset_res) {
compute_vertices_written_so_far(ice, tgt, &tgt->count, &ice->state.svbi);
tgt->count.offset_start = tgt->count.offset_end;
}
}
if (!stored_num_prims) {
crocus_stream_store_prims_written(batch, tgt);
stored_num_prims = true;
}
} else {
struct crocus_stream_output_target *tgt =
(void *) old_tgt[i];
if (tgt) {
if (!stored_num_prims) {
crocus_stream_store_prims_written(batch, tgt);
stored_num_prims = true;
}
if (tgt->offset_res) {
tgt->prev_count = tgt->count;
}
}
}
pipe_so_target_reference(&old_tgt[i], NULL);
}
ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
#else
for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
if (num_targets) {
struct crocus_stream_output_target *tgt =
(void *) ice->state.so_target[i];
if (offsets[i] == 0) {
#if GFX_VER == 8
if (tgt)
tgt->zero_offset = true;
#endif
crocus_load_register_imm32(batch, GEN7_SO_WRITE_OFFSET(i), 0);
}
else if (tgt)
crocus_load_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
tgt->offset_res->bo,
tgt->offset_offset);
} else {
struct crocus_stream_output_target *tgt =
(void *) old_tgt[i];
if (tgt)
crocus_store_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
tgt->offset_res->bo,
tgt->offset_offset, false);
}
pipe_so_target_reference(&old_tgt[i], NULL);
}
#endif
/* No need to update 3DSTATE_SO_BUFFER unless SOL is active. */
if (!active)
return;
#if GFX_VER >= 7
ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
#elif GFX_VER == 6
ice->state.dirty |= CROCUS_DIRTY_GEN6_SVBI;
#endif
}
#endif
#if GFX_VER >= 7
/**
* A crocus-vtable helper for encoding the 3DSTATE_SO_DECL_LIST and
* 3DSTATE_STREAMOUT packets.
*
* 3DSTATE_SO_DECL_LIST is a list of shader outputs we want the streamout
* hardware to record. We can create it entirely based on the shader, with
* no dynamic state dependencies.
*
* 3DSTATE_STREAMOUT is an annoying mix of shader-based information and
* state-based settings. We capture the shader-related ones here, and merge
* the rest in at draw time.
*/
static uint32_t *
crocus_create_so_decl_list(const struct pipe_stream_output_info *info,
const struct brw_vue_map *vue_map)
{
struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];
int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
int max_decls = 0;
STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);
memset(so_decl, 0, sizeof(so_decl));
/* Construct the list of SO_DECLs to be emitted. The formatting of the
* command feels strange -- each dword pair contains a SO_DECL per stream.
*/
for (unsigned i = 0; i < info->num_outputs; i++) {
const struct pipe_stream_output *output = &info->output[i];
const int buffer = output->output_buffer;
const int varying = output->register_index;
const unsigned stream_id = output->stream;
assert(stream_id < MAX_VERTEX_STREAMS);
buffer_mask[stream_id] |= 1 << buffer;
assert(vue_map->varying_to_slot[varying] >= 0);
/* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
* array. Instead, it simply increments DstOffset for the following
* input by the number of components that should be skipped.
*
* Our hardware is unusual in that it requires us to program SO_DECLs
* for fake "hole" components, rather than simply taking the offset
* for each real varying. Each hole can have size 1, 2, 3, or 4; we
* program as many size = 4 holes as we can, then a final hole to
* accommodate the final 1, 2, or 3 remaining.
*/
int skip_components = output->dst_offset - next_offset[buffer];
while (skip_components > 0) {
so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
.HoleFlag = 1,
.OutputBufferSlot = output->output_buffer,
.ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
};
skip_components -= 4;
}
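/* E.g. a 7-component gap (skip_components = 7) makes the loop above emit
* one hole with ComponentMask 0xf (4 components) followed by one with
* ComponentMask 0x7 (the remaining 3).
*/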
next_offset[buffer] = output->dst_offset + output->num_components;
so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
.OutputBufferSlot = output->output_buffer,
.RegisterIndex = vue_map->varying_to_slot[varying],
.ComponentMask =
((1 << output->num_components) - 1) << output->start_component,
};
if (decls[stream_id] > max_decls)
max_decls = decls[stream_id];
}
unsigned dwords = GENX(3DSTATE_STREAMOUT_length) + (3 + 2 * max_decls);
uint32_t *map = ralloc_size(NULL, sizeof(uint32_t) * dwords);
uint32_t *so_decl_map = map + GENX(3DSTATE_STREAMOUT_length);
crocus_pack_command(GENX(3DSTATE_STREAMOUT), map, sol) {
int urb_entry_read_offset = 0;
int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
urb_entry_read_offset;
/* We always read the whole vertex. This could be reduced at some
* point by reading less and offsetting the register index in the
* SO_DECLs.
*/
sol.Stream0VertexReadOffset = urb_entry_read_offset;
sol.Stream0VertexReadLength = urb_entry_read_length - 1;
sol.Stream1VertexReadOffset = urb_entry_read_offset;
sol.Stream1VertexReadLength = urb_entry_read_length - 1;
sol.Stream2VertexReadOffset = urb_entry_read_offset;
sol.Stream2VertexReadLength = urb_entry_read_length - 1;
sol.Stream3VertexReadOffset = urb_entry_read_offset;
sol.Stream3VertexReadLength = urb_entry_read_length - 1;
// TODO: Double-check that stride == 0 means no buffer. Probably this
// needs to go elsewhere, where the buffer enable stuff is actually
// known.
#if GFX_VER < 8
sol.SOBufferEnable0 = !!info->stride[0];
sol.SOBufferEnable1 = !!info->stride[1];
sol.SOBufferEnable2 = !!info->stride[2];
sol.SOBufferEnable3 = !!info->stride[3];
#else
/* Set buffer pitches; 0 means unbound. */
sol.Buffer0SurfacePitch = 4 * info->stride[0];
sol.Buffer1SurfacePitch = 4 * info->stride[1];
sol.Buffer2SurfacePitch = 4 * info->stride[2];
sol.Buffer3SurfacePitch = 4 * info->stride[3];
#endif
}
crocus_pack_command(GENX(3DSTATE_SO_DECL_LIST), so_decl_map, list) {
list.DWordLength = 3 + 2 * max_decls - 2;
list.StreamtoBufferSelects0 = buffer_mask[0];
list.StreamtoBufferSelects1 = buffer_mask[1];
list.StreamtoBufferSelects2 = buffer_mask[2];
list.StreamtoBufferSelects3 = buffer_mask[3];
list.NumEntries0 = decls[0];
list.NumEntries1 = decls[1];
list.NumEntries2 = decls[2];
list.NumEntries3 = decls[3];
}
for (int i = 0; i < max_decls; i++) {
crocus_pack_state(GENX(SO_DECL_ENTRY), so_decl_map + 3 + i * 2, entry) {
entry.Stream0Decl = so_decl[0][i];
entry.Stream1Decl = so_decl[1][i];
entry.Stream2Decl = so_decl[2][i];
entry.Stream3Decl = so_decl[3][i];
}
}
return map;
}
#endif
#if GFX_VER == 6
static void
crocus_emit_so_svbi(struct crocus_context *ice)
{
struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
unsigned max_vertex = 0xffffffff;
for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
struct crocus_stream_output_target *tgt =
(void *) ice->state.so_target[i];
if (tgt)
max_vertex = MIN2(max_vertex, tgt->base.buffer_size / tgt->stride);
}
crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
svbi.IndexNumber = 0;
svbi.StreamedVertexBufferIndex = (uint32_t)ice->state.svbi; /* fix when resuming, based on target's prim count */
svbi.MaximumIndex = max_vertex;
}
/* Initialize the rest of the SVBIs to reasonable values so that we don't
* run out of room writing the regular data.
*/
for (int i = 1; i < 4; i++) {
crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
svbi.IndexNumber = i;
svbi.StreamedVertexBufferIndex = 0;
svbi.MaximumIndex = 0xffffffff;
}
}
}
#endif
#if GFX_VER >= 6
static bool
crocus_is_drawing_points(const struct crocus_context *ice)
{
const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
if (cso_rast->cso.fill_front == PIPE_POLYGON_MODE_POINT ||
cso_rast->cso.fill_back == PIPE_POLYGON_MODE_POINT)
return true;
if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
const struct brw_gs_prog_data *gs_prog_data =
(void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
return gs_prog_data->output_topology == _3DPRIM_POINTLIST;
} else if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {
const struct brw_tes_prog_data *tes_data =
(void *) ice->shaders.prog[MESA_SHADER_TESS_EVAL]->prog_data;
return tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_POINT;
} else {
return ice->state.prim_mode == PIPE_PRIM_POINTS;
}
}
#endif
#if GFX_VER >= 6
static void
get_attr_override(
struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
const struct brw_vue_map *vue_map,
int urb_entry_read_offset, int fs_attr,
bool two_side_color, uint32_t *max_source_attr)
{
/* Find the VUE slot for this attribute. */
int slot = vue_map->varying_to_slot[fs_attr];
/* Viewport and Layer are stored in the VUE header. We need to override
* them to zero if earlier stages didn't write them, as GL requires that
* they read back as zero when not explicitly set.
*/
if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
attr->ComponentOverrideX = true;
attr->ComponentOverrideW = true;
attr->ConstantSource = CONST_0000;
if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
attr->ComponentOverrideY = true;
if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
attr->ComponentOverrideZ = true;
return;
}
/* If only a back color was written, and no front color, use the back
* color as the color rather than leaving it undefined.
*/
if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
if (slot == -1) {
/* This attribute does not exist in the VUE--that means that the vertex
* shader did not write to it. This means that either:
*
* (a) This attribute is a texture coordinate, and it is going to be
* replaced with point coordinates (as a consequence of a call to
* glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
* hardware will ignore whatever attribute override we supply.
*
* (b) This attribute is read by the fragment shader but not written by
* the vertex shader, so its value is undefined. Therefore the
* attribute override we supply doesn't matter.
*
* (c) This attribute is gl_PrimitiveID, and it wasn't written by the
* previous shader stage.
*
* Note that we don't have to worry about the cases where the attribute
* is gl_PointCoord or is undergoing point sprite coordinate
* replacement, because in those cases, this function isn't called.
*
* In case (c), we need to program the attribute overrides so that the
* primitive ID will be stored in this slot. In every other case, the
* attribute override we supply doesn't matter. So just go ahead and
* program primitive ID in every case.
*/
attr->ComponentOverrideW = true;
attr->ComponentOverrideX = true;
attr->ComponentOverrideY = true;
attr->ComponentOverrideZ = true;
attr->ConstantSource = PRIM_ID;
return;
}
/* Compute the location of the attribute relative to urb_entry_read_offset.
* Each increment of urb_entry_read_offset represents a 256-bit value, so
* it counts for two 128-bit VUE slots.
*/
int source_attr = slot - 2 * urb_entry_read_offset;
assert(source_attr >= 0 && source_attr < 32);
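/* E.g. with urb_entry_read_offset = 1 the SF skips the first two 128-bit
* VUE slots (typically the VUE header), so an attribute living in VUE
* slot 5 is addressed as SourceAttribute 3.
*/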
/* If we are doing two-sided color, and the VUE slot following this one
* represents a back-facing color, then we need to instruct the SF unit to
* do back-facing swizzling.
*/
bool swizzling = two_side_color &&
((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
(vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));
/* Update max_source_attr. If swizzling, the SF will read this slot + 1. */
if (*max_source_attr < source_attr + swizzling)
*max_source_attr = source_attr + swizzling;
attr->SourceAttribute = source_attr;
if (swizzling)
attr->SwizzleSelect = INPUTATTR_FACING;
}
static void
calculate_attr_overrides(
const struct crocus_context *ice,
struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
uint32_t *point_sprite_enables,
uint32_t *urb_entry_read_length,
uint32_t *urb_entry_read_offset)
{
const struct brw_wm_prog_data *wm_prog_data = (void *)
ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
const struct brw_vue_map *vue_map = ice->shaders.last_vue_map;
const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
uint32_t max_source_attr = 0;
const struct shader_info *fs_info =
crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
int first_slot =
brw_compute_first_urb_slot_required(fs_info->inputs_read, vue_map);
/* Each URB offset packs two varying slots */
assert(first_slot % 2 == 0);
*urb_entry_read_offset = first_slot / 2;
*point_sprite_enables = 0;
for (int fs_attr = 0; fs_attr < VARYING_SLOT_MAX; fs_attr++) {
const int input_index = wm_prog_data->urb_setup[fs_attr];
if (input_index < 0)
continue;
bool point_sprite = false;
if (crocus_is_drawing_points(ice)) {
if (fs_attr >= VARYING_SLOT_TEX0 &&
fs_attr <= VARYING_SLOT_TEX7 &&
cso_rast->cso.sprite_coord_enable & (1 << (fs_attr - VARYING_SLOT_TEX0)))
point_sprite = true;
if (fs_attr == VARYING_SLOT_PNTC)
point_sprite = true;
if (point_sprite)
*point_sprite_enables |= 1U << input_index;
}
struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
if (!point_sprite) {
get_attr_override(&attribute, vue_map, *urb_entry_read_offset, fs_attr,
cso_rast->cso.light_twoside, &max_source_attr);
}
/* The hardware can only apply overrides to the first 16 attributes;
* the remaining (up to 16 more) must be lined up so that the input
* index equals the output index, which the assert below checks.
*/
if (input_index < 16)
attr_overrides[input_index] = attribute;
else
assert(attribute.SourceAttribute == input_index);
}
/* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
* 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
*
* "This field should be set to the minimum length required to read the
* maximum source attribute. The maximum source attribute is indicated
* by the maximum value of the enabled Attribute # Source Attribute if
* Attribute Swizzle Enable is set, Number of Output Attributes-1 if
* enable is not set.
* read_length = ceiling((max_source_attr + 1) / 2)
*
* [errata] Corruption/Hang possible if length programmed larger than
* recommended"
*
* Similar text exists for Ivy Bridge.
*/
*urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
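/* E.g. max_source_attr = 4 gives read_length = ceil(5 / 2) = 3, i.e. the
* SF reads three 256-bit URB rows to cover source attributes 0..4.
*/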
}
#endif
#if GFX_VER >= 7
static void
crocus_emit_sbe(struct crocus_batch *batch, const struct crocus_context *ice)
{
const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
const struct brw_wm_prog_data *wm_prog_data = (void *)
ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
#if GFX_VER >= 8
struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
#else
#define attr_overrides sbe.Attribute
#endif
uint32_t urb_entry_read_length;
uint32_t urb_entry_read_offset;
uint32_t point_sprite_enables;
crocus_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
sbe.AttributeSwizzleEnable = true;
sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
sbe.PointSpriteTextureCoordinateOrigin = cso_rast->cso.sprite_coord_mode;
calculate_attr_overrides(ice,
attr_overrides,
&point_sprite_enables,
&urb_entry_read_length,
&urb_entry_read_offset);
sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
sbe.VertexURBEntryReadLength = urb_entry_read_length;
sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
#if GFX_VER >= 8
sbe.ForceVertexURBEntryReadLength = true;
sbe.ForceVertexURBEntryReadOffset = true;
#endif
}
#if GFX_VER >= 8
crocus_emit_cmd(batch, GENX(3DSTATE_SBE_SWIZ), sbes) {
for (int i = 0; i < 16; i++)
sbes.Attribute[i] = attr_overrides[i];
}
#endif
}
#endif
/* ------------------------------------------------------------------- */
/**
* Populate VS program key fields based on the current state.
*/
static void
crocus_populate_vs_key(const struct crocus_context *ice,
const struct shader_info *info,
gl_shader_stage last_stage,
struct brw_vs_prog_key *key)
{
const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
if (info->clip_distance_array_size == 0 &&
(info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
last_stage == MESA_SHADER_VERTEX)
key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
#if GFX_VER <= 5
key->copy_edgeflag = (cso_rast->cso.fill_back != PIPE_POLYGON_MODE_FILL ||
cso_rast->cso.fill_front != PIPE_POLYGON_MODE_FILL);
key->point_coord_replace = cso_rast->cso.sprite_coord_enable & 0xff;
#endif
key->clamp_vertex_color = cso_rast->cso.clamp_vertex_color;
#if GFX_VERx10 < 75
uint64_t inputs_read = info->inputs_read;
int ve_idx = 0;
while (inputs_read) {
int i = u_bit_scan64(&inputs_read);
key->gl_attrib_wa_flags[i] = ice->state.cso_vertex_elements->wa_flags[ve_idx];
ve_idx++;
}
#endif
}
/**
* Populate TCS program key fields based on the current state.
*/
static void
crocus_populate_tcs_key(const struct crocus_context *ice,
struct brw_tcs_prog_key *key)
{
}
/**
* Populate TES program key fields based on the current state.
*/
static void
crocus_populate_tes_key(const struct crocus_context *ice,
const struct shader_info *info,
gl_shader_stage last_stage,
struct brw_tes_prog_key *key)
{
const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
if (info->clip_distance_array_size == 0 &&
(info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
last_stage == MESA_SHADER_TESS_EVAL)
key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
}
/**
* Populate GS program key fields based on the current state.
*/
static void
crocus_populate_gs_key(const struct crocus_context *ice,
const struct shader_info *info,
gl_shader_stage last_stage,
struct brw_gs_prog_key *key)
{
const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
if (info->clip_distance_array_size == 0 &&
(info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
last_stage == MESA_SHADER_GEOMETRY)
key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
}
/**
* Populate FS program key fields based on the current state.
*/
static void
crocus_populate_fs_key(const struct crocus_context *ice,
const struct shader_info *info,
struct brw_wm_prog_key *key)
{
struct crocus_screen *screen = (void *) ice->ctx.screen;
const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
const struct crocus_depth_stencil_alpha_state *zsa = ice->state.cso_zsa;
const struct crocus_rasterizer_state *rast = ice->state.cso_rast;
const struct crocus_blend_state *blend = ice->state.cso_blend;
#if GFX_VER < 6
uint32_t lookup = 0;
if (info->fs.uses_discard || zsa->cso.alpha_enabled)
lookup |= BRW_WM_IZ_PS_KILL_ALPHATEST_BIT;
if (info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
lookup |= BRW_WM_IZ_PS_COMPUTES_DEPTH_BIT;
if (fb->zsbuf && zsa->cso.depth_enabled) {
lookup |= BRW_WM_IZ_DEPTH_TEST_ENABLE_BIT;
if (zsa->cso.depth_writemask)
lookup |= BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT;
}
if (zsa->cso.stencil[0].enabled || zsa->cso.stencil[1].enabled) {
lookup |= BRW_WM_IZ_STENCIL_TEST_ENABLE_BIT;
if (zsa->cso.stencil[0].writemask || zsa->cso.stencil[1].writemask)
lookup |= BRW_WM_IZ_STENCIL_WRITE_ENABLE_BIT;
}
key->iz_lookup = lookup;
key->stats_wm = ice->state.stats_wm;
#endif
uint32_t line_aa = BRW_WM_AA_NEVER;
if (rast->cso.line_smooth) {
int reduced_prim = u_reduced_prim(ice->state.prim_mode);
if (reduced_prim == PIPE_PRIM_LINES)
line_aa = BRW_WM_AA_ALWAYS;
else if (reduced_prim == PIPE_PRIM_TRIANGLES) {
if (rast->cso.fill_front == PIPE_POLYGON_MODE_LINE) {
line_aa = BRW_WM_AA_SOMETIMES;
if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE ||
rast->cso.cull_face == PIPE_FACE_BACK)
line_aa = BRW_WM_AA_ALWAYS;
} else if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE) {
line_aa = BRW_WM_AA_SOMETIMES;
if (rast->cso.cull_face == PIPE_FACE_FRONT)
line_aa = BRW_WM_AA_ALWAYS;
}
}
}
key->line_aa = line_aa;
key->nr_color_regions = fb->nr_cbufs;
key->clamp_fragment_color = rast->cso.clamp_fragment_color;
key->alpha_to_coverage = blend->cso.alpha_to_coverage;
key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->cso.alpha_enabled;
key->flat_shade = rast->cso.flatshade &&
(info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1));
key->persample_interp = rast->cso.force_persample_interp;
key->multisample_fbo = rast->cso.multisample && fb->samples > 1;
key->ignore_sample_mask_out = !key->multisample_fbo;
key->coherent_fb_fetch = false; // TODO: needed?
key->force_dual_color_blend =
screen->driconf.dual_color_blend_by_location &&
(blend->blend_enables & 1) && blend->dual_color_blending;
/* TODO: Respect glHint for key->high_quality_derivatives */
#if GFX_VER <= 5
if (fb->nr_cbufs > 1 && zsa->cso.alpha_enabled) {
key->alpha_test_func = zsa->cso.alpha_func;
key->alpha_test_ref = zsa->cso.alpha_ref_value;
}
#endif
}
static void
crocus_populate_cs_key(const struct crocus_context *ice,
struct brw_cs_prog_key *key)
{
}
#if GFX_VER == 4
#define KSP(ice, shader) ro_bo((ice)->shaders.cache_bo, (shader)->offset)
#elif GFX_VER >= 5
static uint64_t
KSP(const struct crocus_context *ice, const struct crocus_compiled_shader *shader)
{
return shader->offset;
}
#endif
/* Gen11 workaround table #2056 WABTPPrefetchDisable suggests to disable
* prefetching of binding tables in A0 and B0 steppings. XXX: Revisit
* this WA on C0 stepping.
*
* TODO: Fill out SamplerCount for prefetching?
*/
#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage) \
pkt.KernelStartPointer = KSP(ice, shader); \
pkt.BindingTableEntryCount = shader->bt.size_bytes / 4; \
pkt.FloatingPointMode = prog_data->use_alt_mode; \
\
pkt.DispatchGRFStartRegisterForURBData = \
prog_data->dispatch_grf_start_reg; \
pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length; \
pkt.prefix##URBEntryReadOffset = 0; \
\
pkt.StatisticsEnable = true; \
pkt.Enable = true; \
\
if (prog_data->total_scratch) { \
struct crocus_bo *bo = \
crocus_get_scratch_space(ice, prog_data->total_scratch, stage); \
pkt.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11; \
pkt.ScratchSpaceBasePointer = rw_bo(bo, 0); \
}
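/* PerThreadScratchSpace above is a log2 encoding: the field means
* 1KB << n bytes of scratch per thread, and total_scratch is expected to
* be a power of two of at least 1KB, so e.g. total_scratch = 4096 gives
* ffs(4096) - 11 = 13 - 11 = 2, i.e. 1KB << 2 = 4KB per thread.
*/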
/* ------------------------------------------------------------------- */
#if GFX_VER >= 6
static const uint32_t push_constant_opcodes[] = {
[MESA_SHADER_VERTEX] = 21,
[MESA_SHADER_TESS_CTRL] = 25, /* HS */
[MESA_SHADER_TESS_EVAL] = 26, /* DS */
[MESA_SHADER_GEOMETRY] = 22,
[MESA_SHADER_FRAGMENT] = 23,
[MESA_SHADER_COMPUTE] = 0,
};
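/* These are the command sub-opcodes of the per-stage 3DSTATE_CONSTANT_*
* packets (e.g. 21 is 3DSTATE_CONSTANT_VS, 23 is 3DSTATE_CONSTANT_PS);
* indexing by gl_shader_stage lets shared code pick the right packet.
*/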
#endif
static void
emit_sized_null_surface(struct crocus_batch *batch,
unsigned width, unsigned height,
unsigned layers, unsigned levels,
unsigned minimum_array_element,
uint32_t *out_offset)
{
struct isl_device *isl_dev = &batch->screen->isl_dev;
uint32_t *surf = stream_state(batch, isl_dev->ss.size,
isl_dev->ss.align,
out_offset);
//TODO gen 6 multisample crash
isl_null_fill_state(isl_dev, surf,
.size = isl_extent3d(width, height, layers),
.levels = levels,
.minimum_array_element = minimum_array_element);
}
static void
emit_null_surface(struct crocus_batch *batch,
uint32_t *out_offset)
{
emit_sized_null_surface(batch, 1, 1, 1, 0, 0, out_offset);
}
static void
emit_null_fb_surface(struct crocus_batch *batch,
struct crocus_context *ice,
uint32_t *out_offset)
{
uint32_t width, height, layers, level, layer;
/* If set_framebuffer_state() was never called, fall back to 1x1x1 */
if (ice->state.framebuffer.width == 0 && ice->state.framebuffer.height == 0) {
emit_null_surface(batch, out_offset);
return;
}
struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
width = MAX2(cso->width, 1);
height = MAX2(cso->height, 1);
layers = cso->layers ? cso->layers : 1;
level = 0;
layer = 0;
if (cso->nr_cbufs == 0 && cso->zsbuf) {
width = cso->zsbuf->width;
height = cso->zsbuf->height;
level = cso->zsbuf->u.tex.level;
layer = cso->zsbuf->u.tex.first_layer;
}
emit_sized_null_surface(batch, width, height,
layers, level, layer,
out_offset);
}
static void
emit_surface_state(struct crocus_batch *batch,
struct crocus_resource *res,
const struct isl_surf *in_surf,
bool adjust_surf,
struct isl_view *view,
bool writeable,
enum isl_aux_usage aux_usage,
bool blend_enable,
uint32_t write_disables,
uint32_t *surf_state,
uint32_t addr_offset)
{
const struct intel_device_info *devinfo = &batch->screen->devinfo;
struct isl_device *isl_dev = &batch->screen->isl_dev;
uint32_t reloc = RELOC_32BIT;
uint64_t offset_B = res->offset;
uint32_t tile_x_sa = 0, tile_y_sa = 0;
if (writeable)
reloc |= RELOC_WRITE;
struct isl_surf surf = *in_surf;
if (adjust_surf) {
if (res->base.b.target == PIPE_TEXTURE_3D && view->array_len == 1) {
isl_surf_get_image_surf(isl_dev, in_surf,
view->base_level, 0,
view->base_array_layer,
&surf, &offset_B,
&tile_x_sa, &tile_y_sa);
view->base_array_layer = 0;
view->base_level = 0;
} else if (res->base.b.target == PIPE_TEXTURE_CUBE && devinfo->ver == 4) {
isl_surf_get_image_surf(isl_dev, in_surf,
view->base_level, view->base_array_layer,
0,
&surf, &offset_B,
&tile_x_sa, &tile_y_sa);
view->base_array_layer = 0;
view->base_level = 0;
} else if (res->base.b.target == PIPE_TEXTURE_1D_ARRAY)
surf.dim = ISL_SURF_DIM_2D;
}
union isl_color_value clear_color = { .u32 = { 0, 0, 0, 0 } };
struct crocus_bo *aux_bo = NULL;
uint32_t aux_offset = 0;
struct isl_surf *aux_surf = NULL;
if (aux_usage != ISL_AUX_USAGE_NONE) {
aux_surf = &res->aux.surf;
aux_offset = res->aux.offset;
aux_bo = res->aux.bo;
clear_color = crocus_resource_get_clear_color(res);
}
isl_surf_fill_state(isl_dev, surf_state,
.surf = &surf,
.view = view,
.address = crocus_state_reloc(batch,
addr_offset + isl_dev->ss.addr_offset,
res->bo, offset_B, reloc),
.aux_surf = aux_surf,
.aux_usage = aux_usage,
.aux_address = aux_offset,
.mocs = crocus_mocs(res->bo, isl_dev),
.clear_color = clear_color,
.use_clear_address = false,
.clear_address = 0,
.x_offset_sa = tile_x_sa,
.y_offset_sa = tile_y_sa,
#if GFX_VER <= 5
.blend_enable = blend_enable,
.write_disables = write_disables,
#endif
);
if (aux_surf) {
/* On gen7 and prior, the upper 20 bits of surface state DWORD 6 are the
* upper 20 bits of the GPU address of the MCS buffer; the lower 12 bits
* contain other control information. Since buffer addresses are always
* on 4k boundaries (and thus have their lower 12 bits zero), we can use
* an ordinary reloc to do the necessary address translation.
*
* FIXME: move to the point of assignment.
*/
if (devinfo->ver == 8) {
uint64_t *aux_addr = (uint64_t *)(surf_state + (isl_dev->ss.aux_addr_offset / 4));
*aux_addr = crocus_state_reloc(batch,
addr_offset + isl_dev->ss.aux_addr_offset,
aux_bo, *aux_addr,
reloc);
} else {
uint32_t *aux_addr = surf_state + (isl_dev->ss.aux_addr_offset / 4);
*aux_addr = crocus_state_reloc(batch,
addr_offset + isl_dev->ss.aux_addr_offset,
aux_bo, *aux_addr,
reloc);
}
}
}
static uint32_t
emit_surface(struct crocus_batch *batch,
struct crocus_surface *surf,
enum isl_aux_usage aux_usage,
bool blend_enable,
uint32_t write_disables)
{
const struct intel_device_info *devinfo = &batch->screen->devinfo;
struct isl_device *isl_dev = &batch->screen->isl_dev;
struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
struct isl_view *view = &surf->view;
uint32_t offset = 0;
enum pipe_texture_target target = res->base.b.target;
bool adjust_surf = false;
if (devinfo->ver == 4 && target == PIPE_TEXTURE_CUBE)
adjust_surf = true;
if (surf->align_res)
res = (struct crocus_resource *)surf->align_res;
uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
emit_surface_state(batch, res, &surf->surf, adjust_surf, view, true,
aux_usage, blend_enable,
write_disables,
surf_state, offset);
return offset;
}
static uint32_t
emit_rt_surface(struct crocus_batch *batch,
struct crocus_surface *surf,
enum isl_aux_usage aux_usage)
{
struct isl_device *isl_dev = &batch->screen->isl_dev;
struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
struct isl_view *view = &surf->read_view;
uint32_t offset = 0;
uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
emit_surface_state(batch, res, &surf->surf, true, view, false,
aux_usage, false, 0,
surf_state, offset);
return offset;
}
static uint32_t
emit_grid(struct crocus_context *ice,
struct crocus_batch *batch)
{
UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
uint32_t offset = 0;
struct crocus_state_ref *grid_ref = &ice->state.grid_size;
uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
isl_dev->ss.align, &offset);
isl_buffer_fill_state(isl_dev, surf_state,
.address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
crocus_resource_bo(grid_ref->res),
grid_ref->offset,
RELOC_32BIT),
.size_B = 12,
.format = ISL_FORMAT_RAW,
.stride_B = 1,
.mocs = crocus_mocs(crocus_resource_bo(grid_ref->res), isl_dev));
return offset;
}
static uint32_t
emit_ubo_buffer(struct crocus_context *ice,
struct crocus_batch *batch,
struct pipe_constant_buffer *buffer)
{
UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
uint32_t offset = 0;
uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
isl_dev->ss.align, &offset);
isl_buffer_fill_state(isl_dev, surf_state,
.address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
crocus_resource_bo(buffer->buffer),
buffer->buffer_offset,
RELOC_32BIT),
.size_B = buffer->buffer_size,
.format = ISL_FORMAT_R32G32B32A32_FLOAT,
.swizzle = ISL_SWIZZLE_IDENTITY,
.stride_B = 1,
.mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));
return offset;
}
static uint32_t
emit_ssbo_buffer(struct crocus_context *ice,
struct crocus_batch *batch,
struct pipe_shader_buffer *buffer, bool writeable)
{
UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
uint32_t offset = 0;
uint32_t reloc = RELOC_32BIT;
if (writeable)
reloc |= RELOC_WRITE;
uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
isl_dev->ss.align, &offset);
isl_buffer_fill_state(isl_dev, surf_state,
.address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
crocus_resource_bo(buffer->buffer),
buffer->buffer_offset,
reloc),
.size_B = buffer->buffer_size,
.format = ISL_FORMAT_RAW,
.swizzle = ISL_SWIZZLE_IDENTITY,
.stride_B = 1,
.mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));
return offset;
}
static uint32_t
emit_sampler_view(struct crocus_context *ice,
struct crocus_batch *batch,
bool for_gather,
struct crocus_sampler_view *isv)
{
UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
uint32_t offset = 0;
uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
isl_dev->ss.align, &offset);
if (isv->base.target == PIPE_BUFFER) {
const struct isl_format_layout *fmtl = isl_format_get_layout(isv->view.format);
const unsigned cpp = isv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
unsigned final_size =
MIN3(isv->base.u.buf.size, isv->res->bo->size - isv->res->offset,
CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
isl_buffer_fill_state(isl_dev, surf_state,
.address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
isv->res->bo,
isv->res->offset + isv->base.u.buf.offset, RELOC_32BIT),
.size_B = final_size,
.format = isv->view.format,
.swizzle = isv->view.swizzle,
.stride_B = cpp,
.mocs = crocus_mocs(isv->res->bo, isl_dev)
);
} else {
enum isl_aux_usage aux_usage =
crocus_resource_texture_aux_usage(isv->res);
emit_surface_state(batch, isv->res, &isv->res->surf, false,
for_gather ? &isv->gather_view : &isv->view,
false, aux_usage, false,
0, surf_state, offset);
}
return offset;
}
static uint32_t
emit_image_view(struct crocus_context *ice,
struct crocus_batch *batch,
struct crocus_image_view *iv)
{
UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
uint32_t offset = 0;
struct crocus_resource *res = (struct crocus_resource *)iv->base.resource;
uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
isl_dev->ss.align, &offset);
bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE;
uint32_t reloc = RELOC_32BIT | (write ? RELOC_WRITE : 0);
if (res->base.b.target == PIPE_BUFFER) {
const struct isl_format_layout *fmtl = isl_format_get_layout(iv->view.format);
const unsigned cpp = iv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
unsigned final_size =
MIN3(iv->base.u.buf.size, res->bo->size - res->offset - iv->base.u.buf.offset,
CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
isl_buffer_fill_state(isl_dev, surf_state,
.address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
res->bo,
res->offset + iv->base.u.buf.offset, reloc),
.size_B = final_size,
.format = iv->view.format,
.swizzle = iv->view.swizzle,
.stride_B = cpp,
.mocs = crocus_mocs(res->bo, isl_dev)
);
} else {
if (iv->view.format == ISL_FORMAT_RAW) {
isl_buffer_fill_state(isl_dev, surf_state,
.address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
res->bo,
res->offset, reloc),
.size_B = res->bo->size - res->offset,
.format = iv->view.format,
.swizzle = iv->view.swizzle,
.stride_B = 1,
.mocs = crocus_mocs(res->bo, isl_dev),
);
} else {
emit_surface_state(batch, res,
&res->surf, false, &iv->view,
write, 0, false,
0, surf_state, offset);
}
}
return offset;
}
#if GFX_VER == 6
static uint32_t
emit_sol_surface(struct crocus_batch *batch,
struct pipe_stream_output_info *so_info,
uint32_t idx)
{
struct crocus_context *ice = batch->ice;
if (idx >= so_info->num_outputs || !ice->state.streamout_active)
return 0;
const struct pipe_stream_output *output = &so_info->output[idx];
const int buffer = output->output_buffer;
assert(output->stream == 0);
struct crocus_resource *buf = (struct crocus_resource *)ice->state.so_target[buffer]->buffer;
unsigned stride_dwords = so_info->stride[buffer];
unsigned offset_dwords = ice->state.so_target[buffer]->buffer_offset / 4 + output->dst_offset;
size_t size_dwords = (ice->state.so_target[buffer]->buffer_offset + ice->state.so_target[buffer]->buffer_size) / 4;
unsigned num_vector_components = output->num_components;
unsigned num_elements;
/* FIXME: can we rely on core Mesa to ensure that the buffer isn't
* too big to map using a single binding table entry?
*/
// assert((size_dwords - offset_dwords) / stride_dwords
// <= BRW_MAX_NUM_BUFFER_ENTRIES);
if (size_dwords > offset_dwords + num_vector_components) {
/* There is room for at least 1 transform feedback output in the buffer.
* Compute the number of additional transform feedback outputs the
* buffer has room for.
*/
num_elements =
(size_dwords - offset_dwords - num_vector_components);
} else {
/* There isn't even room for a single transform feedback output in the
* buffer. We can't configure the binding table entry to prevent output
* entirely; we'll have to rely on the geometry shader to detect
* overflow. But to minimize the damage in case of a bug, set up the
* binding table entry to just allow a single output.
*/
num_elements = 0;
}
num_elements += stride_dwords;
uint32_t surface_format;
switch (num_vector_components) {
case 1:
surface_format = ISL_FORMAT_R32_FLOAT;
break;
case 2:
surface_format = ISL_FORMAT_R32G32_FLOAT;
break;
case 3:
surface_format = ISL_FORMAT_R32G32B32_FLOAT;
break;
case 4:
surface_format = ISL_FORMAT_R32G32B32A32_FLOAT;
break;
default:
unreachable("Invalid vector size for transform feedback output");
}
UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
uint32_t offset = 0;
uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
isl_dev->ss.align, &offset);
isl_buffer_fill_state(isl_dev, surf_state,
.address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
crocus_resource_bo(&buf->base.b),
offset_dwords * 4, RELOC_32BIT|RELOC_WRITE),
.size_B = num_elements * 4,
.stride_B = stride_dwords * 4,
.swizzle = ISL_SWIZZLE_IDENTITY,
.format = surface_format);
return offset;
}
#endif
#define foreach_surface_used(index, group) \
for (int index = 0; index < bt->sizes[group]; index++) \
if (crocus_group_index_to_bti(bt, group, index) != \
CROCUS_SURFACE_NOT_USED)
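/* foreach_surface_used(index, group) expands to a loop over every slot in
* a binding-table group, skipping slots the shader never had assigned a
* binding table index; e.g. foreach_surface_used(i, CROCUS_SURFACE_GROUP_UBO)
* visits only the UBO entries that are actually present in the table.
*/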
static void
crocus_populate_binding_table(struct crocus_context *ice,
struct crocus_batch *batch,
gl_shader_stage stage, bool ff_gs)
{
struct crocus_compiled_shader *shader = ff_gs ? ice->shaders.ff_gs_prog : ice->shaders.prog[stage];
struct crocus_shader_state *shs = ff_gs ? NULL : &ice->state.shaders[stage];
if (!shader)
return;
struct crocus_binding_table *bt = &shader->bt;
int s = 0;
uint32_t *surf_offsets = shader->surf_offset;
#if GFX_VER < 8
const struct shader_info *info = crocus_get_shader_info(ice, stage);
#endif
if (stage == MESA_SHADER_FRAGMENT) {
struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
/* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */
if (cso_fb->nr_cbufs) {
for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
uint32_t write_disables = 0;
bool blend_enable = false;
#if GFX_VER <= 5
const struct pipe_rt_blend_state *rt =
&ice->state.cso_blend->cso.rt[ice->state.cso_blend->cso.independent_blend_enable ? i : 0];
struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
write_disables |= (rt->colormask & PIPE_MASK_A) ? 0x0 : 0x8;
write_disables |= (rt->colormask & PIPE_MASK_R) ? 0x0 : 0x4;
write_disables |= (rt->colormask & PIPE_MASK_G) ? 0x0 : 0x2;
write_disables |= (rt->colormask & PIPE_MASK_B) ? 0x0 : 0x1;
/* Gen4/5 can't handle blending off when a dual src blend wm is enabled. */
blend_enable = rt->blend_enable || wm_prog_data->dual_src_blend;
#endif
if (cso_fb->cbufs[i]) {
surf_offsets[s] = emit_surface(batch,
(struct crocus_surface *)cso_fb->cbufs[i],
ice->state.draw_aux_usage[i],
blend_enable,
write_disables);
} else {
emit_null_fb_surface(batch, ice, &surf_offsets[s]);
}
s++;
}
} else {
emit_null_fb_surface(batch, ice, &surf_offsets[s]);
s++;
}
foreach_surface_used(i, CROCUS_SURFACE_GROUP_RENDER_TARGET_READ) {
struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
if (cso_fb->cbufs[i]) {
surf_offsets[s++] = emit_rt_surface(batch,
(struct crocus_surface *)cso_fb->cbufs[i],
ice->state.draw_aux_usage[i]);
}
}
}
if (stage == MESA_SHADER_COMPUTE) {
foreach_surface_used(i, CROCUS_SURFACE_GROUP_CS_WORK_GROUPS) {
surf_offsets[s] = emit_grid(ice, batch);
s++;
}
}
#if GFX_VER == 6
if (stage == MESA_SHADER_GEOMETRY) {
struct pipe_stream_output_info *so_info;
if (ice->shaders.uncompiled[MESA_SHADER_GEOMETRY])
so_info = &ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]->stream_output;
else
so_info = &ice->shaders.uncompiled[MESA_SHADER_VERTEX]->stream_output;
foreach_surface_used(i, CROCUS_SURFACE_GROUP_SOL) {
surf_offsets[s] = emit_sol_surface(batch, so_info, i);
s++;
}
}
#endif
foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE) {
struct crocus_sampler_view *view = shs->textures[i];
if (view)
surf_offsets[s] = emit_sampler_view(ice, batch, false, view);
else
emit_null_surface(batch, &surf_offsets[s]);
s++;
}
#if GFX_VER < 8
if (info && info->uses_texture_gather) {
foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE_GATHER) {
struct crocus_sampler_view *view = shs->textures[i];
if (view)
surf_offsets[s] = emit_sampler_view(ice, batch, true, view);
else
emit_null_surface(batch, &surf_offsets[s]);
s++;
}
}
#endif
foreach_surface_used(i, CROCUS_SURFACE_GROUP_IMAGE) {
struct crocus_image_view *view = &shs->image[i];
if (view->base.resource)
surf_offsets[s] = emit_image_view(ice, batch, view);
else
emit_null_surface(batch, &surf_offsets[s]);
s++;
}
foreach_surface_used(i, CROCUS_SURFACE_GROUP_UBO) {
if (shs->constbufs[i].buffer)
surf_offsets[s] = emit_ubo_buffer(ice, batch, &shs->constbufs[i]);
else
emit_null_surface(batch, &surf_offsets[s]);
s++;
}
foreach_surface_used(i, CROCUS_SURFACE_GROUP_SSBO) {
if (shs->ssbo[i].buffer)
surf_offsets[s] = emit_ssbo_buffer(ice, batch, &shs->ssbo[i],
!!(shs->writable_ssbos & (1 << i)));
else
emit_null_surface(batch, &surf_offsets[s]);
s++;
}
}
/* ------------------------------------------------------------------- */
static uint32_t
crocus_upload_binding_table(struct crocus_context *ice,
struct crocus_batch *batch,
uint32_t *table,
uint32_t size)
{
if (size == 0)
return 0;
return emit_state(batch, table, size, 32);
}
/**
* Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address.
*/
static void
crocus_update_surface_base_address(struct crocus_batch *batch)
{
if (batch->state_base_address_emitted)
return;
#if GFX_VER >= 6
uint32_t mocs = batch->screen->isl_dev.mocs.internal;
#endif
flush_before_state_base_change(batch);
crocus_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
sba.SurfaceStateBaseAddressModifyEnable = true;
sba.SurfaceStateBaseAddress = ro_bo(batch->state.bo, 0);
#if GFX_VER >= 5
sba.InstructionBaseAddress = ro_bo(batch->ice->shaders.cache_bo, 0); // TODO!
#endif
sba.GeneralStateBaseAddressModifyEnable = true;
sba.IndirectObjectBaseAddressModifyEnable = true;
#if GFX_VER >= 5
sba.InstructionBaseAddressModifyEnable = true;
#endif
#if GFX_VER < 8
sba.GeneralStateAccessUpperBoundModifyEnable = true;
#endif
#if GFX_VER >= 5 && GFX_VER < 8
sba.IndirectObjectAccessUpperBoundModifyEnable = true;
sba.InstructionAccessUpperBoundModifyEnable = true;
#endif
#if GFX_VER <= 5
sba.GeneralStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
#endif
#if GFX_VER >= 6
/* The hardware appears to pay attention to the MOCS fields even
* if you don't set the "Address Modify Enable" bit for the base.
*/
sba.GeneralStateMOCS = mocs;
sba.StatelessDataPortAccessMOCS = mocs;
#if GFX_VER == 8
sba.DynamicStateMOCS = mocs;
sba.IndirectObjectMOCS = mocs;
sba.InstructionMOCS = mocs;
sba.SurfaceStateMOCS = mocs;
sba.GeneralStateBufferSize = 0xfffff;
sba.IndirectObjectBufferSize = 0xfffff;
sba.InstructionBufferSize = 0xfffff;
sba.DynamicStateBufferSize = MAX_STATE_SIZE;
sba.GeneralStateBufferSizeModifyEnable = true;
sba.DynamicStateBufferSizeModifyEnable = true;
sba.IndirectObjectBufferSizeModifyEnable = true;
sba.InstructionBuffersizeModifyEnable = true;
#endif
sba.DynamicStateBaseAddressModifyEnable = true;
sba.DynamicStateBaseAddress = ro_bo(batch->state.bo, 0);
/* Dynamic state upper bound. Although the documentation says that
* programming it to zero will cause it to be ignored, that is a lie.
* If this isn't programmed to a real bound, the sampler border color
* pointer is rejected, causing border color to mysteriously fail.
*/
#if GFX_VER < 8
sba.DynamicStateAccessUpperBoundModifyEnable = true;
sba.DynamicStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
#endif
#endif
}
flush_after_state_base_change(batch);
/* According to section 3.6.1 of VOL1 of the 965 PRM,
* STATE_BASE_ADDRESS updates require a reissue of:
*
* 3DSTATE_PIPELINE_POINTERS
* 3DSTATE_BINDING_TABLE_POINTERS
* MEDIA_STATE_POINTERS
*
* and this continues through Ironlake. The Sandy Bridge PRM, vol
* 1 part 1 says that the following packets must be reissued:
*
* 3DSTATE_CC_POINTERS
* 3DSTATE_BINDING_TABLE_POINTERS
* 3DSTATE_SAMPLER_STATE_POINTERS
* 3DSTATE_VIEWPORT_STATE_POINTERS
* MEDIA_STATE_POINTERS
*
* Those are always reissued following SBA updates anyway (new
* batch time), except in the case of the program cache BO
* changing. Having a separate state flag makes the sequence more
* obvious.
*/
#if GFX_VER <= 5
batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
#elif GFX_VER == 6
batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS | CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
#endif
batch->state_base_address_emitted = true;
}
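/*
 * Compute the [zmin, zmax] depth range covered by a viewport.  When
 * positions are already in window space, the viewport transform is
 * bypassed and the full [0, 1] range is used instead.
 */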
static inline void
crocus_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
bool window_space_position, float *zmin, float *zmax)
{
if (window_space_position) {
*zmin = 0.f;
*zmax = 1.f;
return;
}
util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
}
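/*
 * The hardware provides four push constant buffer slots per stage
 * (3DSTATE_CONSTANT_*), so we collect up to four ranges here, along
 * with the length of the longest one.
 */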
struct push_bos {
struct {
struct crocus_address addr;
uint32_t length;
} buffers[4];
int buffer_count;
uint32_t max_length;
};
#if GFX_VER >= 6
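/*
 * Gather the UBO ranges the compiler chose to push, mapping each
 * range's binding table index back to the gallium constant buffer it
 * came from.  Unbound buffers fall back to the workaround BO so the
 * hardware still reads from a valid address.
 */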
static void
setup_constant_buffers(struct crocus_context *ice,
struct crocus_batch *batch,
int stage,
struct push_bos *push_bos)
{
struct crocus_shader_state *shs = &ice->state.shaders[stage];
struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;
uint32_t push_range_sum = 0;
int n = 0;
for (int i = 0; i < 4; i++) {
const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
if (range->length == 0)
continue;
push_range_sum += range->length;
if (range->length > push_bos->max_length)
push_bos->max_length = range->length;
/* Range block is a binding table index; map it back to a UBO index. */
unsigned block_index = crocus_bti_to_group_index(
&shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
assert(block_index != CROCUS_SURFACE_NOT_USED);
struct pipe_constant_buffer *cbuf = &shs->constbufs[block_index];
struct crocus_resource *res = (void *) cbuf->buffer;
assert(cbuf->buffer_offset % 32 == 0);
push_bos->buffers[n].length = range->length;
push_bos->buffers[n].addr =
res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset)
: ro_bo(batch->ice->workaround_bo,
batch->ice->workaround_offset);
n++;
}
/* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes:
*
* "The sum of all four read length fields must be less than or
* equal to the size of 64."
*/
assert(push_range_sum <= 64);
push_bos->buffer_count = n;
}
#if GFX_VER == 7
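/*
 * Ivybridge requires a depth-stalling PIPE_CONTROL with a post-sync
 * immediate write before various VS-related state is emitted; Haswell
 * and Baytrail are exempt, so callers skip the flush on those parts.
 */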
static void
gen7_emit_vs_workaround_flush(struct crocus_batch *batch)
{
ASSERTED const struct intel_device_info *devinfo = &batch->screen->devinfo;
assert(devinfo->ver == 7);
crocus_emit_pipe_control_write(batch,
"vs workaround",
PIPE_CONTROL_WRITE_IMMEDIATE
| PIPE_CONTROL_DEPTH_STALL,
batch->ice->workaround_bo,
batch->ice->workaround_offset, 0);
}
#endif
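/*
 * Emit the 3DSTATE_CONSTANT_* packet for one stage.  A single
 * 3DSTATE_CONSTANT_VS template is packed and its subopcode patched to
 * select the stage actually being programmed.
 */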
static void
emit_push_constant_packets(struct crocus_context *ice,
struct crocus_batch *batch,
int stage,
const struct push_bos *push_bos)
{
struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
struct brw_stage_prog_data *prog_data = shader ? (void *) shader->prog_data : NULL;
#if GFX_VER == 7
if (stage == MESA_SHADER_VERTEX) {
if (GFX_VERx10 != 75 && !batch->screen->devinfo.is_baytrail)
gen7_emit_vs_workaround_flush(batch);
}
#endif
crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) {
pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
#if GFX_VER >= 7
if (prog_data) {
/* The Skylake PRM contains the following restriction:
*
* "The driver must ensure The following case does not occur
* without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
* buffer 3 read length equal to zero committed followed by a
* 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
* zero committed."
*
* To avoid this, we program the buffers in the highest slots.
* This way, slot 0 is only used if slot 3 is also used.
*/
int n = push_bos->buffer_count;
assert(n <= 4);
#if GFX_VERx10 >= 75
const unsigned shift = 4 - n;
#else
const unsigned shift = 0;
#endif
for (int i = 0; i < n; i++) {
pkt.ConstantBody.ReadLength[i + shift] =
push_bos->buffers[i].length;
pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr;
}
}
#else
if (prog_data) {
int n = push_bos->buffer_count;
assert (n <= 1);
if (n == 1) {
pkt.Buffer0Valid = true;
pkt.ConstantBody.PointertoConstantBuffer0 = push_bos->buffers[0].addr.offset;
pkt.ConstantBody.ConstantBuffer0ReadLength = push_bos->buffers[0].length - 1;
}
}
#endif
}
}
#endif
#if GFX_VER == 8
typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
#elif GFX_VER >= 6
typedef struct GENX(DEPTH_STENCIL_STATE) DEPTH_STENCIL_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE) DEPTH_STENCIL_GENXML;
#endif
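/*
 * The depth/stencil fields share names across 3DSTATE_WM_DEPTH_STENCIL
 * (Gen8), DEPTH_STENCIL_STATE (Gen6-7), and COLOR_CALC_STATE (Gen4-5),
 * so one helper can fill in whichever struct this generation uses.
 */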
static inline void
set_depth_stencil_bits(struct crocus_context *ice, DEPTH_STENCIL_GENXML *ds)
{
struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
ds->DepthTestEnable = cso->cso.depth_enabled;
ds->DepthBufferWriteEnable = cso->cso.depth_writemask;
ds->DepthTestFunction = translate_compare_func(cso->cso.depth_func);
ds->StencilFailOp = cso->cso.stencil[0].fail_op;
ds->StencilPassDepthFailOp = cso->cso.stencil[0].zfail_op;
ds->StencilPassDepthPassOp = cso->cso.stencil[0].zpass_op;
ds->StencilTestFunction = translate_compare_func(cso->cso.stencil[0].func);
ds->StencilTestMask = cso->cso.stencil[0].valuemask;
ds->StencilWriteMask = cso->cso.stencil[0].writemask;
ds->BackfaceStencilFailOp = cso->cso.stencil[1].fail_op;
ds->BackfaceStencilPassDepthFailOp = cso->cso.stencil[1].zfail_op;
ds->BackfaceStencilPassDepthPassOp = cso->cso.stencil[1].zpass_op;
ds->BackfaceStencilTestFunction = translate_compare_func(cso->cso.stencil[1].func);
ds->BackfaceStencilTestMask = cso->cso.stencil[1].valuemask;
ds->BackfaceStencilWriteMask = cso->cso.stencil[1].writemask;
ds->DoubleSidedStencilEnable = cso->cso.stencil[1].enabled;
ds->StencilTestEnable = cso->cso.stencil[0].enabled;
ds->StencilBufferWriteEnable =
cso->cso.stencil[0].writemask != 0 ||
(cso->cso.stencil[1].enabled && cso->cso.stencil[1].writemask != 0);
}
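/*
 * Pack a single VERTEX_BUFFER_STATE entry at *map and advance the map
 * pointer past it.  Gen8 programs an explicit BufferSize; Gen5-7
 * describe the buffer with an inclusive EndAddress instead.
 */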
static void
emit_vertex_buffer_state(struct crocus_batch *batch,
unsigned buffer_id,
struct crocus_bo *bo,
unsigned start_offset,
unsigned end_offset,
unsigned stride,
unsigned step_rate,
uint32_t **map)
{
const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
_crocus_pack_state(batch, GENX(VERTEX_BUFFER_STATE), *map, vb) {
vb.BufferStartingAddress = ro_bo(bo, start_offset);
#if GFX_VER >= 8
vb.BufferSize = end_offset - start_offset;
#endif
vb.VertexBufferIndex = buffer_id;
vb.BufferPitch = stride;
#if GFX_VER >= 7
vb.AddressModifyEnable = true;
#endif
#if GFX_VER >= 6
vb.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
#endif
#if GFX_VER < 8
vb.BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA;
vb.InstanceDataStepRate = step_rate;
#if GFX_VER >= 5
vb.EndAddress = ro_bo(bo, end_offset - 1);
#endif
#endif
}
*map += vb_dwords;
}
#if GFX_VER >= 6
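/*
 * Clamp the API sample mask to the bits covered by the framebuffer's
 * sample count; single-sampled targets always use a mask of 0x1.
 */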
static uint32_t
determine_sample_mask(struct crocus_context *ice)
{
uint32_t num_samples = ice->state.framebuffer.samples;
if (num_samples <= 1)
return 1;
uint32_t fb_mask = (1 << num_samples) - 1;
return ice->state.sample_mask & fb_mask;
}
#endif
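/**
 * The main render state upload loop: walk the dirty flags and re-emit
 * only the state that has actually changed since the last draw.
 */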
static void
crocus_upload_dirty_render_state(struct crocus_context *ice,
struct crocus_batch *batch,
const struct pipe_draw_info *draw)
{
uint64_t dirty = ice->state.dirty;
uint64_t stage_dirty = ice->state.stage_dirty;
if (!(dirty & CROCUS_ALL_DIRTY_FOR_RENDER) &&
!(stage_dirty & CROCUS_ALL_STAGE_DIRTY_FOR_RENDER))
return;
if (dirty & CROCUS_DIRTY_VF_STATISTICS) {
crocus_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
vf.StatisticsEnable = true;
}
}
#if GFX_VER <= 5
if (stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
CROCUS_STAGE_DIRTY_CONSTANTS_FS)) {
bool ret = calculate_curbe_offsets(batch);
if (ret) {
dirty |= CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_WM | CROCUS_DIRTY_CLIP;
stage_dirty |= CROCUS_STAGE_DIRTY_VS;
}
}
if (dirty & (CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_RASTER) ||
stage_dirty & CROCUS_STAGE_DIRTY_VS) {
bool ret = crocus_calculate_urb_fence(batch, ice->curbe.total_size,
brw_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->urb_entry_size,
((struct brw_sf_prog_data *)ice->shaders.sf_prog->prog_data)->urb_entry_size);
if (ret)
dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
}
#endif
if (dirty & CROCUS_DIRTY_CC_VIEWPORT) {
const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
uint32_t cc_vp_address;
/* XXX: could avoid streaming for depth_clip [0,1] case. */
uint32_t *cc_vp_map =
stream_state(batch,
4 * ice->state.num_viewports *
GENX(CC_VIEWPORT_length), 32, &cc_vp_address);
for (int i = 0; i < ice->state.num_viewports; i++) {
float zmin, zmax;
crocus_viewport_zmin_zmax(&ice->state.viewports[i], cso_rast->cso.clip_halfz,
ice->state.window_space_position,
&zmin, &zmax);
if (cso_rast->cso.depth_clip_near)
zmin = 0.0;
if (cso_rast->cso.depth_clip_far)
zmax = 1.0;
crocus_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) {
ccv.MinimumDepth = zmin;
ccv.MaximumDepth = zmax;
}
cc_vp_map += GENX(CC_VIEWPORT_length);
}
#if GFX_VER >= 7
crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
ptr.CCViewportPointer = cc_vp_address;
}
#elif GFX_VER == 6
crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
vp.CCViewportStateChange = 1;
vp.PointertoCC_VIEWPORT = cc_vp_address;
}
#else
ice->state.cc_vp_address = cc_vp_address;
dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
#endif
}
if (dirty & CROCUS_DIRTY_SF_CL_VIEWPORT) {
struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
#if GFX_VER >= 7
uint32_t sf_cl_vp_address;
uint32_t *vp_map =
stream_state(batch,
4 * ice->state.num_viewports *
GENX(SF_CLIP_VIEWPORT_length), 64, &sf_cl_vp_address);
#else
uint32_t *vp_map =
stream_state(batch,
4 * ice->state.num_viewports * GENX(SF_VIEWPORT_length),
32, &ice->state.sf_vp_address);
uint32_t *clip_map =
stream_state(batch,
4 * ice->state.num_viewports * GENX(CLIP_VIEWPORT_length),
32, &ice->state.clip_vp_address);
#endif
for (unsigned i = 0; i < ice->state.num_viewports; i++) {
const struct pipe_viewport_state *state = &ice->state.viewports[i];
float gb_xmin, gb_xmax, gb_ymin, gb_ymax;
#if GFX_VER == 8
float vp_xmin = viewport_extent(state, 0, -1.0f);
float vp_xmax = viewport_extent(state, 0, 1.0f);
float vp_ymin = viewport_extent(state, 1, -1.0f);
float vp_ymax = viewport_extent(state, 1, 1.0f);
#endif
intel_calculate_guardband_size(cso_fb->width, cso_fb->height,
state->scale[0], state->scale[1],
state->translate[0], state->translate[1],
&gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
#if GFX_VER >= 7
crocus_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp)
#else
crocus_pack_state(GENX(SF_VIEWPORT), vp_map, vp)
#endif
{
vp.ViewportMatrixElementm00 = state->scale[0];
vp.ViewportMatrixElementm11 = state->scale[1];
vp.ViewportMatrixElementm22 = state->scale[2];
vp.ViewportMatrixElementm30 = state->translate[0];
vp.ViewportMatrixElementm31 = state->translate[1];
vp.ViewportMatrixElementm32 = state->translate[2];
#if GFX_VER < 6
struct pipe_scissor_state scissor;
crocus_fill_scissor_rect(ice, 0, &scissor);
vp.ScissorRectangle.ScissorRectangleXMin = scissor.minx;
vp.ScissorRectangle.ScissorRectangleXMax = scissor.maxx;
vp.ScissorRectangle.ScissorRectangleYMin = scissor.miny;
vp.ScissorRectangle.ScissorRectangleYMax = scissor.maxy;
#endif
#if GFX_VER >= 7
vp.XMinClipGuardband = gb_xmin;
vp.XMaxClipGuardband = gb_xmax;
vp.YMinClipGuardband = gb_ymin;
vp.YMaxClipGuardband = gb_ymax;
#endif
#if GFX_VER == 8
vp.XMinViewPort = MAX2(vp_xmin, 0);
vp.XMaxViewPort = MIN2(vp_xmax, cso_fb->width) - 1;
vp.YMinViewPort = MAX2(vp_ymin, 0);
vp.YMaxViewPort = MIN2(vp_ymax, cso_fb->height) - 1;
#endif
}
#if GFX_VER < 7
crocus_pack_state(GENX(CLIP_VIEWPORT), clip_map, clip) {
clip.XMinClipGuardband = gb_xmin;
clip.XMaxClipGuardband = gb_xmax;
clip.YMinClipGuardband = gb_ymin;
clip.YMaxClipGuardband = gb_ymax;
}
#endif
#if GFX_VER >= 7
vp_map += GENX(SF_CLIP_VIEWPORT_length);
#else
vp_map += GENX(SF_VIEWPORT_length);
clip_map += GENX(CLIP_VIEWPORT_length);
#endif
}
#if GFX_VER >= 7
crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
ptr.SFClipViewportPointer = sf_cl_vp_address;
}
#elif GFX_VER == 6
crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
vp.SFViewportStateChange = 1;
vp.CLIPViewportStateChange = 1;
vp.PointertoCLIP_VIEWPORT = ice->state.clip_vp_address;
vp.PointertoSF_VIEWPORT = ice->state.sf_vp_address;
}
#endif
}
#if GFX_VER >= 6
if (dirty & CROCUS_DIRTY_GEN6_URB) {
#if GFX_VER == 6
bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL
|| ice->shaders.ff_gs_prog;
struct brw_vue_prog_data *vue_prog_data =
(void *) ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
const unsigned vs_size = vue_prog_data->urb_entry_size;
unsigned gs_size = vs_size;
if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
struct brw_vue_prog_data *gs_vue_prog_data =
(void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
gs_size = gs_vue_prog_data->urb_entry_size;
}
genX(crocus_upload_urb)(batch, vs_size, gs_present, gs_size);
#endif
#if GFX_VER >= 7
const struct intel_device_info *devinfo = &batch->screen->devinfo;
bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL;
bool tess_present = ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL;
unsigned entry_size[4];
for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
if (!ice->shaders.prog[i]) {
entry_size[i] = 1;
} else {
struct brw_vue_prog_data *vue_prog_data =
(void *) ice->shaders.prog[i]->prog_data;
entry_size[i] = vue_prog_data->urb_entry_size;
}
assert(entry_size[i] != 0);
}
/* If we're just switching between programs with the same URB requirements,
* skip the rest of the logic.
*/
bool no_change = false;
if (ice->urb.vsize == entry_size[MESA_SHADER_VERTEX] &&
ice->urb.gs_present == gs_present &&
ice->urb.gsize == entry_size[MESA_SHADER_GEOMETRY] &&
ice->urb.tess_present == tess_present &&
ice->urb.hsize == entry_size[MESA_SHADER_TESS_CTRL] &&
ice->urb.dsize == entry_size[MESA_SHADER_TESS_EVAL]) {
no_change = true;
}
if (!no_change) {
ice->urb.vsize = entry_size[MESA_SHADER_VERTEX];
ice->urb.gs_present = gs_present;
ice->urb.gsize = entry_size[MESA_SHADER_GEOMETRY];
ice->urb.tess_present = tess_present;
ice->urb.hsize = entry_size[MESA_SHADER_TESS_CTRL];
ice->urb.dsize = entry_size[MESA_SHADER_TESS_EVAL];
unsigned entries[4];
unsigned start[4];
bool constrained;
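/* Partition the URB space among the active VS/HS/DS/GS stages. */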
intel_get_urb_config(devinfo,
batch->screen->l3_config_3d,
tess_present,
gs_present,
entry_size,
entries, start, NULL, &constrained);
#if GFX_VER == 7
if (GFX_VERx10 < 75 && !devinfo->is_baytrail)
gen7_emit_vs_workaround_flush(batch);
#endif
for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
crocus_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
urb._3DCommandSubOpcode += i;
urb.VSURBStartingAddress = start[i];
urb.VSURBEntryAllocationSize = entry_size[i] - 1;
urb.VSNumberofURBEntries = entries[i];
}
}
}
#endif
}
if (dirty & CROCUS_DIRTY_GEN6_BLEND_STATE) {
struct crocus_blend_state *cso_blend = ice->state.cso_blend;
struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
STATIC_ASSERT(GENX(BLEND_STATE_ENTRY_length) == 2);
int rt_dwords =
MAX2(cso_fb->nr_cbufs, 1) * GENX(BLEND_STATE_ENTRY_length);
#if GFX_VER >= 8
rt_dwords += GENX(BLEND_STATE_length);
#endif
uint32_t blend_offset;
uint32_t *blend_map =
stream_state(batch,
4 * rt_dwords, 64, &blend_offset);
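/* Gen8 lays out a shared BLEND_STATE header followed by per-RT
 * entries; older generations have only the per-RT entries, so "be"
 * below aliases the entry itself.
 */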
#if GFX_VER >= 8
struct GENX(BLEND_STATE) be = { 0 };
{
#else
for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
#define be entry
#endif
be.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
be.AlphaTestFunction = translate_compare_func(cso_zsa->cso.alpha_func);
be.AlphaToCoverageEnable = cso_blend->cso.alpha_to_coverage;
be.AlphaToOneEnable = cso_blend->cso.alpha_to_one;
be.AlphaToCoverageDitherEnable = GFX_VER >= 7 && cso_blend->cso.alpha_to_coverage;
be.ColorDitherEnable = cso_blend->cso.dither;
#if GFX_VER >= 8
for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
#else
{
#endif
const struct pipe_rt_blend_state *rt =
&cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? i : 0];
be.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &entry, cso_blend, i) ||
be.IndependentAlphaBlendEnable;
if (GFX_VER >= 8 || can_emit_logic_op(ice)) {
entry.LogicOpEnable = cso_blend->cso.logicop_enable;
entry.LogicOpFunction = cso_blend->cso.logicop_func;
}
entry.ColorClampRange = COLORCLAMP_RTFORMAT;
entry.PreBlendColorClampEnable = true;
entry.PostBlendColorClampEnable = true;
entry.WriteDisableRed = !(rt->colormask & PIPE_MASK_R);
entry.WriteDisableGreen = !(rt->colormask & PIPE_MASK_G);
entry.WriteDisableBlue = !(rt->colormask & PIPE_MASK_B);
entry.WriteDisableAlpha = !(rt->colormask & PIPE_MASK_A);
#if GFX_VER >= 8
GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
#else
GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry);
#endif
}
}
#if GFX_VER >= 8
GENX(BLEND_STATE_pack)(NULL, blend_map, &be);
#endif
#if GFX_VER < 7
crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
ptr.PointertoBLEND_STATE = blend_offset;
ptr.BLEND_STATEChange = true;
}
#else
crocus_emit_cmd(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
ptr.BlendStatePointer = blend_offset;
#if GFX_VER >= 8
ptr.BlendStatePointerValid = true;
#endif
}
#endif
}
#endif
if (dirty & CROCUS_DIRTY_COLOR_CALC_STATE) {
struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
UNUSED struct crocus_blend_state *cso_blend = ice->state.cso_blend;
struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
uint32_t cc_offset;
void *cc_map =
stream_state(batch,
sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
64, &cc_offset);
#if GFX_VER <= 5
dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
#endif
_crocus_pack_state(batch, GENX(COLOR_CALC_STATE), cc_map, cc) {
cc.AlphaTestFormat = ALPHATEST_FLOAT32;
cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;
#if GFX_VER <= 5
set_depth_stencil_bits(ice, &cc);
if (cso_blend->cso.logicop_enable) {
if (can_emit_logic_op(ice)) {
cc.LogicOpEnable = cso_blend->cso.logicop_enable;
cc.LogicOpFunction = cso_blend->cso.logicop_func;
}
}
cc.ColorDitherEnable = cso_blend->cso.dither;
cc.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &cc, cso_blend, 0);
if (cso->cso.alpha_enabled && ice->state.framebuffer.nr_cbufs <= 1) {
cc.AlphaTestEnable = cso->cso.alpha_enabled;
cc.AlphaTestFunction = translate_compare_func(cso->cso.alpha_func);
}
cc.StatisticsEnable = ice->state.stats_wm ? 1 : 0;
cc.CCViewportStatePointer = ro_bo(batch->state.bo, ice->state.cc_vp_address);
#else
cc.BlendConstantColorRed = ice->state.blend_color.color[0];
cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
cc.BlendConstantColorBlue = ice->state.blend_color.color[2];
cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
#endif
cc.StencilReferenceValue = p_stencil_refs->ref_value[0];
cc.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
}
ice->shaders.cc_offset = cc_offset;
#if GFX_VER >= 6
crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
ptr.ColorCalcStatePointer = cc_offset;
#if GFX_VER != 7
ptr.ColorCalcStatePointerValid = true;
#endif
}
#endif
}
#if GFX_VER <= 5
if (dirty & CROCUS_DIRTY_GEN4_CONSTANT_COLOR) {
crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
blend_cc.BlendConstantColorRed = ice->state.blend_color.color[0];
blend_cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
blend_cc.BlendConstantColorBlue = ice->state.blend_color.color[2];
blend_cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
}
}
#endif
for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
if (!(stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage)))
continue;
struct crocus_shader_state *shs = &ice->state.shaders[stage];
struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
if (!shader)
continue;
if (shs->sysvals_need_upload)
upload_sysvals(ice, stage);
#if GFX_VER <= 5
dirty |= CROCUS_DIRTY_GEN4_CURBE;
#endif
#if GFX_VER >= 7
struct push_bos push_bos = {};
setup_constant_buffers(ice, batch, stage, &push_bos);
emit_push_constant_packets(ice, batch, stage, &push_bos);
#endif
}
for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
if (stage_dirty & (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage)) {
if (ice->shaders.prog[stage]) {
#if GFX_VER <= 6
dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
#endif
crocus_populate_binding_table(ice, batch, stage, false);
ice->shaders.prog[stage]->bind_bo_offset =
crocus_upload_binding_table(ice, batch,
ice->shaders.prog[stage]->surf_offset,
ice->shaders.prog[stage]->bt.size_bytes);
#if GFX_VER >= 7
crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {
ptr._3DCommandSubOpcode = 38 + stage;
ptr.PointertoVSBindingTable = ice->shaders.prog[stage]->bind_bo_offset;
}
#endif
#if GFX_VER == 6
} else if (stage == MESA_SHADER_GEOMETRY && ice->shaders.ff_gs_prog) {
dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
crocus_populate_binding_table(ice, batch, stage, true);
ice->shaders.ff_gs_prog->bind_bo_offset =
crocus_upload_binding_table(ice, batch,
ice->shaders.ff_gs_prog->surf_offset,
ice->shaders.ff_gs_prog->bt.size_bytes);
#endif
}
}
}
#if GFX_VER <= 6
if (dirty & CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS) {
struct crocus_compiled_shader *gs = ice->shaders.prog[MESA_SHADER_GEOMETRY];
if (gs == NULL)
gs = ice->shaders.ff_gs_prog;
crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), ptr) {
ptr.PointertoVSBindingTable = ice->shaders.prog[MESA_SHADER_VERTEX]->bind_bo_offset;
ptr.PointertoPSBindingTable = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bind_bo_offset;
#if GFX_VER == 6
ptr.VSBindingTableChange = true;
ptr.PSBindingTableChange = true;
ptr.GSBindingTableChange = gs ? true : false;
ptr.PointertoGSBindingTable = gs ? gs->bind_bo_offset : 0;
#endif
}
}
#endif
bool sampler_updates = dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
if (!(stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) ||
!ice->shaders.prog[stage])
continue;
crocus_upload_sampler_states(ice, batch, stage);
sampler_updates = true;
#if GFX_VER >= 7
struct crocus_shader_state *shs = &ice->state.shaders[stage];
crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
ptr._3DCommandSubOpcode = 43 + stage;
ptr.PointertoVSSamplerState = shs->sampler_offset;
}
#endif
}
if (sampler_updates) {
#if GFX_VER == 6
struct crocus_shader_state *shs_vs = &ice->state.shaders[MESA_SHADER_VERTEX];
struct crocus_shader_state *shs_gs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
struct crocus_shader_state *shs_fs = &ice->state.shaders[MESA_SHADER_FRAGMENT];
crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ptr) {
if (ice->shaders.prog[MESA_SHADER_VERTEX] &&
(dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_VERTEX))) {
ptr.VSSamplerStateChange = true;
ptr.PointertoVSSamplerState = shs_vs->sampler_offset;
}
if (ice->shaders.prog[MESA_SHADER_GEOMETRY] &&
(dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_GEOMETRY))) {
ptr.GSSamplerStateChange = true;
ptr.PointertoGSSamplerState = shs_gs->sampler_offset;
}
if (ice->shaders.prog[MESA_SHADER_FRAGMENT] &&
(dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_FRAGMENT))) {
ptr.PSSamplerStateChange = true;
ptr.PointertoPSSamplerState = shs_fs->sampler_offset;
}
}
#endif
}
#if GFX_VER >= 6
if (dirty & CROCUS_DIRTY_GEN6_MULTISAMPLE) {
crocus_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
ms.PixelLocation =
ice->state.cso_rast->cso.half_pixel_center ? CENTER : UL_CORNER;
if (ice->state.framebuffer.samples > 0)
ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
#if GFX_VER == 6
INTEL_SAMPLE_POS_4X(ms.Sample);
#elif GFX_VER == 7
switch (ice->state.framebuffer.samples) {
case 1:
INTEL_SAMPLE_POS_1X(ms.Sample);
break;
case 2:
INTEL_SAMPLE_POS_2X(ms.Sample);
break;
case 4:
INTEL_SAMPLE_POS_4X(ms.Sample);
break;
case 8:
INTEL_SAMPLE_POS_8X(ms.Sample);
break;
default:
break;
}
#endif
}
}
if (dirty & CROCUS_DIRTY_GEN6_SAMPLE_MASK) {
crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
ms.SampleMask = determine_sample_mask(ice);
}
}
#endif
#if GFX_VER >= 7
struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
if ((stage_dirty & CROCUS_STAGE_DIRTY_FS) && shader) {
struct brw_stage_prog_data *prog_data = shader->prog_data;
struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
crocus_emit_cmd(batch, GENX(3DSTATE_PS), ps) {
/* Initialize the execution mask with VMask. Otherwise, derivatives are
* incorrect for subspans where some of the pixels are unlit. We believe
* the bit just didn't take effect in previous generations.
*/
ps.VectorMaskEnable = GFX_VER >= 8;
ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
ps._32PixelDispatchEnable = wm_prog_data->dispatch_32;
ps.DispatchGRFStartRegisterForConstantSetupData0 =
brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
ps.DispatchGRFStartRegisterForConstantSetupData1 =
brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
ps.DispatchGRFStartRegisterForConstantSetupData2 =
brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
ps.KernelStartPointer0 = KSP(ice, shader) +
brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
ps.KernelStartPointer1 = KSP(ice, shader) +
brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
ps.KernelStartPointer2 = KSP(ice, shader) +
brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
#if GFX_VERx10 == 75
ps.SampleMask = determine_sample_mask(ice);
#endif
// XXX: WABTPPrefetchDisable, see above, drop at C0
ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
ps.FloatingPointMode = prog_data->use_alt_mode;
#if GFX_VER >= 8
ps.MaximumNumberofThreadsPerPSD = 64 - 2;
#else
ps.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
#endif
ps.PushConstantEnable = prog_data->ubo_ranges[0].length > 0;
#if GFX_VER < 8
ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
ps.DualSourceBlendEnable = wm_prog_data->dual_src_blend && ice->state.cso_blend->dual_color_blending;
ps.AttributeEnable = (wm_prog_data->num_varying_inputs != 0);
#endif
/* From the documentation for this packet:
* "If the PS kernel does not need the Position XY Offsets to
* compute a Position Value, then this field should be programmed
* to POSOFFSET_NONE."
*
* "SW Recommendation: If the PS kernel needs the Position Offsets
* to compute a Position XY value, this field should match Position
* ZW Interpolation Mode to ensure a consistent position.xyzw
* computation."
*
* We only require XY sample offsets, so this recommendation doesn't
* look useful at the moment; we might need it in the future.
*/
ps.PositionXYOffsetSelect =
wm_prog_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;
if (wm_prog_data->base.total_scratch) {
struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch, MESA_SHADER_FRAGMENT);
ps.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
ps.ScratchSpaceBasePointer = rw_bo(bo, 0);
}
}
#if GFX_VER == 8
const struct shader_info *fs_info =
crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
crocus_emit_cmd(batch, GENX(3DSTATE_PS_EXTRA), psx) {
psx.PixelShaderValid = true;
psx.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
psx.PixelShaderKillsPixel = wm_prog_data->uses_kill;
psx.AttributeEnable = wm_prog_data->num_varying_inputs != 0;
psx.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
psx.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
psx.PixelShaderIsPerSample = wm_prog_data->persample_dispatch;
/* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */
if (wm_prog_data->uses_sample_mask)
psx.PixelShaderUsesInputCoverageMask = true;
psx.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
/* The stricter cross-primitive coherency guarantees that the hardware
* gives us with the "Accesses UAV" bit set for at least one shader stage
* and the "UAV coherency required" bit set on the 3DPRIMITIVE command
* are redundant within the current image, atomic counter and SSBO GL
* APIs, which all have very loose ordering and coherency requirements
* and generally rely on the application to insert explicit barriers when
* a shader invocation is expected to see the memory writes performed by
* the invocations of some previous primitive. Regardless of the value
* of "UAV coherency required", the "Accesses UAV" bits will implicitly
* cause an in most cases useless DC flush when the lowermost stage with
* the bit set finishes execution.
*
* It would be nice to disable it, but in some cases we can't because on
* Gfx8+ it also has an influence on rasterization via the PS UAV-only
* signal (which could be set independently from the coherency mechanism
* in the 3DSTATE_WM command on Gfx7), and because in some cases it will
* determine whether the hardware skips execution of the fragment shader
* or not via the ThreadDispatchEnable signal. However if we know that
* GFX8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
* GFX8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
* difference so we may just disable it here.
*
* Gfx8 hardware tries to compute ThreadDispatchEnable for us but doesn't
* take into account KillPixels when no depth or stencil writes are
* enabled. In order for occlusion queries to work correctly with no
* attachments, we need to force-enable here.
*/
if ((wm_prog_data->has_side_effects || wm_prog_data->uses_kill) &&
!(has_writeable_rt(ice->state.cso_blend, fs_info)))
psx.PixelShaderHasUAV = true;
}
#endif
}
#endif
#if GFX_VER >= 7
if (ice->state.streamout_active) {
if (dirty & CROCUS_DIRTY_GEN7_SO_BUFFERS) {
for (int i = 0; i < 4; i++) {
struct crocus_stream_output_target *tgt =
(void *) ice->state.so_target[i];
if (!tgt) {
crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
sob.SOBufferIndex = i;
}
continue;
}
struct crocus_resource *res = (void *) tgt->base.buffer;
uint32_t start = tgt->base.buffer_offset;
#if GFX_VER < 8
uint32_t end = ALIGN(start + tgt->base.buffer_size, 4);
#endif
crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
sob.SOBufferIndex = i;
sob.SurfaceBaseAddress = rw_bo(res->bo, start);
#if GFX_VER < 8
sob.SurfacePitch = tgt->stride;
sob.SurfaceEndAddress = rw_bo(res->bo, end);
#else
sob.SOBufferEnable = true;
sob.StreamOffsetWriteEnable = true;
sob.StreamOutputBufferOffsetAddressEnable = true;
sob.MOCS = crocus_mocs(res->bo, &batch->screen->isl_dev);
sob.SurfaceSize = MAX2(tgt->base.buffer_size / 4, 1) - 1;
sob.StreamOutputBufferOffsetAddress =
rw_bo(crocus_resource_bo(&tgt->offset_res->base.b), tgt->offset_offset);
if (tgt->zero_offset) {
sob.StreamOffset = 0;
tgt->zero_offset = false;
} else
sob.StreamOffset = 0xFFFFFFFF; /* reload the saved offset from memory, see above */
#endif
}
}
}
if ((dirty & CROCUS_DIRTY_SO_DECL_LIST) && ice->state.streamout) {
uint32_t *decl_list =
ice->state.streamout + GENX(3DSTATE_STREAMOUT_length);
crocus_batch_emit(batch, decl_list, 4 * ((decl_list[0] & 0xff) + 2));
}
if (dirty & CROCUS_DIRTY_STREAMOUT) {
const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
uint32_t dynamic_sol[GENX(3DSTATE_STREAMOUT_length)];
crocus_pack_command(GENX(3DSTATE_STREAMOUT), dynamic_sol, sol) {
sol.SOFunctionEnable = true;
sol.SOStatisticsEnable = true;
sol.RenderingDisable = cso_rast->cso.rasterizer_discard &&
!ice->state.prims_generated_query_active;
sol.ReorderMode = cso_rast->cso.flatshade_first ? LEADING : TRAILING;
}
assert(ice->state.streamout);
crocus_emit_merge(batch, ice->state.streamout, dynamic_sol,
GENX(3DSTATE_STREAMOUT_length));
}
} else {
if (dirty & CROCUS_DIRTY_STREAMOUT) {
crocus_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
}
}
#endif
#if GFX_VER == 6
if (ice->state.streamout_active) {
if (dirty & CROCUS_DIRTY_GEN6_SVBI) {
crocus_emit_so_svbi(ice);
}
}
#endif
if (dirty & CROCUS_DIRTY_CLIP) {
#if GFX_VER < 6
const struct brw_clip_prog_data *clip_prog_data = (struct brw_clip_prog_data *)ice->shaders.clip_prog->prog_data;
struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
uint32_t *clip_ptr = stream_state(batch, GENX(CLIP_STATE_length) * 4, 32, &ice->shaders.clip_offset);
dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
_crocus_pack_state(batch, GENX(CLIP_STATE), clip_ptr, clip) {
clip.KernelStartPointer = KSP(ice, ice->shaders.clip_prog);
clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
clip.SingleProgramFlow = true;
clip.GRFRegisterCount = DIV_ROUND_UP(clip_prog_data->total_grf, 16) - 1;
clip.VertexURBEntryReadLength = clip_prog_data->urb_read_length;
clip.ConstantURBEntryReadLength = clip_prog_data->curb_read_length;
clip.DispatchGRFStartRegisterForURBData = 1;
clip.VertexURBEntryReadOffset = 0;
clip.ConstantURBEntryReadOffset = ice->curbe.clip_start * 2;
clip.NumberofURBEntries = batch->ice->urb.nr_clip_entries;
clip.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
if (batch->ice->urb.nr_clip_entries >= 10) {
/* Half of the URB entries go to each thread, and it has to be an
* even number.
*/
assert(batch->ice->urb.nr_clip_entries % 2 == 0);
/* Although up to 16 concurrent Clip threads are allowed on Ironlake,
* only 2 threads can output VUEs at a time.
*/
clip.MaximumNumberofThreads = (GFX_VER == 5 ? 16 : 2) - 1;
} else {
assert(batch->ice->urb.nr_clip_entries >= 5);
clip.MaximumNumberofThreads = 1 - 1;
}
clip.VertexPositionSpace = VPOS_NDCSPACE;
clip.UserClipFlagsMustClipEnable = true;
clip.GuardbandClipTestEnable = true;
clip.ClipperViewportStatePointer = ro_bo(batch->state.bo, ice->state.clip_vp_address);
clip.ScreenSpaceViewportXMin = -1.0;
clip.ScreenSpaceViewportXMax = 1.0;
clip.ScreenSpaceViewportYMin = -1.0;
clip.ScreenSpaceViewportYMax = 1.0;
clip.ViewportXYClipTestEnable = true;
clip.ViewportZClipTestEnable = (cso_state->depth_clip_near || cso_state->depth_clip_far);
#if GFX_VER == 5 || GFX_VERx10 == 45
clip.UserClipDistanceClipTestEnableBitmask = cso_state->clip_plane_enable;
#else
/* Up to 6 actual clip flags, plus the 7th for the negative RHW
* workaround.
*/
clip.UserClipDistanceClipTestEnableBitmask = (cso_state->clip_plane_enable & 0x3f) | 0x40;
#endif
clip.APIMode = cso_state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
clip.ClipMode = clip_prog_data->clip_mode;
#if GFX_VERx10 == 45
clip.NegativeWClipTestEnable = true;
#endif
}
#else //if GFX_VER >= 6
struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data );
struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
bool gs_or_tes = ice->shaders.prog[MESA_SHADER_GEOMETRY] ||
ice->shaders.prog[MESA_SHADER_TESS_EVAL];
bool points_or_lines = cso_rast->fill_mode_point_or_line ||
(gs_or_tes ? ice->shaders.output_topology_is_points_or_lines
: ice->state.prim_is_points_or_lines);
uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];
crocus_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {
cl.StatisticsEnable = ice->state.statistics_counters_enabled;
if (cso_rast->cso.rasterizer_discard)
cl.ClipMode = CLIPMODE_REJECT_ALL;
else if (ice->state.window_space_position)
cl.ClipMode = CLIPMODE_ACCEPT_ALL;
else
cl.ClipMode = CLIPMODE_NORMAL;
cl.PerspectiveDivideDisable = ice->state.window_space_position;
cl.ViewportXYClipTestEnable = !points_or_lines;
cl.UserClipDistanceCullTestEnableBitmask =
brw_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->cull_distance_mask;
if (wm_prog_data->barycentric_interp_modes &
BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
cl.NonPerspectiveBarycentricEnable = true;
cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1;
cl.MaximumVPIndex = ice->state.num_viewports - 1;
}
crocus_emit_merge(batch, cso_rast->clip, dynamic_clip,
ARRAY_SIZE(cso_rast->clip));
#endif
}
if (stage_dirty & CROCUS_STAGE_DIRTY_VS) {
struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_VERTEX];
const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
const struct brw_stage_prog_data *prog_data = &vue_prog_data->base;
#if GFX_VER == 7
if (batch->screen->devinfo.is_ivybridge)
gen7_emit_vs_workaround_flush(batch);
#endif
#if GFX_VER == 6
struct push_bos push_bos = {};
setup_constant_buffers(ice, batch, MESA_SHADER_VERTEX, &push_bos);
emit_push_constant_packets(ice, batch, MESA_SHADER_VERTEX, &push_bos);
#endif
#if GFX_VER >= 6
crocus_emit_cmd(batch, GENX(3DSTATE_VS), vs)
#else
uint32_t *vs_ptr = stream_state(batch,
GENX(VS_STATE_length) * 4, 32, &ice->shaders.vs_offset);
dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
_crocus_pack_state(batch, GENX(VS_STATE), vs_ptr, vs)
#endif
{
INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);
vs.MaximumNumberofThreads = batch->screen->devinfo.max_vs_threads - 1;
#if GFX_VER < 6
vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
vs.ConstantURBEntryReadLength = vue_prog_data->base.curb_read_length;
vs.ConstantURBEntryReadOffset = ice->curbe.vs_start * 2;
vs.NumberofURBEntries = batch->ice->urb.nr_vs_entries >> (GFX_VER == 5 ? 2 : 0);
vs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
vs.MaximumNumberofThreads =
CLAMP(batch->ice->urb.nr_vs_entries / 2, 1, batch->screen->devinfo.max_vs_threads) - 1;
vs.StatisticsEnable = false;
vs.SamplerStatePointer = ro_bo(batch->state.bo, ice->state.shaders[MESA_SHADER_VERTEX].sampler_offset);
#endif
#if GFX_VER == 5
/* Force single program flow on Ironlake. We cannot reliably get
* all applications working without it. See:
* https://bugs.freedesktop.org/show_bug.cgi?id=29172
*
* The most notable and reliably failing application is the Humus
* demo "CelShading"
*/
vs.SingleProgramFlow = true;
vs.SamplerCount = 0; /* hardware requirement */
#endif
#if GFX_VER >= 8
vs.SIMD8DispatchEnable =
vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;
vs.UserClipDistanceCullTestEnableBitmask =
vue_prog_data->cull_distance_mask;
#endif
}
#if GFX_VER == 6
crocus_emit_pipe_control_flush(batch,
"post VS const",
PIPE_CONTROL_DEPTH_STALL |
PIPE_CONTROL_INSTRUCTION_INVALIDATE |
PIPE_CONTROL_STATE_CACHE_INVALIDATE);
#endif
}
if (stage_dirty & CROCUS_STAGE_DIRTY_GS) {
struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_GEOMETRY];
bool active = GFX_VER >= 6 && shader;
#if GFX_VER == 6
struct push_bos push_bos = {};
if (shader)
setup_constant_buffers(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
emit_push_constant_packets(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
#endif
#if GFX_VER >= 6
crocus_emit_cmd(batch, GENX(3DSTATE_GS), gs)
#else
uint32_t *gs_ptr = stream_state(batch,
GENX(GS_STATE_length) * 4, 32, &ice->shaders.gs_offset);
dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
_crocus_pack_state(batch, GENX(GS_STATE), gs_ptr, gs)
#endif
{
#if GFX_VER >= 6
if (active) {
const struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(shader->prog_data);
const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
const struct brw_stage_prog_data *prog_data = &gs_prog_data->base.base;
INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);
#if GFX_VER >= 7
gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
gs.OutputTopology = gs_prog_data->output_topology;
gs.ControlDataHeaderSize =
gs_prog_data->control_data_header_size_hwords;
gs.InstanceControl = gs_prog_data->invocations - 1;
gs.DispatchMode = vue_prog_data->dispatch_mode;
gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
gs.ControlDataFormat = gs_prog_data->control_data_format;
#endif
/* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
* Ivy Bridge and Haswell.
*
* On Ivy Bridge, setting this bit causes the vertices of a triangle
* strip to be delivered to the geometry shader in an order that does
* not strictly follow the OpenGL spec, but preserves triangle
* orientation. For example, if the vertices are (1, 2, 3, 4, 5), then
* the geometry shader sees triangles:
*
* (1, 2, 3), (2, 4, 3), (3, 4, 5)
*
* (Clearing the bit is even worse, because it fails to preserve
* orientation).
*
* Triangle strips with adjacency are always ordered in a way that
* preserves triangle orientation but does not strictly follow the
* OpenGL spec, regardless of the setting of this bit.
*
* On Haswell, both triangle strips and triangle strips with adjacency
* are always ordered in a way that preserves triangle orientation.
* Setting this bit causes the ordering to strictly follow the OpenGL
* spec.
*
* So in either case we want to set the bit. Unfortunately on Ivy
* Bridge this will get the order close to correct but not perfect.
*/
gs.ReorderMode = TRAILING;
gs.MaximumNumberofThreads =
GFX_VER == 8 ? (batch->screen->devinfo.max_gs_threads / 2 - 1) :
(batch->screen->devinfo.max_gs_threads - 1);
#if GFX_VER < 7
gs.SOStatisticsEnable = true;
if (gs_prog_data->num_transform_feedback_bindings)
gs.SVBIPayloadEnable = ice->state.streamout_active;
/* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled, as was
* previously done for gen6.
*
* TODO: test with both disabled to see if the HW is behaving
* as expected, like in gen7.
*/
gs.SingleProgramFlow = true;
gs.VectorMaskEnable = true;
#endif
#if GFX_VER >= 8
gs.ExpectedVertexCount = gs_prog_data->vertices_in;
if (gs_prog_data->static_vertex_count != -1) {
gs.StaticOutput = true;
gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
}
gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;
gs.UserClipDistanceCullTestEnableBitmask =
vue_prog_data->cull_distance_mask;
const int urb_entry_write_offset = 1;
const uint32_t urb_entry_output_length =
DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
urb_entry_write_offset;
gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
#endif
}
#endif
#if GFX_VER <= 6
if (!active && ice->shaders.ff_gs_prog) {
const struct brw_ff_gs_prog_data *gs_prog_data = (struct brw_ff_gs_prog_data *)ice->shaders.ff_gs_prog->prog_data;
/* In gen6, transform feedback for the VS stage is done with an
* ad-hoc GS program.  These fields provide the 3DSTATE_GS setup
* needed for it.
*/
gs.KernelStartPointer = KSP(ice, ice->shaders.ff_gs_prog);
gs.SingleProgramFlow = true;
gs.DispatchGRFStartRegisterForURBData = GFX_VER == 6 ? 2 : 1;
gs.VertexURBEntryReadLength = gs_prog_data->urb_read_length;
#if GFX_VER <= 5
gs.GRFRegisterCount =
DIV_ROUND_UP(gs_prog_data->total_grf, 16) - 1;
/* BRW_NEW_URB_FENCE */
gs.NumberofURBEntries = batch->ice->urb.nr_gs_entries;
gs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
gs.MaximumNumberofThreads = batch->ice->urb.nr_gs_entries >= 8 ? 1 : 0;
gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
#else
gs.Enable = true;
gs.VectorMaskEnable = true;
gs.SVBIPayloadEnable = true;
gs.SVBIPostIncrementEnable = true;
gs.SVBIPostIncrementValue = gs_prog_data->svbi_postincrement_value;
gs.SOStatisticsEnable = true;
gs.MaximumNumberofThreads = batch->screen->devinfo.max_gs_threads - 1;
#endif
}
#endif
if (!active && !ice->shaders.ff_gs_prog) {
#if GFX_VER < 8
gs.DispatchGRFStartRegisterForURBData = 1;
#if GFX_VER >= 7
gs.IncludeVertexHandles = true;
#endif
#endif
}
#if GFX_VER >= 6
gs.StatisticsEnable = true;
#endif
#if GFX_VER == 5 || GFX_VER == 6
gs.RenderingEnabled = true;
#endif
#if GFX_VER <= 5
gs.MaximumVPIndex = ice->state.num_viewports - 1;
#endif
}
}
#if GFX_VER >= 7
if (stage_dirty & CROCUS_STAGE_DIRTY_TCS) {
struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_CTRL];
if (shader) {
const struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(shader->prog_data);
const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
const struct brw_stage_prog_data *prog_data = &tcs_prog_data->base.base;
crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs) {
INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);
hs.InstanceCount = tcs_prog_data->instances - 1;
hs.IncludeVertexHandles = true;
hs.MaximumNumberofThreads = batch->screen->devinfo.max_tcs_threads - 1;
}
} else {
crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs);
}
}
if (stage_dirty & CROCUS_STAGE_DIRTY_TES) {
struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_EVAL];
if (shader) {
const struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(shader->prog_data);
const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
const struct brw_stage_prog_data *prog_data = &tes_prog_data->base.base;
crocus_emit_cmd(batch, GENX(3DSTATE_TE), te) {
te.Partitioning = tes_prog_data->partitioning;
te.OutputTopology = tes_prog_data->output_topology;
te.TEDomain = tes_prog_data->domain;
te.TEEnable = true;
te.MaximumTessellationFactorOdd = 63.0;
te.MaximumTessellationFactorNotOdd = 64.0;
};
crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds) {
INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);
ds.MaximumNumberofThreads = batch->screen->devinfo.max_tes_threads - 1;
ds.ComputeWCoordinateEnable =
tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
#if GFX_VER >= 8
if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8)
ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
ds.UserClipDistanceCullTestEnableBitmask =
vue_prog_data->cull_distance_mask;
#endif
};
} else {
crocus_emit_cmd(batch, GENX(3DSTATE_TE), te);
crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds);
}
}
#endif
if (dirty & CROCUS_DIRTY_RASTER) {
#if GFX_VER < 6
const struct brw_sf_prog_data *sf_prog_data = (struct brw_sf_prog_data *)ice->shaders.sf_prog->prog_data;
struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
uint32_t *sf_ptr = stream_state(batch,
GENX(SF_STATE_length) * 4, 32, &ice->shaders.sf_offset);
dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
_crocus_pack_state(batch, GENX(SF_STATE), sf_ptr, sf) {
sf.KernelStartPointer = KSP(ice, ice->shaders.sf_prog);
sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
sf.DispatchGRFStartRegisterForURBData = 3;
sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
sf.URBEntryAllocationSize = batch->ice->urb.sfsize - 1;
sf.NumberofURBEntries = batch->ice->urb.nr_sf_entries;
sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
sf.SetupViewportStateOffset = ro_bo(batch->state.bo, ice->state.sf_vp_address);
sf.MaximumNumberofThreads =
MIN2(GFX_VER == 5 ? 48 : 24, batch->ice->urb.nr_sf_entries) - 1;
sf.SpritePointEnable = cso_state->point_quad_rasterization;
sf.DestinationOriginHorizontalBias = 0.5;
sf.DestinationOriginVerticalBias = 0.5;
sf.LastPixelEnable = cso_state->line_last_pixel;
sf.LineWidth = get_line_width(cso_state);
sf.PointWidth = cso_state->point_size;
sf.PointWidthSource = cso_state->point_size_per_vertex ? Vertex : State;
#if GFX_VERx10 == 45 || GFX_VER >= 5
sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
#endif
sf.ViewportTransformEnable = true;
sf.FrontWinding = cso_state->front_ccw ? 1 : 0;
sf.ScissorRectangleEnable = true;
sf.CullMode = translate_cull_mode(cso_state->cull_face);
if (cso_state->flatshade_first) {
sf.TriangleFanProvokingVertexSelect = 1;
} else {
sf.TriangleStripListProvokingVertexSelect = 2;
sf.TriangleFanProvokingVertexSelect = 2;
sf.LineStripListProvokingVertexSelect = 1;
}
}
#else
struct crocus_rasterizer_state *cso = ice->state.cso_rast;
uint32_t dynamic_sf[GENX(3DSTATE_SF_length)];
crocus_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) {
sf.ViewportTransformEnable = !ice->state.window_space_position;
#if GFX_VER == 6
const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
uint32_t urb_entry_read_length;
uint32_t urb_entry_read_offset;
uint32_t point_sprite_enables;
calculate_attr_overrides(ice, sf.Attribute, &point_sprite_enables,
&urb_entry_read_length,
&urb_entry_read_offset);
sf.VertexURBEntryReadLength = urb_entry_read_length;
sf.VertexURBEntryReadOffset = urb_entry_read_offset;
sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
#endif
#if GFX_VER >= 6 && GFX_VER < 8
if (ice->state.framebuffer.samples > 1 && ice->state.cso_rast->cso.multisample)
sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
#endif
#if GFX_VER == 7
if (ice->state.framebuffer.zsbuf) {
struct crocus_resource *zres, *sres;
crocus_get_depth_stencil_resources(&batch->screen->devinfo,
ice->state.framebuffer.zsbuf->texture,
&zres, &sres);
/* ANV thinks that the stencil-ness doesn't matter; this is just
* about handling polygon offset scaling.
*/
sf.DepthBufferSurfaceFormat = zres ? isl_format_get_depth_format(zres->surf.format, false) : D16_UNORM;
}
#endif
}
crocus_emit_merge(batch, cso->sf, dynamic_sf,
ARRAY_SIZE(dynamic_sf));
#if GFX_VER == 8
crocus_batch_emit(batch, cso->raster, sizeof(cso->raster));
#endif
#endif
}
if (dirty & CROCUS_DIRTY_WM) {
struct crocus_rasterizer_state *cso = ice->state.cso_rast;
const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
UNUSED bool writes_depth = wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
UNUSED const struct shader_info *fs_info =
crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
#if GFX_VER == 6
struct push_bos push_bos = {};
setup_constant_buffers(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
emit_push_constant_packets(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
#endif
#if GFX_VER >= 6
crocus_emit_cmd(batch, GENX(3DSTATE_WM), wm)
#else
uint32_t *wm_ptr = stream_state(batch,
GENX(WM_STATE_length) * 4, 32, &ice->shaders.wm_offset);
dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
_crocus_pack_state(batch, GENX(WM_STATE), wm_ptr, wm)
#endif
{
#if GFX_VER <= 6
wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
#endif
#if GFX_VER == 4
/* On gen4, we only have one shader kernel */
if (brw_wm_state_has_ksp(wm, 0)) {
wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]);
wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
wm.DispatchGRFStartRegisterForConstantSetupData0 =
wm_prog_data->base.dispatch_grf_start_reg;
}
#elif GFX_VER == 5
wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);
wm.DispatchGRFStartRegisterForConstantSetupData0 =
wm_prog_data->base.dispatch_grf_start_reg;
#elif GFX_VER == 6
wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
wm.DispatchGRFStartRegisterForConstantSetupData0 =
brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
wm.DispatchGRFStartRegisterForConstantSetupData1 =
brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
wm.DispatchGRFStartRegisterForConstantSetupData2 =
brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
#endif
#if GFX_VER <= 5
wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
wm.ConstantURBEntryReadOffset = ice->curbe.wm_start * 2;
wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
wm.SetupURBEntryReadOffset = 0;
wm.EarlyDepthTestEnable = true;
wm.LineAntialiasingRegionWidth = _05pixels;
wm.LineEndCapAntialiasingRegionWidth = _10pixels;
wm.DepthCoefficientURBReadOffset = 1;
if (cso->cso.offset_tri) {
wm.GlobalDepthOffsetEnable = true;
/* Something weird is going on with legacy_global_depth_bias,
* offset_constant, scaling, and MRD.  This value passes glean
* but gives some odd results elsewhere (e.g. the
* quad-offset-units test).
*/
wm.GlobalDepthOffsetConstant = cso->cso.offset_units * 2;
wm.GlobalDepthOffsetScale = cso->cso.offset_scale;
}
wm.SamplerStatePointer = ro_bo(batch->state.bo,
ice->state.shaders[MESA_SHADER_FRAGMENT].sampler_offset);
#endif
wm.StatisticsEnable = (GFX_VER >= 6 || ice->state.stats_wm) ?
ice->state.statistics_counters_enabled : 0;
#if GFX_VER >= 6
wm.LineAntialiasingRegionWidth = _10pixels;
wm.LineEndCapAntialiasingRegionWidth = _05pixels;
wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
#endif
#if GFX_VER == 6
wm.DualSourceBlendEnable = wm_prog_data->dual_src_blend &&
ice->state.cso_blend->dual_color_blending;
wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
/* From the SNB PRM, volume 2 part 1, page 281:
* "If the PS kernel does not need the Position XY Offsets
* to compute a Position XY value, then this field should be
* programmed to POSOFFSET_NONE."
*
* "SW Recommendation: If the PS kernel needs the Position Offsets
* to compute a Position XY value, this field should match Position
* ZW Interpolation Mode to ensure a consistent position.xyzw
* computation."
* We only require XY sample offsets, so this recommendation doesn't
* look useful at the moment; we might need it in the future.
*/
if (wm_prog_data->uses_pos_offset)
wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
else
wm.PositionXYOffsetSelect = POSOFFSET_NONE;
#endif
wm.LineStippleEnable = cso->cso.line_stipple_enable;
wm.PolygonStippleEnable = cso->cso.poly_stipple_enable;
#if GFX_VER < 7
if (wm_prog_data->base.use_alt_mode)
wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
wm.BindingTableEntryCount = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bt.size_bytes / 4;
wm.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
#endif
#if GFX_VER < 8
#if GFX_VER >= 6
wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
if (fb->samples > 1) {
if (cso->cso.multisample)
wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
else
wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
if (wm_prog_data->persample_dispatch)
wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
else
wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
} else {
wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
}
#endif
wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
if (wm_prog_data->uses_kill ||
ice->state.cso_zsa->cso.alpha_enabled ||
ice->state.cso_blend->cso.alpha_to_coverage ||
(GFX_VER >= 6 && wm_prog_data->uses_omask))
wm.PixelShaderKillsPixel = true;
if (has_writeable_rt(ice->state.cso_blend, fs_info) ||
writes_depth || wm.PixelShaderKillsPixel ||
(GFX_VER >= 6 && wm_prog_data->has_side_effects))
wm.ThreadDispatchEnable = true;
#if GFX_VER >= 7
wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
#else
if (wm_prog_data->base.total_scratch) {
struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch,
MESA_SHADER_FRAGMENT);
wm.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
wm.ScratchSpaceBasePointer = rw_bo(bo, 0);
}
wm.PixelShaderComputedDepth = writes_depth;
#endif
/* The "UAV access enable" bits are unnecessary on HSW because they only
* seem to have an effect on the HW-assisted coherency mechanism which we
* don't need, and the rasterization-related UAV_ONLY flag and the
* DISPATCH_ENABLE bit can be set independently from it.
* C.f. gen8_upload_ps_extra().
*
* BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS |
* _NEW_COLOR
*/
#if GFX_VERx10 == 75
if (!(has_writeable_rt(ice->state.cso_blend, fs_info) || writes_depth) &&
wm_prog_data->has_side_effects)
wm.PSUAVonly = ON;
#endif
#endif
#if GFX_VER >= 7
/* BRW_NEW_FS_PROG_DATA */
if (wm_prog_data->early_fragment_tests)
wm.EarlyDepthStencilControl = EDSC_PREPS;
else if (wm_prog_data->has_side_effects)
wm.EarlyDepthStencilControl = EDSC_PSEXEC;
#endif
#if GFX_VER == 8
/* We could skip this bit if color writes are enabled. */
if (wm_prog_data->has_side_effects || wm_prog_data->uses_kill)
wm.ForceThreadDispatchEnable = ForceON;
#endif
};
#if GFX_VER <= 5
if (ice->state.global_depth_offset_clamp != cso->cso.offset_clamp) {
crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
clamp.GlobalDepthOffsetClamp = cso->cso.offset_clamp;
}
ice->state.global_depth_offset_clamp = cso->cso.offset_clamp;
}
#endif
}
#if GFX_VER >= 7
if (dirty & CROCUS_DIRTY_GEN7_SBE) {
crocus_emit_sbe(batch, ice);
}
#endif
#if GFX_VER >= 8
if (dirty & CROCUS_DIRTY_GEN8_PS_BLEND) {
struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
struct crocus_blend_state *cso_blend = ice->state.cso_blend;
struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
const struct shader_info *fs_info =
crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
uint32_t dynamic_pb[GENX(3DSTATE_PS_BLEND_length)];
crocus_pack_command(GENX(3DSTATE_PS_BLEND), &dynamic_pb, pb) {
pb.HasWriteableRT = has_writeable_rt(cso_blend, fs_info);
pb.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
pb.ColorBufferBlendEnable = (cso_blend->blend_enables & 1) &&
(!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
}
crocus_emit_merge(batch, cso_blend->ps_blend, dynamic_pb,
ARRAY_SIZE(cso_blend->ps_blend));
}
#endif
#if GFX_VER >= 6
if (dirty & CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL) {
#if GFX_VER >= 8
crocus_emit_cmd(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
set_depth_stencil_bits(ice, &wmds);
}
#else
uint32_t ds_offset;
void *ds_map = stream_state(batch,
sizeof(uint32_t) * GENX(DEPTH_STENCIL_STATE_length),
64, &ds_offset);
_crocus_pack_state(batch, GENX(DEPTH_STENCIL_STATE), ds_map, ds) {
set_depth_stencil_bits(ice, &ds);
}
#if GFX_VER == 6
crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
ptr.DEPTH_STENCIL_STATEChange = true;
}
#else
crocus_emit_cmd(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
}
#endif
#endif
}
if (dirty & CROCUS_DIRTY_GEN6_SCISSOR_RECT) {
/* Align to 64-byte boundary as per anv. */
uint32_t scissor_offset;
struct pipe_scissor_state *scissor_map = (void *)
stream_state(batch, sizeof(struct pipe_scissor_state) * ice->state.num_viewports,
64, &scissor_offset);
for (int i = 0; i < ice->state.num_viewports; i++) {
struct pipe_scissor_state scissor;
crocus_fill_scissor_rect(ice, i, &scissor);
scissor_map[i] = scissor;
}
crocus_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
ptr.ScissorRectPointer = scissor_offset;
}
}
#endif
if (dirty & CROCUS_DIRTY_DEPTH_BUFFER) {
struct isl_device *isl_dev = &batch->screen->isl_dev;
#if GFX_VER >= 6
crocus_emit_depth_stall_flushes(batch);
#endif
void *batch_ptr;
struct crocus_resource *zres, *sres;
struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
batch_ptr = crocus_get_command_space(batch, isl_dev->ds.size);
struct isl_view view = {
.base_level = 0,
.levels = 1,
.base_array_layer = 0,
.array_len = 1,
.swizzle = ISL_SWIZZLE_IDENTITY,
};
struct isl_depth_stencil_hiz_emit_info info = { .view = &view };
if (cso->zsbuf) {
crocus_get_depth_stencil_resources(&batch->screen->devinfo, cso->zsbuf->texture, &zres, &sres);
struct crocus_surface *zsbuf = (struct crocus_surface *)cso->zsbuf;
if (zsbuf->align_res) {
zres = (struct crocus_resource *)zsbuf->align_res;
}
view.base_level = cso->zsbuf->u.tex.level;
view.base_array_layer = cso->zsbuf->u.tex.first_layer;
view.array_len = cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1;
if (zres) {
view.usage |= ISL_SURF_USAGE_DEPTH_BIT;
info.depth_surf = &zres->surf;
info.depth_address = crocus_command_reloc(batch,
(batch_ptr - batch->command.map) + isl_dev->ds.depth_offset,
zres->bo, 0, RELOC_32BIT);
info.mocs = crocus_mocs(zres->bo, isl_dev);
view.format = zres->surf.format;
if (crocus_resource_level_has_hiz(zres, view.base_level)) {
info.hiz_usage = zres->aux.usage;
info.hiz_surf = &zres->aux.surf;
uint64_t hiz_offset = 0;
#if GFX_VER == 6
/* HiZ surfaces on Sandy Bridge technically don't support
* mip-mapping. However, we can fake it by offsetting to the
* first slice of LOD0 in the HiZ surface.
*/
isl_surf_get_image_offset_B_tile_sa(&zres->aux.surf,
view.base_level, 0, 0,
&hiz_offset, NULL, NULL);
#endif
info.hiz_address = crocus_command_reloc(batch,
(batch_ptr - batch->command.map) + isl_dev->ds.hiz_offset,
zres->aux.bo, zres->aux.offset + hiz_offset,
RELOC_32BIT);
info.depth_clear_value = crocus_resource_get_clear_color(zres).f32[0];
}
}
#if GFX_VER >= 6
if (sres) {
view.usage |= ISL_SURF_USAGE_STENCIL_BIT;
info.stencil_aux_usage = sres->aux.usage;
info.stencil_surf = &sres->surf;
uint64_t stencil_offset = 0;
#if GFX_VER == 6
/* Stencil surfaces on Sandy Bridge technically don't support
* mip-mapping. However, we can fake it by offsetting to the
* first slice of LOD0 in the stencil surface.
*/
isl_surf_get_image_offset_B_tile_sa(&sres->surf,
view.base_level, 0, 0,
&stencil_offset, NULL, NULL);
#endif
info.stencil_address = crocus_command_reloc(batch,
(batch_ptr - batch->command.map) + isl_dev->ds.stencil_offset,
sres->bo, stencil_offset, RELOC_32BIT);
if (!zres) {
view.format = sres->surf.format;
info.mocs = crocus_mocs(sres->bo, isl_dev);
}
}
#endif
}
isl_emit_depth_stencil_hiz_s(isl_dev, batch_ptr, &info);
}
/* TODO: Skip emitting this until a stipple pattern is actually in use. */
if (dirty & CROCUS_DIRTY_POLYGON_STIPPLE) {
crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
for (int i = 0; i < 32; i++) {
poly.PatternRow[i] = ice->state.poly_stipple.stipple[i];
}
}
}
if (dirty & CROCUS_DIRTY_LINE_STIPPLE) {
struct crocus_rasterizer_state *cso = ice->state.cso_rast;
crocus_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));
}
#if GFX_VER >= 8
if (dirty & CROCUS_DIRTY_GEN8_VF_TOPOLOGY) {
crocus_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
topo.PrimitiveTopologyType =
translate_prim_type(draw->mode, ice->state.patch_vertices);
}
}
#endif
#if GFX_VER <= 5
if (dirty & CROCUS_DIRTY_GEN5_PIPELINED_POINTERS) {
upload_pipelined_state_pointers(batch, ice->shaders.ff_gs_prog ? true : false, ice->shaders.gs_offset,
ice->shaders.vs_offset, ice->shaders.sf_offset,
ice->shaders.clip_offset, ice->shaders.wm_offset, ice->shaders.cc_offset);
crocus_upload_urb_fence(batch);
crocus_emit_cmd(batch, GENX(CS_URB_STATE), cs) {
cs.NumberofURBEntries = ice->urb.nr_cs_entries;
cs.URBEntryAllocationSize = ice->urb.csize - 1;
}
dirty |= CROCUS_DIRTY_GEN4_CURBE;
}
#endif
if (dirty & CROCUS_DIRTY_DRAWING_RECTANGLE) {
struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
if (fb->width && fb->height) {
crocus_emit_cmd(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
rect.ClippedDrawingRectangleXMax = fb->width - 1;
rect.ClippedDrawingRectangleYMax = fb->height - 1;
}
}
}
if (dirty & CROCUS_DIRTY_VERTEX_BUFFERS) {
const uint32_t user_count = util_bitcount(ice->state.bound_vertex_buffers);
const uint32_t count = user_count +
ice->state.vs_uses_draw_params + ice->state.vs_uses_derived_draw_params;
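/* Draw parameters (base vertex/instance) and derived draw parameters
 * (e.g. draw id) are sourced from extra vertex buffers appended after
 * the user-bound ones.
 */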
uint32_t dynamic_bound = ice->state.bound_vertex_buffers;
if (count) {
const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
uint32_t *map =
crocus_get_command_space(batch, 4 * (1 + vb_dwords * count));
_crocus_pack_command(batch, GENX(3DSTATE_VERTEX_BUFFERS), map, vb) {
vb.DWordLength = (vb_dwords * count + 1) - 2;
}
map += 1;
uint32_t bound = dynamic_bound;
int i;
while (bound) {
i = u_bit_scan(&bound);
struct pipe_vertex_buffer *buf = &ice->state.vertex_buffers[i];
struct crocus_bo *bo = crocus_resource_bo(buf->buffer.resource);
uint32_t step_rate = ice->state.cso_vertex_elements->step_rate[i];
emit_vertex_buffer_state(batch, i, bo,
buf->buffer_offset,
ice->state.vb_end[i],
buf->stride,
step_rate,
&map);
}
i = user_count;
if (ice->state.vs_uses_draw_params) {
struct crocus_resource *res = (struct crocus_resource *)ice->draw.draw_params.res;
emit_vertex_buffer_state(batch, i++,
res->bo,
ice->draw.draw_params.offset,
ice->draw.draw_params.res->width0,
0, 0, &map);
}
if (ice->state.vs_uses_derived_draw_params) {
struct crocus_resource *res = (struct crocus_resource *)ice->draw.derived_draw_params.res;
emit_vertex_buffer_state(batch, i++,
res->bo,
ice->draw.derived_draw_params.offset,
ice->draw.derived_draw_params.res->width0,
0, 0, &map);
}
}
}
if (dirty & CROCUS_DIRTY_VERTEX_ELEMENTS) {
struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
const unsigned entries = MAX2(cso->count, 1);
if (!(ice->state.vs_needs_sgvs_element ||
ice->state.vs_uses_derived_draw_params ||
ice->state.vs_needs_edge_flag)) {
crocus_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
(1 + entries * GENX(VERTEX_ELEMENT_STATE_length)));
} else {
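/* Rebuild 3DSTATE_VERTEX_ELEMENTS on the fly: copy the CSO's elements
 * (minus the edge flag, which must remain last), append the SGVS and
 * derived-draw-parameter elements, then re-append the edge flag.
 */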
uint32_t dynamic_ves[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
const unsigned dyn_count = cso->count +
ice->state.vs_needs_sgvs_element +
ice->state.vs_uses_derived_draw_params;
crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS),
&dynamic_ves, ve) {
ve.DWordLength =
1 + GENX(VERTEX_ELEMENT_STATE_length) * dyn_count - 2;
}
memcpy(&dynamic_ves[1], &cso->vertex_elements[1],
(cso->count - ice->state.vs_needs_edge_flag) *
GENX(VERTEX_ELEMENT_STATE_length) * sizeof(uint32_t));
uint32_t *ve_pack_dest =
&dynamic_ves[1 + (cso->count - ice->state.vs_needs_edge_flag) *
GENX(VERTEX_ELEMENT_STATE_length)];
if (ice->state.vs_needs_sgvs_element) {
uint32_t base_ctrl = ice->state.vs_uses_draw_params ?
VFCOMP_STORE_SRC : VFCOMP_STORE_0;
crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
ve.Valid = true;
ve.VertexBufferIndex =
util_bitcount64(ice->state.bound_vertex_buffers);
ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
ve.Component0Control = base_ctrl;
ve.Component1Control = base_ctrl;
#if GFX_VER < 8
ve.Component2Control = ice->state.vs_uses_vertexid ? VFCOMP_STORE_VID : VFCOMP_STORE_0;
ve.Component3Control = ice->state.vs_uses_instanceid ? VFCOMP_STORE_IID : VFCOMP_STORE_0;
#else
ve.Component2Control = VFCOMP_STORE_0;
ve.Component3Control = VFCOMP_STORE_0;
#endif
#if GFX_VER < 5
ve.DestinationElementOffset = cso->count * 4;
#endif
}
ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
}
if (ice->state.vs_uses_derived_draw_params) {
crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
ve.Valid = true;
ve.VertexBufferIndex =
util_bitcount64(ice->state.bound_vertex_buffers) +
ice->state.vs_uses_draw_params;
ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
ve.Component0Control = VFCOMP_STORE_SRC;
ve.Component1Control = VFCOMP_STORE_SRC;
ve.Component2Control = VFCOMP_STORE_0;
ve.Component3Control = VFCOMP_STORE_0;
#if GFX_VER < 5
ve.DestinationElementOffset = (cso->count + ice->state.vs_needs_sgvs_element) * 4;
#endif
}
ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
}
if (ice->state.vs_needs_edge_flag) {
for (int i = 0; i < GENX(VERTEX_ELEMENT_STATE_length); i++)
ve_pack_dest[i] = cso->edgeflag_ve[i];
}
crocus_batch_emit(batch, &dynamic_ves, sizeof(uint32_t) *
(1 + dyn_count * GENX(VERTEX_ELEMENT_STATE_length)));
}
#if GFX_VER == 8
if (!ice->state.vs_needs_edge_flag) {
crocus_batch_emit(batch, cso->vf_instancing, sizeof(uint32_t) *
entries * GENX(3DSTATE_VF_INSTANCING_length));
} else {
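/* The edge flag element was moved to the end of the VE list above, so
 * copy instancing state for the preceding elements, then pack a fresh
 * 3DSTATE_VF_INSTANCING for the edge flag at its new index.
 */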
assert(cso->count > 0);
const unsigned edgeflag_index = cso->count - 1;
uint32_t dynamic_vfi[33 * GENX(3DSTATE_VF_INSTANCING_length)];
memcpy(&dynamic_vfi[0], cso->vf_instancing, edgeflag_index *
GENX(3DSTATE_VF_INSTANCING_length) * sizeof(uint32_t));
uint32_t *vfi_pack_dest = &dynamic_vfi[0] +
edgeflag_index * GENX(3DSTATE_VF_INSTANCING_length);
crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
vi.VertexElementIndex = edgeflag_index +
ice->state.vs_needs_sgvs_element +
ice->state.vs_uses_derived_draw_params;
}
for (int i = 0; i < GENX(3DSTATE_VF_INSTANCING_length); i++)
vfi_pack_dest[i] |= cso->edgeflag_vfi[i];
crocus_batch_emit(batch, &dynamic_vfi[0], sizeof(uint32_t) *
entries * GENX(3DSTATE_VF_INSTANCING_length));
}
#endif
}
#if GFX_VER == 8
if (dirty & CROCUS_DIRTY_GEN8_VF_SGVS) {
const struct brw_vs_prog_data *vs_prog_data = (void *)
ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
crocus_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgv) {
if (vs_prog_data->uses_vertexid) {
sgv.VertexIDEnable = true;
sgv.VertexIDComponentNumber = 2;
sgv.VertexIDElementOffset =
cso->count - ice->state.vs_needs_edge_flag;
}
if (vs_prog_data->uses_instanceid) {
sgv.InstanceIDEnable = true;
sgv.InstanceIDComponentNumber = 3;
sgv.InstanceIDElementOffset =
cso->count - ice->state.vs_needs_edge_flag;
}
}
}
#endif
#if GFX_VERx10 >= 75
if (dirty & CROCUS_DIRTY_GEN75_VF) {
crocus_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
if (draw->primitive_restart) {
vf.IndexedDrawCutIndexEnable = true;
vf.CutIndex = draw->restart_index;
}
}
}
#endif
#if GFX_VER == 8
if (dirty & CROCUS_DIRTY_GEN8_PMA_FIX) {
bool enable = want_pma_fix(ice);
genX(crocus_update_pma_fix)(ice, batch, enable);
}
#endif
#if GFX_VER <= 5
if (dirty & CROCUS_DIRTY_GEN4_CURBE) {
gen4_upload_curbe(batch);
}
#endif
}
static void
crocus_upload_render_state(struct crocus_context *ice,
struct crocus_batch *batch,
const struct pipe_draw_info *draw,
unsigned drawid_offset,
const struct pipe_draw_indirect_info *indirect,
const struct pipe_draw_start_count_bias *sc)
{
#if GFX_VER >= 7
bool use_predicate = ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT;
#endif
batch->no_wrap = true;
batch->contains_draw = true;
crocus_update_surface_base_address(batch);
crocus_upload_dirty_render_state(ice, batch, draw);
batch->no_wrap = false;
if (draw->index_size > 0) {
unsigned offset;
unsigned size;
bool emit_index = false;
if (draw->has_user_indices) {
unsigned start_offset = draw->index_size * sc->start;
u_upload_data(ice->ctx.stream_uploader, 0,
sc->count * draw->index_size, 4,
(char *)draw->index.user + start_offset,
&offset, &ice->state.index_buffer.res);
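/* Bias the returned offset so that the buffer base plus
 * sc->start * index_size lands on the uploaded data; the draw below
 * still uses sc->start as its start location.
 */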
offset -= start_offset;
size = start_offset + sc->count * draw->index_size;
emit_index = true;
} else {
struct crocus_resource *res = (void *) draw->index.resource;
if (ice->state.index_buffer.res != draw->index.resource) {
res->bind_history |= PIPE_BIND_INDEX_BUFFER;
pipe_resource_reference(&ice->state.index_buffer.res,
draw->index.resource);
emit_index = true;
}
offset = 0;
size = draw->index.resource->width0;
}
if (!emit_index &&
(ice->state.index_buffer.size != size ||
ice->state.index_buffer.index_size != draw->index_size
#if GFX_VERx10 < 75
|| ice->state.index_buffer.prim_restart != draw->primitive_restart
#endif
)
)
emit_index = true;
if (emit_index) {
struct crocus_bo *bo = crocus_resource_bo(ice->state.index_buffer.res);
crocus_emit_cmd(batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
#if GFX_VERx10 < 75
ib.CutIndexEnable = draw->primitive_restart;
#endif
ib.IndexFormat = draw->index_size >> 1;
ib.BufferStartingAddress = ro_bo(bo, offset);
#if GFX_VER >= 8
ib.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
ib.BufferSize = bo->size - offset;
#else
ib.BufferEndingAddress = ro_bo(bo, offset + size - 1);
#endif
}
ice->state.index_buffer.size = size;
ice->state.index_buffer.offset = offset;
ice->state.index_buffer.index_size = draw->index_size;
#if GFX_VERx10 < 75
ice->state.index_buffer.prim_restart = draw->primitive_restart;
#endif
}
}
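/* MMIO registers consumed by 3DPRIMITIVE when Indirect Parameter Enable
 * is set; indirect draws fill them with MI_LOAD_REGISTER_MEM below.
 */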
#define _3DPRIM_END_OFFSET 0x2420
#define _3DPRIM_START_VERTEX 0x2430
#define _3DPRIM_VERTEX_COUNT 0x2434
#define _3DPRIM_INSTANCE_COUNT 0x2438
#define _3DPRIM_START_INSTANCE 0x243C
#define _3DPRIM_BASE_VERTEX 0x2440
#if GFX_VER >= 7
if (indirect && !indirect->count_from_stream_output) {
if (indirect->indirect_draw_count) {
use_predicate = true;
struct crocus_bo *draw_count_bo =
crocus_resource_bo(indirect->indirect_draw_count);
unsigned draw_count_offset =
indirect->indirect_draw_count_offset;
crocus_emit_pipe_control_flush(batch,
"ensure indirect draw buffer is flushed",
PIPE_CONTROL_FLUSH_ENABLE);
if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
#if GFX_VERx10 >= 75
struct mi_builder b;
mi_builder_init(&b, &batch->screen->devinfo, batch);
/* comparison = draw id < draw count */
struct mi_value comparison =
mi_ult(&b, mi_imm(drawid_offset),
mi_mem32(ro_bo(draw_count_bo,
draw_count_offset)));
#if GFX_VER == 8
/* predicate = comparison & conditional rendering predicate */
mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),
mi_iand(&b, comparison, mi_reg32(CS_GPR(15))));
#else
/* predicate = comparison & conditional rendering predicate */
struct mi_value pred = mi_iand(&b, comparison,
mi_reg32(CS_GPR(15)));
mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), pred);
mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
MI_PREDICATE_COMBINEOP_SET |
MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
#endif
#endif
} else {
uint32_t mi_predicate;
/* Upload the id of the current primitive to MI_PREDICATE_SRC1. */
crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, drawid_offset);
/* Upload the current draw count from the draw parameters buffer
* to MI_PREDICATE_SRC0.
*/
crocus_load_register_mem32(batch, MI_PREDICATE_SRC0,
draw_count_bo, draw_count_offset);
/* Zero the top 32-bits of MI_PREDICATE_SRC0 */
crocus_load_register_imm32(batch, MI_PREDICATE_SRC0 + 4, 0);
if (drawid_offset == 0) {
mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
MI_PREDICATE_COMBINEOP_SET |
MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
} else {
/* While draw_index < draw_count the predicate's result will be
* (draw_index == draw_count) ^ TRUE = TRUE
* When draw_index == draw_count the result is
* (TRUE) ^ TRUE = FALSE
* After this all results will be:
* (FALSE) ^ FALSE = FALSE
*/
mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOAD |
MI_PREDICATE_COMBINEOP_XOR |
MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
}
crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
}
}
#if GFX_VER >= 7
struct crocus_bo *bo = crocus_resource_bo(indirect->buffer);
assert(bo);
crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
lrm.RegisterAddress = _3DPRIM_VERTEX_COUNT;
lrm.MemoryAddress = ro_bo(bo, indirect->offset + 0);
}
crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
lrm.RegisterAddress = _3DPRIM_INSTANCE_COUNT;
lrm.MemoryAddress = ro_bo(bo, indirect->offset + 4);
}
crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
lrm.RegisterAddress = _3DPRIM_START_VERTEX;
lrm.MemoryAddress = ro_bo(bo, indirect->offset + 8);
}
if (draw->index_size) {
crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
lrm.RegisterAddress = _3DPRIM_BASE_VERTEX;
lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
}
crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
lrm.MemoryAddress = ro_bo(bo, indirect->offset + 16);
}
} else {
crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
}
crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
lri.RegisterOffset = _3DPRIM_BASE_VERTEX;
lri.DataDWord = 0;
}
}
#endif
} else if (indirect && indirect->count_from_stream_output) {
#if GFX_VERx10 >= 75
struct crocus_stream_output_target *so =
(void *) indirect->count_from_stream_output;
/* XXX: Replace with actual cache tracking */
crocus_emit_pipe_control_flush(batch,
"draw count from stream output stall",
PIPE_CONTROL_CS_STALL);
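/* Compute the vertex count from the streamout write offset:
 * (current offset - buffer_offset) / stride.
 */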
struct mi_builder b;
mi_builder_init(&b, &batch->screen->devinfo, batch);
struct crocus_address addr =
ro_bo(crocus_resource_bo(&so->offset_res->base.b), so->offset_offset);
struct mi_value offset =
mi_iadd_imm(&b, mi_mem32(addr), -so->base.buffer_offset);
mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
mi_udiv32_imm(&b, offset, so->stride));
_crocus_emit_lri(batch, _3DPRIM_START_VERTEX, 0);
_crocus_emit_lri(batch, _3DPRIM_BASE_VERTEX, 0);
_crocus_emit_lri(batch, _3DPRIM_START_INSTANCE, 0);
_crocus_emit_lri(batch, _3DPRIM_INSTANCE_COUNT, draw->instance_count);
#endif
}
#else
assert(!indirect);
#endif
crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
#if GFX_VER >= 7
prim.PredicateEnable = use_predicate;
#endif
prim.PrimitiveTopologyType = translate_prim_type(ice->state.prim_mode, ice->state.patch_vertices);
if (indirect) {
// XXX Probably have to do something for gen6 here?
#if GFX_VER >= 7
prim.IndirectParameterEnable = true;
#endif
} else {
#if GFX_VER >= 5
prim.StartInstanceLocation = draw->start_instance;
#endif
prim.InstanceCount = draw->instance_count;
prim.VertexCountPerInstance = sc->count;
prim.StartVertexLocation = sc->start;
if (draw->index_size) {
prim.BaseVertexLocation += sc->index_bias;
}
}
}
}
#if GFX_VER >= 7
static void
crocus_upload_compute_state(struct crocus_context *ice,
struct crocus_batch *batch,
const struct pipe_grid_info *grid)
{
const uint64_t stage_dirty = ice->state.stage_dirty;
struct crocus_screen *screen = batch->screen;
const struct intel_device_info *devinfo = &screen->devinfo;
struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
struct crocus_compiled_shader *shader =
ice->shaders.prog[MESA_SHADER_COMPUTE];
struct brw_stage_prog_data *prog_data = shader->prog_data;
struct brw_cs_prog_data *cs_prog_data = (void *) prog_data;
const struct brw_cs_dispatch_info dispatch =
brw_cs_get_dispatch_info(devinfo, cs_prog_data, grid->block);
crocus_update_surface_base_address(batch);
if ((stage_dirty & CROCUS_STAGE_DIRTY_CONSTANTS_CS) && shs->sysvals_need_upload)
upload_sysvals(ice, MESA_SHADER_COMPUTE);
if (stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_CS) {
crocus_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset =
crocus_upload_binding_table(ice, batch,
ice->shaders.prog[MESA_SHADER_COMPUTE]->surf_offset,
ice->shaders.prog[MESA_SHADER_COMPUTE]->bt.size_bytes);
}
if (stage_dirty & CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS)
crocus_upload_sampler_states(ice, batch, MESA_SHADER_COMPUTE);
if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
/* The MEDIA_VFE_STATE documentation for Gen8+ says:
*
* "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
* the only bits that are changed are scoreboard related: Scoreboard
* Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta. For
* these scoreboard related states, a MEDIA_STATE_FLUSH is
* sufficient."
*/
crocus_emit_pipe_control_flush(batch,
"workaround: stall before MEDIA_VFE_STATE",
PIPE_CONTROL_CS_STALL);
crocus_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
if (prog_data->total_scratch) {
struct crocus_bo *bo =
crocus_get_scratch_space(ice, prog_data->total_scratch,
MESA_SHADER_COMPUTE);
#if GFX_VER == 8
/* Broadwell's Per Thread Scratch Space is in the range [0, 11]
* where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
*/
vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;
#elif GFX_VERx10 == 75
/* Haswell's Per Thread Scratch Space is in the range [0, 10]
* where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
*/
vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 12;
#else
/* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
* where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
*/
vfe.PerThreadScratchSpace = prog_data->total_scratch / 1024 - 1;
#endif
vfe.ScratchSpaceBasePointer = rw_bo(bo, 0);
}
vfe.MaximumNumberofThreads =
devinfo->max_cs_threads * screen->subslice_total - 1;
vfe.ResetGatewayTimer =
Resettingrelativetimerandlatchingtheglobaltimestamp;
vfe.BypassGatewayControl = true;
#if GFX_VER == 7
vfe.GPGPUMode = 1;
#endif
vfe.NumberofURBEntries = GFX_VER == 8 ? 2 : 0;
vfe.URBEntryAllocationSize = GFX_VER == 8 ? 2 : 0;
vfe.CURBEAllocationSize =
ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
cs_prog_data->push.cross_thread.regs, 2);
}
}
/* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms */
if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
uint32_t curbe_data_offset = 0;
assert(cs_prog_data->push.cross_thread.dwords == 0 &&
cs_prog_data->push.per_thread.dwords == 1 &&
cs_prog_data->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID);
const unsigned push_const_size =
brw_cs_push_const_total_size(cs_prog_data, dispatch.threads);
uint32_t *curbe_data_map =
stream_state(batch,
ALIGN(push_const_size, 64), 64,
&curbe_data_offset);
assert(curbe_data_map);
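/* Fill with a recognizable poison pattern so any slots the packing code
 * below doesn't overwrite stand out when debugging.
 */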
memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));
crocus_fill_cs_push_const_buffer(cs_prog_data, dispatch.threads,
curbe_data_map);
crocus_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
curbe.CURBEDataStartAddress = curbe_data_offset;
}
}
if (stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS |
CROCUS_STAGE_DIRTY_BINDINGS_CS |
CROCUS_STAGE_DIRTY_CONSTANTS_CS |
CROCUS_STAGE_DIRTY_CS)) {
uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
const uint64_t ksp = KSP(ice, shader) + brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size);
crocus_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
idd.KernelStartPointer = ksp;
idd.SamplerStatePointer = shs->sampler_offset;
idd.BindingTablePointer = ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset;
idd.BindingTableEntryCount = MIN2(shader->bt.size_bytes / 4, 31);
idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
idd.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs;
idd.BarrierEnable = cs_prog_data->uses_barrier;
idd.SharedLocalMemorySize = encode_slm_size(GFX_VER,
prog_data->total_shared);
#if GFX_VERx10 >= 75
idd.CrossThreadConstantDataReadLength = cs_prog_data->push.cross_thread.regs;
#endif
}
crocus_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
load.InterfaceDescriptorTotalLength =
GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
load.InterfaceDescriptorDataStartAddress =
emit_state(batch, desc, sizeof(desc), 64);
}
}
#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508
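/* For indirect dispatch, GPGPU_WALKER reads the thread group counts from
 * these MMIO registers, which we load from the grid-size buffer below.
 */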
if (grid->indirect) {
struct crocus_state_ref *grid_size = &ice->state.grid_size;
struct crocus_bo *bo = crocus_resource_bo(grid_size->res);
crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
lrm.RegisterAddress = GPGPU_DISPATCHDIMX;
lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 0);
}
crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
lrm.RegisterAddress = GPGPU_DISPATCHDIMY;
lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 4);
}
crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
lrm.RegisterAddress = GPGPU_DISPATCHDIMZ;
lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 8);
}
#if GFX_VER == 7
/* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
_crocus_emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, 0);
/* Load compute_dispatch_indirect_x_size into SRC0 */
crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 0);
/* predicate = (compute_dispatch_indirect_x_size == 0); */
crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
mip.LoadOperation = LOAD_LOAD;
mip.CombineOperation = COMBINE_SET;
mip.CompareOperation = COMPARE_SRCS_EQUAL;
};
/* Load compute_dispatch_indirect_y_size into SRC0 */
crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 4);
/* predicate = (compute_dispatch_indirect_y_size == 0); */
crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
mip.LoadOperation = LOAD_LOAD;
mip.CombineOperation = COMBINE_OR;
mip.CompareOperation = COMPARE_SRCS_EQUAL;
};
/* Load compute_dispatch_indirect_z_size into SRC0 */
crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 8);
/* predicate = (compute_dispatch_indirect_z_size == 0); */
crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
mip.LoadOperation = LOAD_LOAD;
mip.CombineOperation = COMBINE_OR;
mip.CompareOperation = COMPARE_SRCS_EQUAL;
};
/* predicate = !predicate; */
#define COMPARE_FALSE 1
crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
mip.LoadOperation = LOAD_LOADINV;
mip.CombineOperation = COMBINE_OR;
mip.CompareOperation = COMPARE_FALSE;
}
#endif
}
crocus_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) {
ggw.IndirectParameterEnable = grid->indirect != NULL;
ggw.PredicateEnable = GFX_VER <= 7 && grid->indirect != NULL;
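/* SIMDSize is encoded as dispatch width / 16: SIMD8 -> 0, SIMD16 -> 1,
 * SIMD32 -> 2.
 */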
ggw.SIMDSize = dispatch.simd_size / 16;
ggw.ThreadDepthCounterMaximum = 0;
ggw.ThreadHeightCounterMaximum = 0;
ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
ggw.ThreadGroupIDXDimension = grid->grid[0];
ggw.ThreadGroupIDYDimension = grid->grid[1];
ggw.ThreadGroupIDZDimension = grid->grid[2];
ggw.RightExecutionMask = dispatch.right_mask;
ggw.BottomExecutionMask = 0xffffffff;
}
crocus_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf);
batch->contains_draw = true;
}
#endif /* GFX_VER >= 7 */
/**
* State module teardown.
*/
static void
crocus_destroy_state(struct crocus_context *ice)
{
pipe_resource_reference(&ice->draw.draw_params.res, NULL);
pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL);
free(ice->state.genx);
for (int i = 0; i < 4; i++) {
pipe_so_target_reference(&ice->state.so_target[i], NULL);
}
for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
pipe_surface_reference(&ice->state.framebuffer.cbufs[i], NULL);
}
pipe_surface_reference(&ice->state.framebuffer.zsbuf, NULL);
for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {
struct crocus_shader_state *shs = &ice->state.shaders[stage];
for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
pipe_resource_reference(&shs->constbufs[i].buffer, NULL);
}
for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) {
pipe_resource_reference(&shs->image[i].base.resource, NULL);
}
for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) {
pipe_resource_reference(&shs->ssbo[i].buffer, NULL);
}
for (int i = 0; i < CROCUS_MAX_TEXTURE_SAMPLERS; i++) {
pipe_sampler_view_reference((struct pipe_sampler_view **)
&shs->textures[i], NULL);
}
}
for (int i = 0; i < 16; i++)
pipe_resource_reference(&ice->state.vertex_buffers[i].buffer.resource, NULL);
pipe_resource_reference(&ice->state.grid_size.res, NULL);
pipe_resource_reference(&ice->state.index_buffer.res, NULL);
}
/* ------------------------------------------------------------------- */
static void
crocus_rebind_buffer(struct crocus_context *ice,
struct crocus_resource *res)
{
struct pipe_context *ctx = &ice->ctx;
assert(res->base.b.target == PIPE_BUFFER);
/* Buffers can't be framebuffer attachments, nor display related,
* and we don't have upstream Clover support.
*/
assert(!(res->bind_history & (PIPE_BIND_DEPTH_STENCIL |
PIPE_BIND_RENDER_TARGET |
PIPE_BIND_BLENDABLE |
PIPE_BIND_DISPLAY_TARGET |
PIPE_BIND_CURSOR |
PIPE_BIND_COMPUTE_RESOURCE |
PIPE_BIND_GLOBAL)));
if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) {
uint64_t bound_vbs = ice->state.bound_vertex_buffers;
while (bound_vbs) {
const int i = u_bit_scan64(&bound_vbs);
struct pipe_vertex_buffer *buffer = &ice->state.vertex_buffers[i];
if (!buffer->is_user_buffer && &res->base.b == buffer->buffer.resource)
ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
}
}
if ((res->bind_history & PIPE_BIND_INDEX_BUFFER) &&
ice->state.index_buffer.res) {
if (res->bo == crocus_resource_bo(ice->state.index_buffer.res))
pipe_resource_reference(&ice->state.index_buffer.res, NULL);
}
/* There is no need to handle these:
* - PIPE_BIND_COMMAND_ARGS_BUFFER (emitted for every indirect draw)
* - PIPE_BIND_QUERY_BUFFER (no persistent state references)
*/
if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) {
/* XXX: be careful about resetting vs appending... */
for (int i = 0; i < 4; i++) {
if (ice->state.so_target[i] &&
(ice->state.so_target[i]->buffer == &res->base.b)) {
#if GFX_VER == 6
ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
#else
ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
#endif
}
}
}
for (int s = MESA_SHADER_VERTEX; s < MESA_SHADER_STAGES; s++) {
struct crocus_shader_state *shs = &ice->state.shaders[s];
enum pipe_shader_type p_stage = stage_to_pipe(s);
if (!(res->bind_stages & (1 << s)))
continue;
if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
/* Skip constant buffer 0, it's for regular uniforms, not UBOs */
uint32_t bound_cbufs = shs->bound_cbufs & ~1u;
while (bound_cbufs) {
const int i = u_bit_scan(&bound_cbufs);
struct pipe_constant_buffer *cbuf = &shs->constbufs[i];
if (res->bo == crocus_resource_bo(cbuf->buffer)) {
ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << s;
}
}
}
if (res->bind_history & PIPE_BIND_SHADER_BUFFER) {
uint32_t bound_ssbos = shs->bound_ssbos;
while (bound_ssbos) {
const int i = u_bit_scan(&bound_ssbos);
struct pipe_shader_buffer *ssbo = &shs->ssbo[i];
if (res->bo == crocus_resource_bo(ssbo->buffer)) {
struct pipe_shader_buffer buf = {
.buffer = &res->base.b,
.buffer_offset = ssbo->buffer_offset,
.buffer_size = ssbo->buffer_size,
};
crocus_set_shader_buffers(ctx, p_stage, i, 1, &buf,
(shs->writable_ssbos >> i) & 1);
}
}
}
if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) {
uint32_t bound_sampler_views = shs->bound_sampler_views;
while (bound_sampler_views) {
const int i = u_bit_scan(&bound_sampler_views);
struct crocus_sampler_view *isv = shs->textures[i];
struct crocus_bo *bo = isv->res->bo;
if (res->bo == bo) {
ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
}
}
}
if (res->bind_history & PIPE_BIND_SHADER_IMAGE) {
uint32_t bound_image_views = shs->bound_image_views;
while (bound_image_views) {
const int i = u_bit_scan(&bound_image_views);
struct crocus_image_view *iv = &shs->image[i];
struct crocus_bo *bo = crocus_resource_bo(iv->base.resource);
if (res->bo == bo)
ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
}
}
}
}
/* ------------------------------------------------------------------- */
static unsigned
flags_to_post_sync_op(uint32_t flags)
{
if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
return WriteImmediateData;
if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
return WritePSDepthCount;
if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
return WriteTimestamp;
return 0;
}
/*
* Do the given flags have a Post Sync or LRI Post Sync operation?
*/
static enum pipe_control_flags
get_post_sync_flags(enum pipe_control_flags flags)
{
flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
PIPE_CONTROL_WRITE_DEPTH_COUNT |
PIPE_CONTROL_WRITE_TIMESTAMP |
PIPE_CONTROL_LRI_POST_SYNC_OP;
/* Only one "Post Sync Op" is allowed, and it's mutually exclusive with
* "LRI Post Sync Operation". So more than one bit set would be illegal.
*/
assert(util_bitcount(flags) <= 1);
return flags;
}
#define IS_COMPUTE_PIPELINE(batch) (batch->name == CROCUS_BATCH_COMPUTE)
/**
* Emit a series of PIPE_CONTROL commands, taking into account any
* workarounds necessary to actually accomplish the caller's request.
*
* Unless otherwise noted, spec quotations in this function come from:
*
* Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
* Restrictions for PIPE_CONTROL.
*
* You should not use this function directly. Use the helpers in
* crocus_pipe_control.c instead, which may split the pipe control further.
*/
static void
crocus_emit_raw_pipe_control(struct crocus_batch *batch,
const char *reason,
uint32_t flags,
struct crocus_bo *bo,
uint32_t offset,
uint64_t imm)
{
UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
UNUSED enum pipe_control_flags non_lri_post_sync_flags =
post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;
/* Recursive PIPE_CONTROL workarounds --------------------------------
* (http://knowyourmeme.com/memes/xzibit-yo-dawg)
*
* We do these first because we want to look at the original operation,
* rather than any workarounds we set.
*/
/* "Flush Types" workarounds ---------------------------------------------
* We do these now because they may add post-sync operations or CS stalls.
*/
if (GFX_VER == 6 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
/* Hardware workaround: SNB B-Spec says:
*
* "[Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
* Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
* required."
*/
crocus_emit_post_sync_nonzero_flush(batch);
}
#if GFX_VER == 8
if (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
/* Project: BDW, SKL+ (stopping at CNL) / Argument: VF Invalidate
*
* "'Post Sync Operation' must be enabled to 'Write Immediate Data' or
* 'Write PS Depth Count' or 'Write Timestamp'."
*/
if (!bo) {
flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
bo = batch->ice->workaround_bo;
offset = batch->ice->workaround_offset;
}
}
#endif
#if GFX_VERx10 < 75
if (flags & PIPE_CONTROL_DEPTH_STALL) {
/* Project: PRE-HSW / Argument: Depth Stall
*
* "The following bits must be clear:
* - Render Target Cache Flush Enable ([12] of DW1)
* - Depth Cache Flush Enable ([0] of DW1)"
*/
assert(!(flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
PIPE_CONTROL_DEPTH_CACHE_FLUSH)));
}
#endif
if (GFX_VER >= 6 && (flags & PIPE_CONTROL_DEPTH_STALL)) {
/* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
*
* "This bit must be DISABLED for operations other than writing
* PS_DEPTH_COUNT."
*
* This seems like nonsense. An Ivybridge workaround requires us to
* emit a PIPE_CONTROL with a depth stall and write immediate post-sync
* operation. Gen8+ requires us to emit depth stalls and depth cache
* flushes together. So, it's hard to imagine this means anything other
* than "we originally intended this to be used for PS_DEPTH_COUNT".
*
* We ignore the supposed restriction and do nothing.
*/
}
if (GFX_VERx10 < 75 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {
/* Project: PRE-HSW / Argument: Depth Cache Flush
*
* "Depth Stall must be clear ([13] of DW1)."
*/
assert(!(flags & PIPE_CONTROL_DEPTH_STALL));
}
if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
/* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
*
* "This bit must be DISABLED for End-of-pipe (Read) fences,
* PS_DEPTH_COUNT or TIMESTAMP queries."
*
* TODO: Implement end-of-pipe checking.
*/
assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
PIPE_CONTROL_WRITE_TIMESTAMP)));
}
if (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) {
/* From the PIPE_CONTROL instruction table, bit 1:
*
* "This bit is ignored if Depth Stall Enable is set.
* Further, the render cache is not flushed even if Write Cache
* Flush Enable bit is set."
*
* We assert that the caller doesn't do this combination, to try and
* prevent mistakes. It shouldn't hurt the GPU, though.
*
* We skip this check on Gen11+ as the "Stall at Pixel Scoreboard"
* and "Render Target Flush" combo is explicitly required for BTI
* update workarounds.
*/
assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
PIPE_CONTROL_RENDER_TARGET_FLUSH)));
}
/* PIPE_CONTROL page workarounds ------------------------------------- */
if (GFX_VER >= 7 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
/* From the PIPE_CONTROL page itself:
*
* "IVB, HSW, BDW
* Restriction: Pipe_control with CS-stall bit set must be issued
* before a pipe-control command that has the State Cache
* Invalidate bit set."
*/
flags |= PIPE_CONTROL_CS_STALL;
}
if (GFX_VERx10 == 75) {
/* From the PIPE_CONTROL page itself:
*
* "HSW - Programming Note: PIPECONTROL with RO Cache Invalidation:
* Prior to programming a PIPECONTROL command with any of the RO
* cache invalidation bit set, program a PIPECONTROL flush command
* with “CS stall” bit and “HDC Flush” bit set."
*
* TODO: Actually implement this. What's an HDC Flush?
*/
}
if (flags & PIPE_CONTROL_FLUSH_LLC) {
/* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
*
* "Project: ALL
* SW must always program Post-Sync Operation to "Write Immediate
* Data" when Flush LLC is set."
*
* For now, we just require the caller to do it.
*/
assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
}
/* "Post-Sync Operation" workarounds -------------------------------- */
/* Project: All / Argument: Global Snapshot Count Reset [19]
*
* "This bit must not be exercised on any product.
* Requires stall bit ([20] of DW1) set."
*
* We don't use this, so we just assert that it isn't used. The
* PIPE_CONTROL instruction page indicates that they intended this
* as a debug feature and don't think it is useful in production,
* but it may actually be usable, should we ever want to.
*/
assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);
if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
/* Project: All / Arguments:
*
* - Generic Media State Clear [16]
* - Indirect State Pointers Disable [16]
*
* "Requires stall bit ([20] of DW1) set."
*
* Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
* State Clear) says:
*
* "PIPECONTROL command with “Command Streamer Stall Enable” must be
* programmed prior to programming a PIPECONTROL command with "Media
* State Clear" set in GPGPU mode of operation"
*
* This is a subset of the earlier rule, so there's nothing to do.
*/
flags |= PIPE_CONTROL_CS_STALL;
}
if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
/* Project: All / Argument: Store Data Index
*
* "Post-Sync Operation ([15:14] of DW1) must be set to something other
* than '0'."
*
* For now, we just assert that the caller does this. We might want to
* automatically add a write to the workaround BO...
*/
assert(non_lri_post_sync_flags != 0);
}
if (flags & PIPE_CONTROL_SYNC_GFDT) {
/* Project: All / Argument: Sync GFDT
*
* "Post-Sync Operation ([15:14] of DW1) must be set to something other
* than '0' or 0x2520[13] must be set."
*
* For now, we just assert that the caller does this.
*/
assert(non_lri_post_sync_flags != 0);
}
if (GFX_VER >= 6 && GFX_VER < 8 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
/* Project: SNB, IVB, HSW / Argument: TLB inv
*
* "{All SKUs}{All Steppings}: Post-Sync Operation ([15:14] of DW1)
* must be set to something other than '0'."
*
* For now, we just assert that the caller does this.
*/
assert(non_lri_post_sync_flags != 0);
}
if (GFX_VER >= 7 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
/* Project: IVB+ / Argument: TLB inv
*
* "Requires stall bit ([20] of DW1) set."
*
* Also, from the PIPE_CONTROL instruction table:
*
* "Project: SKL+
* Post Sync Operation or CS stall must be set to ensure a TLB
* invalidation occurs. Otherwise no cycle will occur to the TLB
* cache to invalidate."
*
* This is not a subset of the earlier rule, so there's nothing to do.
*/
flags |= PIPE_CONTROL_CS_STALL;
}
#if GFX_VER == 8
if (IS_COMPUTE_PIPELINE(batch)) {
if (post_sync_flags ||
(flags & (PIPE_CONTROL_NOTIFY_ENABLE |
PIPE_CONTROL_DEPTH_STALL |
PIPE_CONTROL_RENDER_TARGET_FLUSH |
PIPE_CONTROL_DEPTH_CACHE_FLUSH |
PIPE_CONTROL_DATA_CACHE_FLUSH))) {
/* Project: BDW / Arguments:
*
* - LRI Post Sync Operation [23]
* - Post Sync Op [15:14]
* - Notify En [8]
* - Depth Stall [13]
* - Render Target Cache Flush [12]
* - Depth Cache Flush [0]
* - DC Flush Enable [5]
*
* "Requires stall bit ([20] of DW) set for all GPGPU and Media
* Workloads."
*
* (The docs have separate table rows for each bit, with essentially
* the same workaround text. We've combined them here.)
*/
flags |= PIPE_CONTROL_CS_STALL;
/* Also, from the PIPE_CONTROL instruction table, bit 20:
*
* "Project: BDW
* This bit must be always set when PIPE_CONTROL command is
* programmed by GPGPU and MEDIA workloads, except for the cases
* when only Read Only Cache Invalidation bits are set (State
* Cache Invalidation Enable, Instruction cache Invalidation
* Enable, Texture Cache Invalidation Enable, Constant Cache
* Invalidation Enable). This is to WA FFDOP CG issue, this WA
* need not implemented when FF_DOP_CG is disable via "Fixed
* Function DOP Clock Gate Disable" bit in RC_PSMI_CTRL register."
*
* It sounds like we could avoid CS stalls in some cases, but we
* don't currently bother. This list isn't exactly the list above,
* either...
*/
}
}
#endif
/* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
*
* "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
* only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
*
* Note that the kernel does CS stalls between batches, so we only need
* to count them within a batch. We currently naively count every 4, and
* don't skip the ones with only read-cache-invalidate bits set. This
* may or may not be a problem...
*/
if (GFX_VER == 7 && !(GFX_VERx10 == 75)) {
if (flags & PIPE_CONTROL_CS_STALL) {
/* If we're doing a CS stall, reset the counter and carry on. */
batch->pipe_controls_since_last_cs_stall = 0;
}
/* If this is the fourth pipe control without a CS stall, do one now. */
if (++batch->pipe_controls_since_last_cs_stall == 4) {
batch->pipe_controls_since_last_cs_stall = 0;
flags |= PIPE_CONTROL_CS_STALL;
}
}
/* "Stall" workarounds ----------------------------------------------
* These have to come after the earlier ones because we may have added
* some additional CS stalls above.
*/
if (flags & PIPE_CONTROL_CS_STALL) {
/* Project: PRE-SKL, VLV, CHV
*
* "[All Stepping][All SKUs]:
*
* One of the following must also be set:
*
* - Render Target Cache Flush Enable ([12] of DW1)
* - Depth Cache Flush Enable ([0] of DW1)
* - Stall at Pixel Scoreboard ([1] of DW1)
* - Depth Stall ([13] of DW1)
* - Post-Sync Operation ([13] of DW1)
* - DC Flush Enable ([5] of DW1)"
*
* If we don't already have one of those bits set, we choose to add
* "Stall at Pixel Scoreboard". Some of the other bits require a
* CS stall as a workaround (see above), which would send us into
* an infinite recursion of PIPE_CONTROLs. "Stall at Pixel Scoreboard"
* appears to be safe, so we choose that.
*/
const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
PIPE_CONTROL_DEPTH_CACHE_FLUSH |
PIPE_CONTROL_WRITE_IMMEDIATE |
PIPE_CONTROL_WRITE_DEPTH_COUNT |
PIPE_CONTROL_WRITE_TIMESTAMP |
PIPE_CONTROL_STALL_AT_SCOREBOARD |
PIPE_CONTROL_DEPTH_STALL |
PIPE_CONTROL_DATA_CACHE_FLUSH;
if (!(flags & wa_bits))
flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
}
/* Emit --------------------------------------------------------------- */
if (INTEL_DEBUG & DEBUG_PIPE_CONTROL) {
fprintf(stderr,
" PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n",
(flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "",
(flags & PIPE_CONTROL_CS_STALL) ? "CS " : "",
(flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "",
(flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) ? "VF " : "",
(flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) ? "RT " : "",
(flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) ? "Const " : "",
(flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) ? "TC " : "",
(flags & PIPE_CONTROL_DATA_CACHE_FLUSH) ? "DC " : "",
(flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH) ? "ZFlush " : "",
(flags & PIPE_CONTROL_DEPTH_STALL) ? "ZStall " : "",
(flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE) ? "State " : "",
(flags & PIPE_CONTROL_TLB_INVALIDATE) ? "TLB " : "",
(flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE) ? "Inst " : "",
(flags & PIPE_CONTROL_MEDIA_STATE_CLEAR) ? "MediaClear " : "",
(flags & PIPE_CONTROL_NOTIFY_ENABLE) ? "Notify " : "",
(flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) ?
"SnapRes" : "",
(flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE) ?
"ISPDis" : "",
(flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "",
(flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "",
(flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "",
imm, reason);
}
crocus_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
#if GFX_VER >= 7
pc.LRIPostSyncOperation = NoLRIOperation;
pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;
pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;
#endif
#if GFX_VER >= 6
pc.StoreDataIndex = 0;
pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;
pc.GlobalSnapshotCountReset =
flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET;
pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE;
pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR;
pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD;
pc.RenderTargetCacheFlushEnable =
flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH;
pc.StateCacheInvalidationEnable =
flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE;
pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
pc.ConstantCacheInvalidationEnable =
flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE;
#else
pc.WriteCacheFlush = flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
#endif
pc.PostSyncOperation = flags_to_post_sync_op(flags);
pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL;
pc.InstructionCacheInvalidateEnable =
flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE;
pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE;
#if GFX_VER >= 5 || GFX_VERx10 == 45
pc.IndirectStatePointersDisable =
flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE;
#endif
#if GFX_VER >= 6
pc.TextureCacheInvalidationEnable =
flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
#elif GFX_VER == 5 || GFX_VERx10 == 45
pc.TextureCacheFlushEnable =
flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
#endif
pc.Address = ggtt_bo(bo, offset);
if (GFX_VER < 7 && bo)
pc.DestinationAddressType = DAT_GGTT;
pc.ImmediateData = imm;
}
}
#if GFX_VER == 6
void
genX(crocus_upload_urb)(struct crocus_batch *batch,
unsigned vs_size,
bool gs_present,
unsigned gs_size)
{
struct crocus_context *ice = batch->ice;
int nr_vs_entries, nr_gs_entries;
int total_urb_size = ice->urb.size * 1024; /* in bytes */
const struct intel_device_info *devinfo = &batch->screen->devinfo;
/* Calculate how many entries fit in each stage's section of the URB */
if (gs_present) {
nr_vs_entries = (total_urb_size/2) / (vs_size * 128);
nr_gs_entries = (total_urb_size/2) / (gs_size * 128);
} else {
nr_vs_entries = total_urb_size / (vs_size * 128);
nr_gs_entries = 0;
}
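/* Entry sizes here are in 128B units. Example: a 64kB URB with no GS
 * and vs_size = 2 (256B entries) fits 65536 / 256 = 256 VS entries
 * before clamping.
 */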
/* Then clamp to the maximum allowed by the hardware */
if (nr_vs_entries > devinfo->urb.max_entries[MESA_SHADER_VERTEX])
nr_vs_entries = devinfo->urb.max_entries[MESA_SHADER_VERTEX];
if (nr_gs_entries > devinfo->urb.max_entries[MESA_SHADER_GEOMETRY])
nr_gs_entries = devinfo->urb.max_entries[MESA_SHADER_GEOMETRY];
/* Finally, both must be a multiple of 4 (see 3DSTATE_URB in the PRM). */
ice->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4);
ice->urb.nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, 4);
assert(ice->urb.nr_vs_entries >=
devinfo->urb.min_entries[MESA_SHADER_VERTEX]);
assert(ice->urb.nr_vs_entries % 4 == 0);
assert(ice->urb.nr_gs_entries % 4 == 0);
assert(vs_size <= 5);
assert(gs_size <= 5);
crocus_emit_cmd(batch, GENX(3DSTATE_URB), urb) {
urb.VSNumberofURBEntries = ice->urb.nr_vs_entries;
urb.VSURBEntryAllocationSize = vs_size - 1;
urb.GSNumberofURBEntries = ice->urb.nr_gs_entries;
urb.GSURBEntryAllocationSize = gs_size - 1;
};
/* From the PRM Volume 2 part 1, section 1.4.7:
*
* Because of a urb corruption caused by allocating a previous gsunits
* urb entry to vsunit software is required to send a "GS NULL
* Fence"(Send URB fence with VS URB size == 1 and GS URB size == 0) plus
* a dummy DRAW call before any case where VS will be taking over GS URB
* space.
*
* It is not clear exactly what this means ("URB fence" is a command that
* doesn't exist on Gen6). So for now we just do a full pipeline flush as
* a workaround.
*/
if (ice->urb.gs_present && !gs_present)
crocus_emit_mi_flush(batch);
ice->urb.gs_present = gs_present;
}
#endif
static void
crocus_lost_genx_state(struct crocus_context *ice, struct crocus_batch *batch)
{
}
static void
crocus_emit_mi_report_perf_count(struct crocus_batch *batch,
struct crocus_bo *bo,
uint32_t offset_in_bytes,
uint32_t report_id)
{
#if GFX_VER >= 7
crocus_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes);
mi_rpc.ReportID = report_id;
}
#endif
}
/**
* From the PRM, Volume 2a:
*
* "Indirect State Pointers Disable
*
* At the completion of the post-sync operation associated with this pipe
* control packet, the indirect state pointers in the hardware are
* considered invalid; the indirect pointers are not saved in the context.
* If any new indirect state commands are executed in the command stream
* while the pipe control is pending, the new indirect state commands are
* preserved.
*
* [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
* restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
* commands are only considered as Indirect State Pointers. Once ISP is
* issued in a context, SW must initialize by programming push constant
* commands for all the shaders (at least to zero length) before attempting
* any rendering operation for the same context."
*
* 3DSTATE_CONSTANT_* packets are restored during a context restore,
* even though they point to a BO that has already been unreferenced at
* the end of the previous batch buffer. This has been fine so far since
* we are protected by the scratch page (every address not covered by
* a BO should be pointing to the scratch page). But on CNL, it is
* causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
* instruction.
*
* The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
* hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
* context restore, so the mentioned hang doesn't happen. However,
* software must program push constant commands for all stages prior to
* rendering anything, so we flag them as dirty.
*
* Finally, we also stall at the pixel scoreboard to make sure the constants
* have been loaded into the EUs before disabling the push constants, so
* that a previous 3DPRIMITIVE doesn't hang.
*/
#if GFX_VER >= 7
static void
gen7_emit_isp_disable(struct crocus_batch *batch)
{
crocus_emit_raw_pipe_control(batch, "isp disable",
PIPE_CONTROL_STALL_AT_SCOREBOARD |
PIPE_CONTROL_CS_STALL,
NULL, 0, 0);
crocus_emit_raw_pipe_control(batch, "isp disable",
PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE |
PIPE_CONTROL_CS_STALL,
NULL, 0, 0);
struct crocus_context *ice = batch->ice;
ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
CROCUS_STAGE_DIRTY_CONSTANTS_TCS |
CROCUS_STAGE_DIRTY_CONSTANTS_TES |
CROCUS_STAGE_DIRTY_CONSTANTS_GS |
CROCUS_STAGE_DIRTY_CONSTANTS_FS);
}
#endif
#if GFX_VER >= 7
static void
crocus_state_finish_batch(struct crocus_batch *batch)
{
#if GFX_VERx10 == 75
if (batch->name == CROCUS_BATCH_RENDER) {
crocus_emit_mi_flush(batch);
crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
ptr.ColorCalcStatePointer = batch->ice->shaders.cc_offset;
}
crocus_emit_pipe_control_flush(batch, "hsw wa", PIPE_CONTROL_RENDER_TARGET_FLUSH |
PIPE_CONTROL_CS_STALL);
}
#endif
gen7_emit_isp_disable(batch);
}
#endif
static void
crocus_batch_reset_dirty(struct crocus_batch *batch)
{
/* Unreference any index buffer so it gets re-emitted. */
pipe_resource_reference(&batch->ice->state.index_buffer.res, NULL);
/* On GEN4/5 we need to re-emit anything in the state batch that points
 * at other state-batch contents, since the old state batch will no
 * longer be available.
 */
batch->ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER |
CROCUS_DIRTY_COLOR_CALC_STATE;
batch->ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
batch->ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS;
batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS;
batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES;
batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS;
batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS;
batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_PS;
batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS;
batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS;
batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TES;
batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_GS;
batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_FS;
batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_CS;
batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS;
batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CS;
batch->ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT | CROCUS_DIRTY_SF_CL_VIEWPORT;
#if GFX_VER >= 6
   /* SCISSOR_STATE, along with the other gen6+ state that lives in the
    * batch.
    */
   batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
   batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
   batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
#endif
#if GFX_VER <= 5
   /* Re-emit the gen4/5 raster, clip, and WM state, plus the CURBE. */
batch->ice->state.dirty |= CROCUS_DIRTY_RASTER;
batch->ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
batch->ice->state.dirty |= CROCUS_DIRTY_CLIP;
batch->ice->state.dirty |= CROCUS_DIRTY_WM;
#endif
#if GFX_VER >= 7
/* Streamout dirty */
batch->ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
batch->ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
batch->ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
#endif
}
#if GFX_VERx10 == 75
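/* Haswell-only helper: expose the current rasterizer CSO's pipe state to
 * code outside this generation-specific file.
 */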
struct pipe_rasterizer_state *
crocus_get_rast_state(struct crocus_context *ice)
{
return &ice->state.cso_rast->cso;
}
#endif
#if GFX_VER >= 6
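/* Record the stream output strides from the SO decl list on each bound
 * SO target, converting from DWord units to bytes.
 */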
static void
update_so_strides(struct crocus_context *ice, uint16_t *strides)
{
for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
struct crocus_stream_output_target *so = (void *)ice->state.so_target[i];
if (so)
so->stride = strides[i] * sizeof(uint32_t);
}
}
#endif
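/* Pre-gen8 hardware can't combine GL_CLAMP with linear filtering natively,
 * so the shader has to clamp texture coordinates itself.  Build
 * per-coordinate (s/t/r) bitmasks of the samplers that need this lowering.
 */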
static void
crocus_fill_clamp_mask(const struct crocus_sampler_state *samp,
                       int s, uint32_t *clamp_mask)
{
#if GFX_VER < 8
if (samp->pstate.min_img_filter != PIPE_TEX_FILTER_NEAREST &&
samp->pstate.mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
if (samp->pstate.wrap_s == PIPE_TEX_WRAP_CLAMP)
clamp_mask[0] |= (1 << s);
if (samp->pstate.wrap_t == PIPE_TEX_WRAP_CLAMP)
clamp_mask[1] |= (1 << s);
if (samp->pstate.wrap_r == PIPE_TEX_WRAP_CLAMP)
clamp_mask[2] |= (1 << s);
}
#endif
}
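/* Gallium's set_frontend_noop hook: toggle no-op mode on the render (and,
 * if present, compute) batch.  When the toggle takes effect, flag all
 * state as dirty so everything is re-emitted once real work resumes.
 */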
static void
crocus_set_frontend_noop(struct pipe_context *ctx, bool enable)
{
struct crocus_context *ice = (struct crocus_context *) ctx;
if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_RENDER], enable)) {
ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_RENDER;
ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_RENDER;
}
if (ice->batch_count == 1)
return;
if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_COMPUTE], enable)) {
ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_COMPUTE;
ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE;
}
}
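/* Fill in the per-generation screen vtable.  Entry points that only exist
 * on certain generations are guarded by the corresponding GFX_VER checks.
 */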
void
genX(crocus_init_screen_state)(struct crocus_screen *screen)
{
assert(screen->devinfo.verx10 == GFX_VERx10);
screen->vtbl.destroy_state = crocus_destroy_state;
screen->vtbl.init_render_context = crocus_init_render_context;
screen->vtbl.upload_render_state = crocus_upload_render_state;
#if GFX_VER >= 7
screen->vtbl.init_compute_context = crocus_init_compute_context;
screen->vtbl.upload_compute_state = crocus_upload_compute_state;
#endif
screen->vtbl.emit_raw_pipe_control = crocus_emit_raw_pipe_control;
screen->vtbl.emit_mi_report_perf_count = crocus_emit_mi_report_perf_count;
screen->vtbl.rebind_buffer = crocus_rebind_buffer;
#if GFX_VERx10 >= 75
screen->vtbl.load_register_reg32 = crocus_load_register_reg32;
screen->vtbl.load_register_reg64 = crocus_load_register_reg64;
screen->vtbl.load_register_imm32 = crocus_load_register_imm32;
screen->vtbl.load_register_imm64 = crocus_load_register_imm64;
screen->vtbl.store_data_imm32 = crocus_store_data_imm32;
screen->vtbl.store_data_imm64 = crocus_store_data_imm64;
#endif
#if GFX_VER >= 7
screen->vtbl.load_register_mem32 = crocus_load_register_mem32;
screen->vtbl.load_register_mem64 = crocus_load_register_mem64;
screen->vtbl.copy_mem_mem = crocus_copy_mem_mem;
screen->vtbl.create_so_decl_list = crocus_create_so_decl_list;
#endif
screen->vtbl.update_surface_base_address = crocus_update_surface_base_address;
#if GFX_VER >= 6
screen->vtbl.store_register_mem32 = crocus_store_register_mem32;
screen->vtbl.store_register_mem64 = crocus_store_register_mem64;
#endif
screen->vtbl.populate_vs_key = crocus_populate_vs_key;
screen->vtbl.populate_tcs_key = crocus_populate_tcs_key;
screen->vtbl.populate_tes_key = crocus_populate_tes_key;
screen->vtbl.populate_gs_key = crocus_populate_gs_key;
screen->vtbl.populate_fs_key = crocus_populate_fs_key;
screen->vtbl.populate_cs_key = crocus_populate_cs_key;
screen->vtbl.lost_genx_state = crocus_lost_genx_state;
#if GFX_VER >= 7
screen->vtbl.finish_batch = crocus_state_finish_batch;
#endif
#if GFX_VER <= 5
screen->vtbl.upload_urb_fence = crocus_upload_urb_fence;
screen->vtbl.calculate_urb_fence = crocus_calculate_urb_fence;
#endif
screen->vtbl.fill_clamp_mask = crocus_fill_clamp_mask;
screen->vtbl.batch_reset_dirty = crocus_batch_reset_dirty;
screen->vtbl.translate_prim_type = translate_prim_type;
#if GFX_VER >= 6
screen->vtbl.update_so_strides = update_so_strides;
screen->vtbl.get_so_offset = crocus_get_so_offset;
#endif
genX(crocus_init_blt)(screen);
}
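/* Install the Gallium context hooks and establish the initial context
 * state for this generation.
 */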
void
genX(crocus_init_state)(struct crocus_context *ice)
{
struct pipe_context *ctx = &ice->ctx;
ctx->create_blend_state = crocus_create_blend_state;
ctx->create_depth_stencil_alpha_state = crocus_create_zsa_state;
ctx->create_rasterizer_state = crocus_create_rasterizer_state;
ctx->create_sampler_state = crocus_create_sampler_state;
ctx->create_sampler_view = crocus_create_sampler_view;
ctx->create_surface = crocus_create_surface;
ctx->create_vertex_elements_state = crocus_create_vertex_elements;
ctx->bind_blend_state = crocus_bind_blend_state;
ctx->bind_depth_stencil_alpha_state = crocus_bind_zsa_state;
ctx->bind_sampler_states = crocus_bind_sampler_states;
ctx->bind_rasterizer_state = crocus_bind_rasterizer_state;
ctx->bind_vertex_elements_state = crocus_bind_vertex_elements_state;
ctx->delete_blend_state = crocus_delete_state;
ctx->delete_depth_stencil_alpha_state = crocus_delete_state;
ctx->delete_rasterizer_state = crocus_delete_state;
ctx->delete_sampler_state = crocus_delete_state;
ctx->delete_vertex_elements_state = crocus_delete_state;
ctx->set_blend_color = crocus_set_blend_color;
ctx->set_clip_state = crocus_set_clip_state;
ctx->set_constant_buffer = crocus_set_constant_buffer;
ctx->set_shader_buffers = crocus_set_shader_buffers;
ctx->set_shader_images = crocus_set_shader_images;
ctx->set_sampler_views = crocus_set_sampler_views;
ctx->set_tess_state = crocus_set_tess_state;
ctx->set_patch_vertices = crocus_set_patch_vertices;
ctx->set_framebuffer_state = crocus_set_framebuffer_state;
ctx->set_polygon_stipple = crocus_set_polygon_stipple;
ctx->set_sample_mask = crocus_set_sample_mask;
ctx->set_scissor_states = crocus_set_scissor_states;
ctx->set_stencil_ref = crocus_set_stencil_ref;
ctx->set_vertex_buffers = crocus_set_vertex_buffers;
ctx->set_viewport_states = crocus_set_viewport_states;
ctx->sampler_view_destroy = crocus_sampler_view_destroy;
ctx->surface_destroy = crocus_surface_destroy;
ctx->draw_vbo = crocus_draw_vbo;
ctx->launch_grid = crocus_launch_grid;
ctx->set_frontend_noop = crocus_set_frontend_noop;
#if GFX_VER >= 6
ctx->create_stream_output_target = crocus_create_stream_output_target;
ctx->stream_output_target_destroy = crocus_stream_output_target_destroy;
ctx->set_stream_output_targets = crocus_set_stream_output_targets;
#endif
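   /* Mark all state as dirty so the first draw re-emits everything. */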
ice->state.dirty = ~0ull;
ice->state.stage_dirty = ~0ull;
ice->state.statistics_counters_enabled = true;
ice->state.sample_mask = 0xff;
ice->state.num_viewports = 1;
ice->state.prim_mode = PIPE_PRIM_MAX;
ice->state.genx = calloc(1, sizeof(struct crocus_genx_state));
ice->draw.derived_params.drawid = -1;
/* Default all scissor rectangles to be empty regions. */
for (int i = 0; i < CROCUS_MAX_VIEWPORTS; i++) {
ice->state.scissors[i] = (struct pipe_scissor_state) {
.minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
};
}
}