swr/rasterizer: improvements in simdlib

1. fix build issues with MSVC 2019 compiler

The MSVC 2019 compiler seems to have an issue with optimized code-gen
when using the _mm256_and_si256() intrinsic.
Only disable use of the integer vpand intrinsic on buggy versions of MSVC 2019.
Otherwise allow use of integer vpand intrinsic.

2. Remove unused vec/matrix functionality

Reviewed-by: Alok Hota <alok.hota@intel.com>
This commit is contained in:
Jan Zielinski 2019-07-31 16:01:01 +02:00
parent b55a93fdd4
commit ff75c35846
17 changed files with 49 additions and 492 deletions

View File

@ -191,57 +191,6 @@ SIMDINLINE SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer const& a, SIMD12
SIMD128::blend_ps<mask>(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b)));
}
SIMDINLINE
void _simd_mov(simdscalar& r, unsigned int rlane, simdscalar& s, unsigned int slane)
{
    // Move a single lane from s into r: spill both registers to aligned
    // scratch arrays, copy the one scalar, then reload the destination.
    OSALIGNSIMD(float) dstLanes[KNOB_SIMD_WIDTH];
    OSALIGNSIMD(float) srcLanes[KNOB_SIMD_WIDTH];
    SIMD256::store_ps(dstLanes, r);
    SIMD256::store_ps(srcLanes, s);
    dstLanes[rlane] = srcLanes[slane];
    r = SIMD256::load_ps(dstLanes);
}
// Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
#define _simdvec_load_ps SIMD::vec4_load1_ps
// Broadcast the single SIMD register s into all 4 SOA components of r.
SIMDINLINE
void _simdvec_mov(simdvector& r, const simdscalar& s)
{
SIMD::vec4_set1_vps(r, s);
}
// Copy an entire simdvector (all 4 SOA components) from v into r.
SIMDINLINE
void _simdvec_mov(simdvector& r, const simdvector& v)
{
r = v;
}
#if 0
// just move a lane from the source simdvector to dest simdvector
// NOTE: dead code (compiled out by this #if 0); kept for reference only.
// Moves lane 'slane' of each of the 4 SOA components of s into lane 'rlane'
// of the corresponding component of r, via the scalar _simd_mov helper above.
SIMDINLINE
void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane)
{
_simd_mov(r[0], rlane, s[0], slane);
_simd_mov(r[1], rlane, s[1], slane);
_simd_mov(r[2], rlane, s[2], slane);
_simd_mov(r[3], rlane, s[3], slane);
}
#endif
#define _simdvec_dp3_ps SIMD::vec4_dp3_ps
#define _simdvec_dp4_ps SIMD::vec4_dp4_ps
#define _simdvec_rcp_length_ps SIMD::vec4_rcp_length_ps
#define _simdvec_normalize_ps SIMD::vec4_normalize_ps
#define _simdvec_mul_ps SIMD::vec4_mul_ps
#define _simdvec_add_ps SIMD::vec4_add_ps
#define _simdvec_min_ps SIMD::vec4_min_ps
#define _simdvec_max_ps SIMD::vec4_max_ps
#define _simd_mat4x4_vec4_multiply SIMD::mat4x4_vec4_multiply
#define _simd_mat3x3_vec3_w0_multiply SIMD::mat3x3_vec3_w0_multiply
#define _simd_mat4x4_vec3_w1_multiply SIMD::mat4x4_vec3_w1_multiply
#define _simd_mat4x3_vec3_w1_multiply SIMD::mat4x3_vec3_w1_multiply
//////////////////////////////////////////////////////////////////////////
/// @brief Compute plane equation vA * vX + vB * vY + vC
SIMDINLINE simdscalar vplaneps(simdscalar const& vA,

View File

@ -209,339 +209,6 @@ struct SIMDBase : Traits::IsaImpl
using Integer = typename Traits::Integer;
using Vec4 = typename Traits::Vec4;
using Mask = typename Traits::Mask;
static const size_t VECTOR_BYTES = sizeof(Float);
// Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
static SIMDINLINE void vec4_load1_ps(Vec4& r, const float* p)
{
    // Broadcast each scalar component of p into a full-width SIMD
    // register, converting AOS xyzw into SOA xxxx yyyy zzzz wwww.
    for (int comp = 0; comp < 4; ++comp)
    {
        r[comp] = SIMD::set1_ps(p[comp]);
    }
}
static SIMDINLINE void vec4_set1_vps(Vec4& r, Float const& s)
{
    // Replicate the SIMD register s into every component of r.
    for (int comp = 0; comp < 4; ++comp)
    {
        r[comp] = s;
    }
}
static SIMDINLINE Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1)
{
    // 3-component dot product, per SIMD lane:
    //   (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
    Float acc = SIMD::mul_ps(v0[0], v1[0]);
    acc       = SIMD::add_ps(acc, SIMD::mul_ps(v0[1], v1[1]));
    acc       = SIMD::add_ps(acc, SIMD::mul_ps(v0[2], v1[2]));
    return acc;
}
static SIMDINLINE Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1)
{
    // 4-component dot product, per SIMD lane:
    //   (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + (v0.w*v1.w)
    Float acc = SIMD::mul_ps(v0[0], v1[0]);
    acc       = SIMD::add_ps(acc, SIMD::mul_ps(v0[1], v1[1]));
    acc       = SIMD::add_ps(acc, SIMD::mul_ps(v0[2], v1[2]));
    acc       = SIMD::add_ps(acc, SIMD::mul_ps(v0[3], v1[3]));
    return acc;
}
static SIMDINLINE Float vec4_rcp_length_ps(const Vec4& v)
{
    // 1 / sqrt(dot4(v, v)) per SIMD lane, via the approximate
    // reciprocal square-root instruction.
    return SIMD::rsqrt_ps(vec4_dp4_ps(v, v));
}
static SIMDINLINE void vec4_normalize_ps(Vec4& r, const Vec4& v)
{
    // r = v * (1 / length4(v)), component by component.
    Float const invLength = vec4_rcp_length_ps(v);
    for (int comp = 0; comp < 4; ++comp)
    {
        r[comp] = SIMD::mul_ps(v[comp], invLength);
    }
}
static SIMDINLINE void vec4_mul_ps(Vec4& r, const Vec4& v, Float const& s)
{
    // Scale every component of v by the SIMD register s.
    for (int comp = 0; comp < 4; ++comp)
    {
        r[comp] = SIMD::mul_ps(v[comp], s);
    }
}
static SIMDINLINE void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
{
    // Component-wise product: r = v0 * v1.
    for (int comp = 0; comp < 4; ++comp)
    {
        r[comp] = SIMD::mul_ps(v0[comp], v1[comp]);
    }
}
static SIMDINLINE void vec4_add_ps(Vec4& r, const Vec4& v0, Float const& s)
{
    // Add the SIMD register s to every component of v0.
    for (int comp = 0; comp < 4; ++comp)
    {
        r[comp] = SIMD::add_ps(v0[comp], s);
    }
}
static SIMDINLINE void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
{
    // Component-wise sum: r = v0 + v1.
    for (int comp = 0; comp < 4; ++comp)
    {
        r[comp] = SIMD::add_ps(v0[comp], v1[comp]);
    }
}
static SIMDINLINE void vec4_min_ps(Vec4& r, const Vec4& v0, Float const& s)
{
    // Component-wise clamp from above: r = min(v0, s).
    for (int comp = 0; comp < 4; ++comp)
    {
        r[comp] = SIMD::min_ps(v0[comp], s);
    }
}
static SIMDINLINE void vec4_max_ps(Vec4& r, const Vec4& v0, Float const& s)
{
    // Component-wise clamp from below: r = max(v0, s).
    for (int comp = 0; comp < 4; ++comp)
    {
        r[comp] = SIMD::max_ps(v0[comp], s);
    }
}
// Matrix4x4 * Vector4
// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
static SIMDINLINE void SIMDCALL mat4x4_vec4_multiply(Vec4& result,
                                                     const float* pMatrix,
                                                     const Vec4& v)
{
    // pMatrix is a row-major 4x4 float matrix; each row is dotted with v.
    // Accumulation order matches the fully-unrolled original,
    // ((m0*x + m1*y) + m2*z) + m3*w, preserving float rounding behavior.
    for (int row = 0; row < 4; ++row)
    {
        const float* pRow = pMatrix + row * 4;
        Float acc = SIMD::mul_ps(SIMD::load1_ps(pRow + 0), v[0]);
        acc       = SIMD::add_ps(acc, SIMD::mul_ps(SIMD::load1_ps(pRow + 1), v[1]));
        acc       = SIMD::add_ps(acc, SIMD::mul_ps(SIMD::load1_ps(pRow + 2), v[2]));
        acc       = SIMD::add_ps(acc, SIMD::mul_ps(SIMD::load1_ps(pRow + 3), v[3]));
        result[row] = acc;
    }
}
// Matrix4x4 * Vector3 - Direction Vector where w = 0.
// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
static SIMDINLINE void SIMDCALL mat3x3_vec3_w0_multiply(Vec4& result,
                                                        const float* pMatrix,
                                                        const Vec4& v)
{
    // Only the upper-left 3x3 of the row-major matrix participates:
    // w = 0 eliminates the translation column, and result.w is forced
    // to zero rather than computed.
    for (int row = 0; row < 3; ++row)
    {
        const float* pRow = pMatrix + row * 4;
        Float acc = SIMD::mul_ps(SIMD::load1_ps(pRow + 0), v[0]);
        acc       = SIMD::add_ps(acc, SIMD::mul_ps(SIMD::load1_ps(pRow + 1), v[1]));
        acc       = SIMD::add_ps(acc, SIMD::mul_ps(SIMD::load1_ps(pRow + 2), v[2]));
        result[row] = acc;
    }
    result[3] = SIMD::setzero_ps();
}
// Matrix4x4 * Vector3 - Position vector where w = 1.
// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
static SIMDINLINE void SIMDCALL mat4x4_vec3_w1_multiply(Vec4& result,
                                                        const float* pMatrix,
                                                        const Vec4& v)
{
    // Row-major 4x4 matrix times an implicit-w=1 position: the last
    // matrix column is added directly instead of multiplied by v.w.
    // Accumulation order matches the fully-unrolled original.
    for (int row = 0; row < 4; ++row)
    {
        const float* pRow = pMatrix + row * 4;
        Float acc = SIMD::mul_ps(SIMD::load1_ps(pRow + 0), v[0]);
        acc       = SIMD::add_ps(acc, SIMD::mul_ps(SIMD::load1_ps(pRow + 1), v[1]));
        acc       = SIMD::add_ps(acc, SIMD::mul_ps(SIMD::load1_ps(pRow + 2), v[2]));
        result[row] = SIMD::add_ps(acc, SIMD::load1_ps(pRow + 3)); // + m[row][3] * 1
    }
}
// Matrix4x3 * Vector3 - Position vector where w = 1.
// Same as mat4x4_vec3_w1_multiply for x/y/z, but the matrix has no
// fourth row, so result.w is simply set to 1.
static SIMDINLINE void SIMDCALL mat4x3_vec3_w1_multiply(Vec4& result,
                                                        const float* pMatrix,
                                                        const Vec4& v)
{
    // Rows are laid out row-major with a stride of 4 floats; the last
    // column is the translation term, added directly (w = 1).
    for (int row = 0; row < 3; ++row)
    {
        const float* pRow = pMatrix + row * 4;
        Float acc = SIMD::mul_ps(SIMD::load1_ps(pRow + 0), v[0]);
        acc       = SIMD::add_ps(acc, SIMD::mul_ps(SIMD::load1_ps(pRow + 1), v[1]));
        acc       = SIMD::add_ps(acc, SIMD::mul_ps(SIMD::load1_ps(pRow + 2), v[2]));
        result[row] = SIMD::add_ps(acc, SIMD::load1_ps(pRow + 3)); // + m[row][3] * 1
    }
    result[3] = SIMD::set1_ps(1.0f);
}
}; // struct SIMDBase
using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;

View File

@ -222,14 +222,14 @@ SIMD_EMU_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
SIMD_EMU_IWRAPPER_2(and_si); // return a & b (int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
SIMD_EMU_IWRAPPER_2(andnot_si); // return (~a) & b (int)
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
SIMD_EMU_IWRAPPER_2(or_si); // return a | b (int)
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
SIMD_EMU_IWRAPPER_2(xor_si); // return a ^ b (int)
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
SIMD_IFWRAPPER_2(and_si, _mm256_and_ps); // return a & b (int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
SIMD_IFWRAPPER_2(andnot_si, _mm256_andnot_ps); // return (~a) & b (int)
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
SIMD_IFWRAPPER_2(or_si, _mm256_or_ps); // return a | b (int)
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
SIMD_IFWRAPPER_2(xor_si, _mm256_xor_ps); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations

View File

@ -81,6 +81,7 @@
return _mm256_##op(a, b, ImmT); \
}
//-----------------------------------------------------------------------
// Floating point arithmetic operations
//-----------------------------------------------------------------------
@ -116,7 +117,14 @@ SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2_(and_si, and_si256); // return a & b (int)
#if _MSC_VER >= 1920 // && _MSC_FULL_VER < [some_fixed_version]
// Some versions of MSVC 2019 don't handle constant folding of and_si() correctly.
// Using and_ps instead inhibits the compiler's constant folding and actually issues
// the and intrinsic even though both inputs are constant values.
#else
// Use native integer and intrinsic
SIMD_IWRAPPER_2_(and_si, and_si256); // return a & b (int)
#endif
SIMD_IWRAPPER_2_(andnot_si, andnot_si256); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_si256); // return a | b (int)
SIMD_IWRAPPER_2_(xor_si, xor_si256); // return a ^ b (int)
@ -213,6 +221,10 @@ static SIMDINLINE Float SIMDCALL
return _mm256_i32gather_ps(p, idx, static_cast<int>(ScaleT));
}
#if _MSC_VER == 1920 // && _MSC_FULL_VER < [some_fixed_version]
// Don't use _mm256_mask_i32gather_ps(), the compiler doesn't preserve the mask register
// correctly in early versions of MSVC 2019
#else
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
@ -222,6 +234,7 @@ static SIMDINLINE Float SIMDCALL
// Only for this intrinsic - not sure why. :(
return _mm256_mask_i32gather_ps(old.v, p, idx.v, mask.v, static_cast<int>(ScaleT));
}
#endif
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a)
{

View File

@ -328,101 +328,5 @@ struct SIMD256 // or SIMD4 or SIMD16
//=======================================================================
// Advanced masking interface (currently available only in SIMD16 width)
//=======================================================================
//=======================================================================
// Extended Utility Functions (common to SIMD256 and SIMD16)
//=======================================================================
//-----------------------------------------------------------------------
// Extended Types
//-----------------------------------------------------------------------
// Vec4, an SOA SIMD set of 4-dimensional vectors
// Union of three aliased views over the same storage: v[4] as SIMD float
// registers, vi[4] as SIMD integer registers, and the named s.x/y/z/w
// struct. Each component is itself a full SIMD register (SOA layout).
union Vec4
{
Vec4() = default;
// Broadcast constructor: every component receives the same register.
// NOTE(review): intentionally non-explicit? An implicit Float -> Vec4
// conversion is allowed as written - confirm that is desired.
Vec4(Float in)
{
s.x = in;
s.y = in;
s.z = in;
s.w = in;
}
// Component-wise constructor.
Vec4(Float x, Float y, Float z, Float w)
{
s.x = x;
s.y = y;
s.z = z;
s.w = w;
}
Float v[4];
Integer vi[4];
struct
{
Float x;
Float y;
Float z;
Float w;
} s;
// Indexed access to the float view (0=x, 1=y, 2=z, 3=w).
Float& operator[] (const int i) { return v[i]; }
Float const & operator[] (const int i) const { return v[i]; }
};
//-----------------------------------------------------------------------
// Extended Functions
//-----------------------------------------------------------------------
static void vec4_set1_ps(Vec4& r, const float *p); // r[0] = set1(p[0]), r[1] = set1(p[1]), ...
static void vec4_set1_vps(Vec4& r, Float s); // r[0] = s, r[1] = s, ...
static Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1); // return dp3(v0, v1)
static Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1); // return dp4(v0, v1)
static Float vec4_rcp_length_ps(const Vec4& v); // return 1.0f / sqrt(dp4(v, v))
static void vec4_normalize_ps(Vec4& r, const Vec4& v); // r = v * rcp_length(v)
static void vec4_mul_ps(Vec4& r, const Vec4& v, Float s); // r = v * set1_vps(s)
static void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1); // r = v0 * v1
static void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1); // r = v0 + v1
static void vec4_min_ps(Vec4& r, const Vec4& v0, Float s); // r = (v0 < s) ? v0 : s
static void vec4_max_ps(Vec4& r, const Vec4& v0, Float s); // r = (v0 > s) ? v0 : s
// Matrix4x4 * Vector4
// result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * v.s.w)
// result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * v.s.w)
// result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * v.s.w)
// result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * v.s.w)
static void mat4x4_vec4_multiply(
Vec4& result,
const float *pMatrix,
const Vec4& v);
// Matrix4x4 * Vector3 - Direction Vector where w = 0.
// result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 0)
// result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 0)
// result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 0)
// result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 0)
static void mat3x3_vec3_w0_multiply(
Vec4& result,
const float *pMatrix,
const Vec4& v);
// Matrix4x4 * Vector3 - Position vector where w = 1.
// result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
// result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
// result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
// result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 1)
static void mat4x4_vec3_w1_multiply(
Vec4& result,
const float *pMatrix,
const Vec4& v);
// Matrix4x3 * Vector3 - Position vector where w = 1.
// result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
// result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
// result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
// result.s.w = 1
static void mat4x3_vec3_w1_multiply(
Vec4& result,
const float *pMatrix,
const Vec4& v);
};
#endif // #if 0

View File

@ -315,7 +315,7 @@ namespace SIMDImpl
namespace SIMD512Impl
{
#if !(defined(__AVX512F__) || defined(_MM_K0_REG))
#if !(defined(__AVX512F__) || defined(_ZMMINTRIN_H_INCLUDED))
// Define AVX512 types if not included via immintrin.h.
// All data members of these types are ONLY to viewed
// in a debugger. Do NOT access them via code!

View File

@ -1551,6 +1551,7 @@ void SwrDispatch(HANDLE hContext,
pTaskData->threadGroupCountX = threadGroupCountX;
pTaskData->threadGroupCountY = threadGroupCountY;
pTaskData->threadGroupCountZ = threadGroupCountZ;
pTaskData->enableThreadDispatch = false;
uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;

View File

@ -588,6 +588,7 @@ SWR_FUNC(void,
uint32_t threadGroupCountY,
uint32_t threadGroupCountZ);
/// @note this enum needs to be kept in sync with HOTTILE_STATE!
enum SWR_TILE_STATE
{

View File

@ -140,6 +140,7 @@ struct COMPUTE_DESC
uint32_t threadGroupCountX;
uint32_t threadGroupCountY;
uint32_t threadGroupCountZ;
bool enableThreadDispatch;
};
typedef void (*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC,

View File

@ -1447,7 +1447,7 @@ struct PA_TESS : PA_STATE
SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
if (!m_SOA)
{
indices = _simd16_mul_epi32(indices, _simd16_set1_epi32(vertexStride / 4));
indices = _simd16_mullo_epi32(indices, _simd16_set1_epi32(vertexStride / 4));
}
#else
SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);

View File

@ -584,7 +584,7 @@ struct JitCacheFileHeader
uint64_t GetObjectCRC() const { return m_objCRC; }
private:
static const uint64_t JC_MAGIC_NUMBER = 0xfedcba9876543210ULL + 6;
static const uint64_t JC_MAGIC_NUMBER = 0xfedcba9876543210ULL + 7;
static const size_t JC_STR_MAX_LEN = 32;
static const uint32_t JC_PLATFORM_KEY = (LLVM_VERSION_MAJOR << 24) |
(LLVM_VERSION_MINOR << 16) | (LLVM_VERSION_PATCH << 8) |

View File

@ -61,6 +61,7 @@ namespace SwrJit
mInt16PtrTy = PointerType::get(mInt16Ty, 0);
mInt32PtrTy = PointerType::get(mInt32Ty, 0);
mInt64PtrTy = PointerType::get(mInt64Ty, 0);
mHandleTy = mInt8PtrTy;
mSimd4FP64Ty = VectorType::get(mDoubleTy, 4);

View File

@ -78,6 +78,7 @@ namespace SwrJit
// Built in types: scalar
Type* mVoidTy;
Type* mHandleTy;
Type* mInt1Ty;
Type* mInt8Ty;
Type* mInt16Ty;

View File

@ -237,6 +237,11 @@ namespace SwrJit
return MASKED_GATHER(pVecSrcPtr, 4, pVecMask, pVecPassthru);
}
// Masked scatter: store each lane of pVecSrc through the corresponding
// per-lane pointer in pVecDstPtr, for lanes enabled in pVecMask.
// The literal 4 is the per-element alignment passed to MASKED_SCATTER.
void Builder::SCATTER_PTR(Value* pVecDstPtr, Value* pVecSrc, Value* pVecMask)
{
MASKED_SCATTER(pVecSrc, pVecDstPtr, 4, pVecMask);
}
void Builder::Gather4(const SWR_FORMAT format,
Value* pSrcBase,
Value* byteOffsets,

View File

@ -148,6 +148,7 @@ void GATHER4DD(const SWR_FORMAT_INFO& info,
Value* GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1);
Value* GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru);
void SCATTER_PTR(Value* pVecDstPtr, Value* pVecSrc, Value* pVecMask);
virtual void SCATTERPS(Value* pDst,
Value* vSrc,

View File

@ -170,6 +170,16 @@ namespace SwrJit
return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
}
// Build a SIMD-width constant vector with the 64-bit immediate i
// splatted into every lane.
Value* Builder::VIMMED1(uint64_t i)
{
return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
}
// Build a 16-wide constant vector with the 64-bit immediate i
// splatted into every lane.
Value* Builder::VIMMED1_16(uint64_t i)
{
return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
}
Value* Builder::VIMMED1(int i)
{
return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));

View File

@ -71,6 +71,9 @@ Constant* CInc(uint32_t base, uint32_t count)
Constant* PRED(bool pred);
Value* VIMMED1(uint64_t i);
Value* VIMMED1_16(uint64_t i);
Value* VIMMED1(int i);
Value* VIMMED1_16(int i);