swr/rasterizer: improvements in simdlib
1. fix build issues with MSVC 2019 compiler The MSVC 2019 compiler seems to have an issue with optimized code-gen when using the _mm256_and_si256() intrinsic. Only disable use of integer vpand on buggy versions MSVC 2019. Otherwise allow use of integer vpand intrinsic. 2. Remove unused vec/matrix functionality Reviewed-by: Alok Hota <alok.hota@intel.com>
This commit is contained in:
parent
b55a93fdd4
commit
ff75c35846
|
@ -191,57 +191,6 @@ SIMDINLINE SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer const& a, SIMD12
|
|||
SIMD128::blend_ps<mask>(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b)));
|
||||
}
|
||||
|
||||
SIMDINLINE
|
||||
void _simd_mov(simdscalar& r, unsigned int rlane, simdscalar& s, unsigned int slane)
|
||||
{
|
||||
OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH];
|
||||
SIMD256::store_ps(rArray, r);
|
||||
SIMD256::store_ps(sArray, s);
|
||||
rArray[rlane] = sArray[slane];
|
||||
r = SIMD256::load_ps(rArray);
|
||||
}
|
||||
|
||||
// Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
|
||||
#define _simdvec_load_ps SIMD::vec4_load1_ps
|
||||
|
||||
SIMDINLINE
|
||||
void _simdvec_mov(simdvector& r, const simdscalar& s)
|
||||
{
|
||||
SIMD::vec4_set1_vps(r, s);
|
||||
}
|
||||
|
||||
SIMDINLINE
|
||||
void _simdvec_mov(simdvector& r, const simdvector& v)
|
||||
{
|
||||
r = v;
|
||||
}
|
||||
|
||||
#if 0
|
||||
// just move a lane from the source simdvector to dest simdvector
|
||||
SIMDINLINE
|
||||
void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane)
|
||||
{
|
||||
_simd_mov(r[0], rlane, s[0], slane);
|
||||
_simd_mov(r[1], rlane, s[1], slane);
|
||||
_simd_mov(r[2], rlane, s[2], slane);
|
||||
_simd_mov(r[3], rlane, s[3], slane);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#define _simdvec_dp3_ps SIMD::vec4_dp3_ps
|
||||
#define _simdvec_dp4_ps SIMD::vec4_dp4_ps
|
||||
#define _simdvec_rcp_length_ps SIMD::vec4_rcp_length_ps
|
||||
#define _simdvec_normalize_ps SIMD::vec4_normalize_ps
|
||||
#define _simdvec_mul_ps SIMD::vec4_mul_ps
|
||||
#define _simdvec_add_ps SIMD::vec4_add_ps
|
||||
#define _simdvec_min_ps SIMD::vec4_min_ps
|
||||
#define _simdvec_max_ps SIMD::vec4_max_ps
|
||||
#define _simd_mat4x4_vec4_multiply SIMD::mat4x4_vec4_multiply
|
||||
#define _simd_mat3x3_vec3_w0_multiply SIMD::mat3x3_vec3_w0_multiply
|
||||
#define _simd_mat4x4_vec3_w1_multiply SIMD::mat4x4_vec3_w1_multiply
|
||||
#define _simd_mat4x3_vec3_w1_multiply SIMD::mat4x3_vec3_w1_multiply
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Compute plane equation vA * vX + vB * vY + vC
|
||||
SIMDINLINE simdscalar vplaneps(simdscalar const& vA,
|
||||
|
|
|
@ -209,339 +209,6 @@ struct SIMDBase : Traits::IsaImpl
|
|||
using Integer = typename Traits::Integer;
|
||||
using Vec4 = typename Traits::Vec4;
|
||||
using Mask = typename Traits::Mask;
|
||||
|
||||
static const size_t VECTOR_BYTES = sizeof(Float);
|
||||
|
||||
// Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
|
||||
static SIMDINLINE void vec4_load1_ps(Vec4& r, const float* p)
|
||||
{
|
||||
r[0] = SIMD::set1_ps(p[0]);
|
||||
r[1] = SIMD::set1_ps(p[1]);
|
||||
r[2] = SIMD::set1_ps(p[2]);
|
||||
r[3] = SIMD::set1_ps(p[3]);
|
||||
}
|
||||
|
||||
static SIMDINLINE void vec4_set1_vps(Vec4& r, Float const& s)
|
||||
{
|
||||
r[0] = s;
|
||||
r[1] = s;
|
||||
r[2] = s;
|
||||
r[3] = s;
|
||||
}
|
||||
|
||||
static SIMDINLINE Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1)
|
||||
{
|
||||
Float tmp, r;
|
||||
r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
|
||||
|
||||
tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
|
||||
r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
|
||||
|
||||
tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
|
||||
r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static SIMDINLINE Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1)
|
||||
{
|
||||
Float tmp, r;
|
||||
r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
|
||||
|
||||
tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
|
||||
r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
|
||||
|
||||
tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
|
||||
r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
|
||||
|
||||
tmp = SIMD::mul_ps(v0[3], v1[3]); // (v0.w*v1.w)
|
||||
r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static SIMDINLINE Float vec4_rcp_length_ps(const Vec4& v)
|
||||
{
|
||||
Float length = vec4_dp4_ps(v, v);
|
||||
return SIMD::rsqrt_ps(length);
|
||||
}
|
||||
|
||||
static SIMDINLINE void vec4_normalize_ps(Vec4& r, const Vec4& v)
|
||||
{
|
||||
Float rcpLength = vec4_rcp_length_ps(v);
|
||||
|
||||
r[0] = SIMD::mul_ps(v[0], rcpLength);
|
||||
r[1] = SIMD::mul_ps(v[1], rcpLength);
|
||||
r[2] = SIMD::mul_ps(v[2], rcpLength);
|
||||
r[3] = SIMD::mul_ps(v[3], rcpLength);
|
||||
}
|
||||
|
||||
static SIMDINLINE void vec4_mul_ps(Vec4& r, const Vec4& v, Float const& s)
|
||||
{
|
||||
r[0] = SIMD::mul_ps(v[0], s);
|
||||
r[1] = SIMD::mul_ps(v[1], s);
|
||||
r[2] = SIMD::mul_ps(v[2], s);
|
||||
r[3] = SIMD::mul_ps(v[3], s);
|
||||
}
|
||||
|
||||
static SIMDINLINE void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
|
||||
{
|
||||
r[0] = SIMD::mul_ps(v0[0], v1[0]);
|
||||
r[1] = SIMD::mul_ps(v0[1], v1[1]);
|
||||
r[2] = SIMD::mul_ps(v0[2], v1[2]);
|
||||
r[3] = SIMD::mul_ps(v0[3], v1[3]);
|
||||
}
|
||||
|
||||
static SIMDINLINE void vec4_add_ps(Vec4& r, const Vec4& v0, Float const& s)
|
||||
{
|
||||
r[0] = SIMD::add_ps(v0[0], s);
|
||||
r[1] = SIMD::add_ps(v0[1], s);
|
||||
r[2] = SIMD::add_ps(v0[2], s);
|
||||
r[3] = SIMD::add_ps(v0[3], s);
|
||||
}
|
||||
|
||||
static SIMDINLINE void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
|
||||
{
|
||||
r[0] = SIMD::add_ps(v0[0], v1[0]);
|
||||
r[1] = SIMD::add_ps(v0[1], v1[1]);
|
||||
r[2] = SIMD::add_ps(v0[2], v1[2]);
|
||||
r[3] = SIMD::add_ps(v0[3], v1[3]);
|
||||
}
|
||||
|
||||
static SIMDINLINE void vec4_min_ps(Vec4& r, const Vec4& v0, Float const& s)
|
||||
{
|
||||
r[0] = SIMD::min_ps(v0[0], s);
|
||||
r[1] = SIMD::min_ps(v0[1], s);
|
||||
r[2] = SIMD::min_ps(v0[2], s);
|
||||
r[3] = SIMD::min_ps(v0[3], s);
|
||||
}
|
||||
|
||||
static SIMDINLINE void vec4_max_ps(Vec4& r, const Vec4& v0, Float const& s)
|
||||
{
|
||||
r[0] = SIMD::max_ps(v0[0], s);
|
||||
r[1] = SIMD::max_ps(v0[1], s);
|
||||
r[2] = SIMD::max_ps(v0[2], s);
|
||||
r[3] = SIMD::max_ps(v0[3], s);
|
||||
}
|
||||
|
||||
// Matrix4x4 * Vector4
|
||||
// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
|
||||
// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
|
||||
// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
|
||||
// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
|
||||
static SIMDINLINE void SIMDCALL mat4x4_vec4_multiply(Vec4& result,
|
||||
const float* pMatrix,
|
||||
const Vec4& v)
|
||||
{
|
||||
Float m;
|
||||
Float r0;
|
||||
Float r1;
|
||||
|
||||
m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0]
|
||||
r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
|
||||
m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1]
|
||||
r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
|
||||
m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2]
|
||||
r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
|
||||
m = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3]
|
||||
r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
|
||||
result[0] = r0;
|
||||
|
||||
m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0]
|
||||
r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
|
||||
m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1]
|
||||
r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
|
||||
m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2]
|
||||
r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
|
||||
m = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3]
|
||||
r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
|
||||
result[1] = r0;
|
||||
|
||||
m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0]
|
||||
r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
|
||||
m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1]
|
||||
r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
|
||||
m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2]
|
||||
r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
|
||||
m = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3]
|
||||
r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
|
||||
result[2] = r0;
|
||||
|
||||
m = SIMD::load1_ps(pMatrix + 3 * 4 + 0); // m[row][0]
|
||||
r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
|
||||
m = SIMD::load1_ps(pMatrix + 3 * 4 + 1); // m[row][1]
|
||||
r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
|
||||
m = SIMD::load1_ps(pMatrix + 3 * 4 + 2); // m[row][2]
|
||||
r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
|
||||
m = SIMD::load1_ps(pMatrix + 3 * 4 + 3); // m[row][3]
|
||||
r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
|
||||
result[3] = r0;
|
||||
}
|
||||
|
||||
// Matrix4x4 * Vector3 - Direction Vector where w = 0.
|
||||
// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
|
||||
// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
|
||||
// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
|
||||
// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
|
||||
static SIMDINLINE void SIMDCALL mat3x3_vec3_w0_multiply(Vec4& result,
|
||||
const float* pMatrix,
|
||||
const Vec4& v)
|
||||
{
|
||||
Float m;
|
||||
Float r0;
|
||||
Float r1;
|
||||
|
||||
m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0]
|
||||
r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
|
||||
m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1]
|
||||
r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
|
||||
m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2]
|
||||
r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
|
||||
result[0] = r0;
|
||||
|
||||
m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0]
|
||||
r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
|
||||
m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1]
|
||||
r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
|
||||
m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2]
|
||||
r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
|
||||
result[1] = r0;
|
||||
|
||||
m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0]
|
||||
r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
|
||||
m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1]
|
||||
r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
|
||||
m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2]
|
||||
r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
|
||||
result[2] = r0;
|
||||
|
||||
result[3] = SIMD::setzero_ps();
|
||||
}
|
||||
|
||||
// Matrix4x4 * Vector3 - Position vector where w = 1.
|
||||
// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
|
||||
// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
|
||||
// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
|
||||
// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
|
||||
static SIMDINLINE void SIMDCALL mat4x4_vec3_w1_multiply(Vec4& result,
|
||||
const float* pMatrix,
|
||||
const Vec4& v)
|
||||
{
|
||||
Float m;
|
||||
Float r0;
|
||||
Float r1;
|
||||
|
||||
m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0]
|
||||
r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
|
||||
m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1]
|
||||
r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
|
||||
m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2]
|
||||
r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
|
||||
m = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3]
|
||||
r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
|
||||
result[0] = r0;
|
||||
|
||||
m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0]
|
||||
r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
|
||||
m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1]
|
||||
r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
|
||||
m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2]
|
||||
r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
|
||||
m = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3]
|
||||
r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
|
||||
result[1] = r0;
|
||||
|
||||
m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0]
|
||||
r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
|
||||
m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1]
|
||||
r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
|
||||
m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2]
|
||||
r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
|
||||
m = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3]
|
||||
r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
|
||||
result[2] = r0;
|
||||
|
||||
m = SIMD::load1_ps(pMatrix + 3 * 4 + 0); // m[row][0]
|
||||
r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
|
||||
m = SIMD::load1_ps(pMatrix + 3 * 4 + 1); // m[row][1]
|
||||
r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
|
||||
m = SIMD::load1_ps(pMatrix + 3 * 4 + 2); // m[row][2]
|
||||
r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
|
||||
m = SIMD::load1_ps(pMatrix + 3 * 4 + 3); // m[row][3]
|
||||
result[3] = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
|
||||
}
|
||||
|
||||
static SIMDINLINE void SIMDCALL mat4x3_vec3_w1_multiply(Vec4& result,
|
||||
const float* pMatrix,
|
||||
const Vec4& v)
|
||||
{
|
||||
Float m;
|
||||
Float r0;
|
||||
Float r1;
|
||||
|
||||
m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0]
|
||||
r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
|
||||
m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1]
|
||||
r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
|
||||
m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2]
|
||||
r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
|
||||
m = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3]
|
||||
r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
|
||||
result[0] = r0;
|
||||
|
||||
m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0]
|
||||
r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
|
||||
m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1]
|
||||
r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
|
||||
m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2]
|
||||
r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
|
||||
m = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3]
|
||||
r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
|
||||
result[1] = r0;
|
||||
|
||||
m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0]
|
||||
r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
|
||||
m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1]
|
||||
r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
|
||||
m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2]
|
||||
r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
|
||||
r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
|
||||
m = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3]
|
||||
r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
|
||||
result[2] = r0;
|
||||
result[3] = SIMD::set1_ps(1.0f);
|
||||
}
|
||||
}; // struct SIMDBase
|
||||
|
||||
using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
|
||||
|
|
|
@ -222,14 +222,14 @@ SIMD_EMU_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
|
|||
//-----------------------------------------------------------------------
|
||||
// Logical operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
|
||||
SIMD_EMU_IWRAPPER_2(and_si); // return a & b (int)
|
||||
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
|
||||
SIMD_EMU_IWRAPPER_2(andnot_si); // return (~a) & b (int)
|
||||
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
|
||||
SIMD_EMU_IWRAPPER_2(or_si); // return a | b (int)
|
||||
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
|
||||
SIMD_EMU_IWRAPPER_2(xor_si); // return a ^ b (int)
|
||||
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
|
||||
SIMD_IFWRAPPER_2(and_si, _mm256_and_ps); // return a & b (int)
|
||||
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
|
||||
SIMD_IFWRAPPER_2(andnot_si, _mm256_andnot_ps); // return (~a) & b (int)
|
||||
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
|
||||
SIMD_IFWRAPPER_2(or_si, _mm256_or_ps); // return a | b (int)
|
||||
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
|
||||
SIMD_IFWRAPPER_2(xor_si, _mm256_xor_ps); // return a ^ b (int)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Shift operations
|
||||
|
|
|
@ -81,6 +81,7 @@
|
|||
return _mm256_##op(a, b, ImmT); \
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Floating point arithmetic operations
|
||||
//-----------------------------------------------------------------------
|
||||
|
@ -116,7 +117,14 @@ SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
|
|||
//-----------------------------------------------------------------------
|
||||
// Logical operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_IWRAPPER_2_(and_si, and_si256); // return a & b (int)
|
||||
#if _MSC_VER >= 1920 // && _MSC_FULL_VER < [some_fixed_version]
|
||||
// Some versions of MSVC 2019 don't handle constant folding of and_si() correctly.
|
||||
// Using and_ps instead inhibits the compiler's constant folding and actually issues
|
||||
// the and intrinsic even though both inputs are constant values.
|
||||
#else
|
||||
// Use native integer and intrinsic
|
||||
SIMD_IWRAPPER_2_(and_si, and_si256); // return a & b (int)
|
||||
#endif
|
||||
SIMD_IWRAPPER_2_(andnot_si, andnot_si256); // return (~a) & b (int)
|
||||
SIMD_IWRAPPER_2_(or_si, or_si256); // return a | b (int)
|
||||
SIMD_IWRAPPER_2_(xor_si, xor_si256); // return a ^ b (int)
|
||||
|
@ -213,6 +221,10 @@ static SIMDINLINE Float SIMDCALL
|
|||
return _mm256_i32gather_ps(p, idx, static_cast<int>(ScaleT));
|
||||
}
|
||||
|
||||
#if _MSC_VER == 1920 // && _MSC_FULL_VER < [some_fixed_version]
|
||||
// Don't use _mm256_mask_i32gather_ps(), the compiler doesn't preserve the mask register
|
||||
// correctly in early versions of MSVC 2019
|
||||
#else
|
||||
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
|
||||
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
|
@ -222,6 +234,7 @@ static SIMDINLINE Float SIMDCALL
|
|||
// Only for this intrinsic - not sure why. :(
|
||||
return _mm256_mask_i32gather_ps(old.v, p, idx.v, mask.v, static_cast<int>(ScaleT));
|
||||
}
|
||||
#endif
|
||||
|
||||
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a)
|
||||
{
|
||||
|
|
|
@ -328,101 +328,5 @@ struct SIMD256 // or SIMD4 or SIMD16
|
|||
//=======================================================================
|
||||
// Advanced masking interface (currently available only in SIMD16 width)
|
||||
//=======================================================================
|
||||
|
||||
|
||||
//=======================================================================
|
||||
// Extended Utility Functions (common to SIMD256 and SIMD16)
|
||||
//=======================================================================
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Extended Types
|
||||
//-----------------------------------------------------------------------
|
||||
|
||||
// Vec4, an SOA SIMD set of 4-dimensional vectors
|
||||
union Vec4
|
||||
{
|
||||
Vec4() = default;
|
||||
Vec4(Float in)
|
||||
{
|
||||
s.x = in;
|
||||
s.y = in;
|
||||
s.z = in;
|
||||
s.w = in;
|
||||
}
|
||||
Vec4(Float x, Float y, Float z, Float w)
|
||||
{
|
||||
s.x = x;
|
||||
s.y = y;
|
||||
s.z = z;
|
||||
s.w = w;
|
||||
}
|
||||
|
||||
Float v[4];
|
||||
Integer vi[4];
|
||||
struct
|
||||
{
|
||||
Float x;
|
||||
Float y;
|
||||
Float z;
|
||||
Float w;
|
||||
} s;
|
||||
Float& operator[] (const int i) { return v[i]; }
|
||||
Float const & operator[] (const int i) const { return v[i]; }
|
||||
};
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Extended Functions
|
||||
//-----------------------------------------------------------------------
|
||||
static void vec4_set1_ps(Vec4& r, const float *p); // r[0] = set1(p[0]), r[1] = set1(p[1]), ...
|
||||
static void vec4_set1_vps(Vec4& r, Float s); // r[0] = s, r[1] = s, ...
|
||||
static Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1); // return dp3(v0, v1)
|
||||
static Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1); // return dp4(v0, v1)
|
||||
static Float vec4_rcp_length_ps(const Vec4& v); // return 1.0f / sqrt(dp4(v, v))
|
||||
static void vec4_normalize_ps(Vec4& r, const Vec4& v); // r = v * rcp_length(v)
|
||||
static void vec4_mul_ps(Vec4& r, const Vec4& v, Float s); // r = v * set1_vps(s)
|
||||
static void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1); // r = v0 * v1
|
||||
static void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1); // r = v0 + v1
|
||||
static void vec4_min_ps(Vec4& r, const Vec4& v0, Float s); // r = (v0 < s) ? v0 : s
|
||||
static void vec4_max_ps(Vec4& r, const Vec4& v0, Float s); // r = (v0 > s) ? v0 : s
|
||||
|
||||
// Matrix4x4 * Vector4
|
||||
// result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * v.s.w)
|
||||
// result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * v.s.w)
|
||||
// result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * v.s.w)
|
||||
// result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * v.s.w)
|
||||
static void mat4x4_vec4_multiply(
|
||||
Vec4& result,
|
||||
const float *pMatrix,
|
||||
const Vec4& v);
|
||||
|
||||
// Matrix4x4 * Vector3 - Direction Vector where w = 0.
|
||||
// result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 0)
|
||||
// result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 0)
|
||||
// result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 0)
|
||||
// result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 0)
|
||||
static void mat3x3_vec3_w0_multiply(
|
||||
Vec4& result,
|
||||
const float *pMatrix,
|
||||
const Vec4& v);
|
||||
|
||||
// Matrix4x4 * Vector3 - Position vector where w = 1.
|
||||
// result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
|
||||
// result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
|
||||
// result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
|
||||
// result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 1)
|
||||
static void mat4x4_vec3_w1_multiply(
|
||||
Vec4& result,
|
||||
const float *pMatrix,
|
||||
const Vec4& v);
|
||||
|
||||
// Matrix4x3 * Vector3 - Position vector where w = 1.
|
||||
// result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
|
||||
// result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
|
||||
// result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
|
||||
// result.s.w = 1
|
||||
static void mat4x3_vec3_w1_multiply(
|
||||
Vec4& result,
|
||||
const float *pMatrix,
|
||||
const Vec4& v);
|
||||
};
|
||||
#endif // #if 0
|
||||
|
|
|
@ -315,7 +315,7 @@ namespace SIMDImpl
|
|||
|
||||
namespace SIMD512Impl
|
||||
{
|
||||
#if !(defined(__AVX512F__) || defined(_MM_K0_REG))
|
||||
#if !(defined(__AVX512F__) || defined(_ZMMINTRIN_H_INCLUDED))
|
||||
// Define AVX512 types if not included via immintrin.h.
|
||||
// All data members of these types are ONLY to viewed
|
||||
// in a debugger. Do NOT access them via code!
|
||||
|
|
|
@ -1551,6 +1551,7 @@ void SwrDispatch(HANDLE hContext,
|
|||
pTaskData->threadGroupCountX = threadGroupCountX;
|
||||
pTaskData->threadGroupCountY = threadGroupCountY;
|
||||
pTaskData->threadGroupCountZ = threadGroupCountZ;
|
||||
pTaskData->enableThreadDispatch = false;
|
||||
|
||||
uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
|
||||
uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
|
||||
|
|
|
@ -588,6 +588,7 @@ SWR_FUNC(void,
|
|||
uint32_t threadGroupCountY,
|
||||
uint32_t threadGroupCountZ);
|
||||
|
||||
|
||||
/// @note this enum needs to be kept in sync with HOTTILE_STATE!
|
||||
enum SWR_TILE_STATE
|
||||
{
|
||||
|
|
|
@ -140,6 +140,7 @@ struct COMPUTE_DESC
|
|||
uint32_t threadGroupCountX;
|
||||
uint32_t threadGroupCountY;
|
||||
uint32_t threadGroupCountZ;
|
||||
bool enableThreadDispatch;
|
||||
};
|
||||
|
||||
typedef void (*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC,
|
||||
|
|
|
@ -1447,7 +1447,7 @@ struct PA_TESS : PA_STATE
|
|||
SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
|
||||
if (!m_SOA)
|
||||
{
|
||||
indices = _simd16_mul_epi32(indices, _simd16_set1_epi32(vertexStride / 4));
|
||||
indices = _simd16_mullo_epi32(indices, _simd16_set1_epi32(vertexStride / 4));
|
||||
}
|
||||
#else
|
||||
SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
|
||||
|
|
|
@ -584,7 +584,7 @@ struct JitCacheFileHeader
|
|||
uint64_t GetObjectCRC() const { return m_objCRC; }
|
||||
|
||||
private:
|
||||
static const uint64_t JC_MAGIC_NUMBER = 0xfedcba9876543210ULL + 6;
|
||||
static const uint64_t JC_MAGIC_NUMBER = 0xfedcba9876543210ULL + 7;
|
||||
static const size_t JC_STR_MAX_LEN = 32;
|
||||
static const uint32_t JC_PLATFORM_KEY = (LLVM_VERSION_MAJOR << 24) |
|
||||
(LLVM_VERSION_MINOR << 16) | (LLVM_VERSION_PATCH << 8) |
|
||||
|
|
|
@ -61,6 +61,7 @@ namespace SwrJit
|
|||
mInt16PtrTy = PointerType::get(mInt16Ty, 0);
|
||||
mInt32PtrTy = PointerType::get(mInt32Ty, 0);
|
||||
mInt64PtrTy = PointerType::get(mInt64Ty, 0);
|
||||
mHandleTy = mInt8PtrTy;
|
||||
|
||||
mSimd4FP64Ty = VectorType::get(mDoubleTy, 4);
|
||||
|
||||
|
|
|
@ -78,6 +78,7 @@ namespace SwrJit
|
|||
// Built in types: scalar
|
||||
|
||||
Type* mVoidTy;
|
||||
Type* mHandleTy;
|
||||
Type* mInt1Ty;
|
||||
Type* mInt8Ty;
|
||||
Type* mInt16Ty;
|
||||
|
|
|
@ -237,6 +237,11 @@ namespace SwrJit
|
|||
return MASKED_GATHER(pVecSrcPtr, 4, pVecMask, pVecPassthru);
|
||||
}
|
||||
|
||||
void Builder::SCATTER_PTR(Value* pVecDstPtr, Value* pVecSrc, Value* pVecMask)
|
||||
{
|
||||
MASKED_SCATTER(pVecSrc, pVecDstPtr, 4, pVecMask);
|
||||
}
|
||||
|
||||
void Builder::Gather4(const SWR_FORMAT format,
|
||||
Value* pSrcBase,
|
||||
Value* byteOffsets,
|
||||
|
|
|
@ -148,6 +148,7 @@ void GATHER4DD(const SWR_FORMAT_INFO& info,
|
|||
Value* GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1);
|
||||
|
||||
Value* GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru);
|
||||
void SCATTER_PTR(Value* pVecDstPtr, Value* pVecSrc, Value* pVecMask);
|
||||
|
||||
virtual void SCATTERPS(Value* pDst,
|
||||
Value* vSrc,
|
||||
|
|
|
@ -170,6 +170,16 @@ namespace SwrJit
|
|||
return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
|
||||
}
|
||||
|
||||
Value* Builder::VIMMED1(uint64_t i)
|
||||
{
|
||||
return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
|
||||
}
|
||||
|
||||
Value* Builder::VIMMED1_16(uint64_t i)
|
||||
{
|
||||
return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
|
||||
}
|
||||
|
||||
Value* Builder::VIMMED1(int i)
|
||||
{
|
||||
return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
|
||||
|
|
|
@ -71,6 +71,9 @@ Constant* CInc(uint32_t base, uint32_t count)
|
|||
|
||||
Constant* PRED(bool pred);
|
||||
|
||||
Value* VIMMED1(uint64_t i);
|
||||
Value* VIMMED1_16(uint64_t i);
|
||||
|
||||
Value* VIMMED1(int i);
|
||||
Value* VIMMED1_16(int i);
|
||||
|
||||
|
|
Loading…
Reference in New Issue