gallium/swr: add OpenSWR rasterizer

Acked-by: Roland Scheidegger <sroland@vmware.com>
Acked-by: Jose Fonseca <jfonseca@vmware.com>
Tim Rowley 2016-02-16 17:28:09 -06:00
parent 2b2d3680bf
commit c6e67f5a93
88 changed files with 48234 additions and 0 deletions


@@ -0,0 +1,208 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#ifndef SWRLIB_CONTAINERS_HPP__
#define SWRLIB_CONTAINERS_HPP__
#include <functional>
#include "common/os.h"
namespace SWRL
{
template <typename T, int NUM_ELEMENTS>
struct UncheckedFixedVector
{
UncheckedFixedVector() : mSize(0)
{
}
UncheckedFixedVector(std::size_t size, T const& exemplar)
{
this->mSize = 0;
for (std::size_t i = 0; i < size; ++i)
this->push_back(exemplar);
}
template <typename Iter>
UncheckedFixedVector(Iter fst, Iter lst)
{
this->mSize = 0;
for ( ; fst != lst; ++fst)
this->push_back(*fst);
}
UncheckedFixedVector(UncheckedFixedVector const& UFV)
{
this->mSize = 0;
for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
(*this)[i] = UFV[i];
this->mSize = UFV.size();
}
UncheckedFixedVector& operator=(UncheckedFixedVector const& UFV)
{
for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
(*this)[i] = UFV[i];
this->mSize = UFV.size();
return *this;
}
T* begin() { return &this->mElements[0]; }
T* end() { return &this->mElements[0] + this->mSize; }
T const* begin() const { return &this->mElements[0]; }
T const* end() const { return &this->mElements[0] + this->mSize; }
friend bool operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R)
{
if (L.size() != R.size()) return false;
for (std::size_t i = 0, N = L.size(); i < N; ++i)
{
if (L[i] != R[i]) return false;
}
return true;
}
friend bool operator!=(UncheckedFixedVector const& L, UncheckedFixedVector const& R)
{
if (L.size() != R.size()) return true;
for (std::size_t i = 0, N = L.size(); i < N; ++i)
{
if (L[i] != R[i]) return true;
}
return false;
}
T& operator[](std::size_t idx)
{
return this->mElements[idx];
}
T const& operator[](std::size_t idx) const
{
return this->mElements[idx];
}
void push_back(T const& t)
{
this->mElements[this->mSize] = t;
++this->mSize;
}
void pop_back()
{
SWR_ASSERT(this->mSize > 0);
--this->mSize;
}
T& back()
{
return this->mElements[this->mSize-1];
}
T const& back() const
{
return this->mElements[this->mSize-1];
}
bool empty() const
{
return this->mSize == 0;
}
std::size_t size() const
{
return this->mSize;
}
void resize(std::size_t sz)
{
this->mSize = sz;
}
void clear()
{
this->resize(0);
}
private:
std::size_t mSize;
T mElements[NUM_ELEMENTS];
};
template <typename T, int NUM_ELEMENTS>
struct FixedStack : UncheckedFixedVector<T, NUM_ELEMENTS>
{
FixedStack() {}
void push(T const& t)
{
this->push_back(t);
}
void pop()
{
this->pop_back();
}
T& top()
{
return this->back();
}
T const& top() const
{
return this->back();
}
};
template <typename T>
struct CRCHash
{
static_assert((sizeof(T) % sizeof(UINT)) == 0, "CRCHash expects the templated type's size to be a multiple of 4 bytes");
UINT operator()(const T& k) const
{
UINT *pData = (UINT*)&k;
UINT crc = 0;
for (UINT i = 0; i < sizeof(T) / sizeof(UINT); ++i)
{
crc = _mm_crc32_u32(crc, pData[i]);
}
return crc;
}
};
}// end SWRL
namespace std
{
template <typename T, int N>
struct hash<SWRL::UncheckedFixedVector<T, N>>
{
size_t operator() (SWRL::UncheckedFixedVector<T, N> const& v) const
{
if (v.size() == 0) return 0;
std::hash<T> H;
size_t x = H(v[0]);
if (v.size() == 1) return x;
for (size_t i = 1; i < v.size(); ++i)
x ^= H(v[i]) + 0x9e3779b9 + (x<<6) + (x>>2);
return x;
}
};
}// end std.
#endif//SWRLIB_CONTAINERS_HPP__
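Usage sketch (editor's illustration, not part of the commit; assumes the header above is included and the function name is hypothetical): FixedStack gives a no-allocation stack, and the std::hash specialization lets UncheckedFixedVector serve as an unordered_map key.
#include <unordered_map>
void containers_example()
{
    SWRL::FixedStack<int, 8> stack;
    stack.push(1);
    stack.push(2);
    int top = stack.top(); // == 2
    stack.pop();
    // UncheckedFixedVector works as a map key: the friend operator== supplies
    // equality and std::hash<SWRL::UncheckedFixedVector<T, N>> supplies the hash.
    SWRL::UncheckedFixedVector<int, 4> key;
    key.push_back(top);
    std::unordered_map<SWRL::UncheckedFixedVector<int, 4>, int> cache;
    cache[key] = 42;
}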

File diff suppressed because it is too large.


@@ -0,0 +1,251 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file formats.h
*
* @brief auto-generated file
*
* DO NOT EDIT
*
******************************************************************************/
#pragma once
#include "common/os.h"
//////////////////////////////////////////////////////////////////////////
/// SWR_TYPE - Format component type
//////////////////////////////////////////////////////////////////////////
enum SWR_TYPE
{
SWR_TYPE_UNKNOWN,
SWR_TYPE_UNUSED,
SWR_TYPE_UNORM,
SWR_TYPE_SNORM,
SWR_TYPE_UINT,
SWR_TYPE_SINT,
SWR_TYPE_FLOAT,
SWR_TYPE_SSCALED,
SWR_TYPE_USCALED,
};
//////////////////////////////////////////////////////////////////////////
/// SWR_FORMAT
//////////////////////////////////////////////////////////////////////////
enum SWR_FORMAT
{
R32G32B32A32_FLOAT = 0x0,
R32G32B32A32_SINT = 0x1,
R32G32B32A32_UINT = 0x2,
R32G32B32X32_FLOAT = 0x6,
R32G32B32A32_SSCALED = 0x7,
R32G32B32A32_USCALED = 0x8,
R32G32B32_FLOAT = 0x40,
R32G32B32_SINT = 0x41,
R32G32B32_UINT = 0x42,
R32G32B32_SSCALED = 0x45,
R32G32B32_USCALED = 0x46,
R16G16B16A16_UNORM = 0x80,
R16G16B16A16_SNORM = 0x81,
R16G16B16A16_SINT = 0x82,
R16G16B16A16_UINT = 0x83,
R16G16B16A16_FLOAT = 0x84,
R32G32_FLOAT = 0x85,
R32G32_SINT = 0x86,
R32G32_UINT = 0x87,
R32_FLOAT_X8X24_TYPELESS = 0x88,
X32_TYPELESS_G8X24_UINT = 0x89,
L32A32_FLOAT = 0x8A,
R16G16B16X16_UNORM = 0x8E,
R16G16B16X16_FLOAT = 0x8F,
L32X32_FLOAT = 0x91,
I32X32_FLOAT = 0x92,
R16G16B16A16_SSCALED = 0x93,
R16G16B16A16_USCALED = 0x94,
R32G32_SSCALED = 0x95,
R32G32_USCALED = 0x96,
R32_FLOAT_X8X24_TYPELESS_LD = 0x98,
B8G8R8A8_UNORM = 0xC0,
B8G8R8A8_UNORM_SRGB = 0xC1,
R10G10B10A2_UNORM = 0xC2,
R10G10B10A2_UNORM_SRGB = 0xC3,
R10G10B10A2_UINT = 0xC4,
R8G8B8A8_UNORM = 0xC7,
R8G8B8A8_UNORM_SRGB = 0xC8,
R8G8B8A8_SNORM = 0xC9,
R8G8B8A8_SINT = 0xCA,
R8G8B8A8_UINT = 0xCB,
R16G16_UNORM = 0xCC,
R16G16_SNORM = 0xCD,
R16G16_SINT = 0xCE,
R16G16_UINT = 0xCF,
R16G16_FLOAT = 0xD0,
B10G10R10A2_UNORM = 0xD1,
B10G10R10A2_UNORM_SRGB = 0xD2,
R11G11B10_FLOAT = 0xD3,
R32_SINT = 0xD6,
R32_UINT = 0xD7,
R32_FLOAT = 0xD8,
R24_UNORM_X8_TYPELESS = 0xD9,
R24_UNORM_X8_TYPELESS_LD = 0xDC,
L32_UNORM = 0xDD,
L16A16_UNORM = 0xDF,
I24X8_UNORM = 0xE0,
L24X8_UNORM = 0xE1,
I32_FLOAT = 0xE3,
L32_FLOAT = 0xE4,
A32_FLOAT = 0xE5,
B8G8R8X8_UNORM = 0xE9,
B8G8R8X8_UNORM_SRGB = 0xEA,
R8G8B8X8_UNORM = 0xEB,
R8G8B8X8_UNORM_SRGB = 0xEC,
R9G9B9E5_SHAREDEXP = 0xED,
B10G10R10X2_UNORM = 0xEE,
L16A16_FLOAT = 0xF0,
R10G10B10X2_USCALED = 0xF3,
R8G8B8A8_SSCALED = 0xF4,
R8G8B8A8_USCALED = 0xF5,
R16G16_SSCALED = 0xF6,
R16G16_USCALED = 0xF7,
R32_SSCALED = 0xF8,
R32_USCALED = 0xF9,
B5G6R5_UNORM = 0x100,
B5G6R5_UNORM_SRGB = 0x101,
B5G5R5A1_UNORM = 0x102,
B5G5R5A1_UNORM_SRGB = 0x103,
B4G4R4A4_UNORM = 0x104,
B4G4R4A4_UNORM_SRGB = 0x105,
R8G8_UNORM = 0x106,
R8G8_SNORM = 0x107,
R8G8_SINT = 0x108,
R8G8_UINT = 0x109,
R16_UNORM = 0x10A,
R16_SNORM = 0x10B,
R16_SINT = 0x10C,
R16_UINT = 0x10D,
R16_FLOAT = 0x10E,
I16_UNORM = 0x111,
L16_UNORM = 0x112,
A16_UNORM = 0x113,
L8A8_UNORM = 0x114,
I16_FLOAT = 0x115,
L16_FLOAT = 0x116,
A16_FLOAT = 0x117,
L8A8_UNORM_SRGB = 0x118,
B5G5R5X1_UNORM = 0x11A,
B5G5R5X1_UNORM_SRGB = 0x11B,
R8G8_SSCALED = 0x11C,
R8G8_USCALED = 0x11D,
R16_SSCALED = 0x11E,
R16_USCALED = 0x11F,
L8A8_UINT = 0x126,
L8A8_SINT = 0x127,
R8_UNORM = 0x140,
R8_SNORM = 0x141,
R8_SINT = 0x142,
R8_UINT = 0x143,
A8_UNORM = 0x144,
I8_UNORM = 0x145,
L8_UNORM = 0x146,
R8_SSCALED = 0x149,
R8_USCALED = 0x14A,
L8_UNORM_SRGB = 0x14C,
L8_UINT = 0x152,
L8_SINT = 0x153,
I8_UINT = 0x154,
I8_SINT = 0x155,
YCRCB_SWAPUVY = 0x183,
BC1_UNORM = 0x186,
BC2_UNORM = 0x187,
BC3_UNORM = 0x188,
BC4_UNORM = 0x189,
BC5_UNORM = 0x18A,
BC1_UNORM_SRGB = 0x18B,
BC2_UNORM_SRGB = 0x18C,
BC3_UNORM_SRGB = 0x18D,
YCRCB_SWAPUV = 0x18F,
R8G8B8_UNORM = 0x193,
R8G8B8_SNORM = 0x194,
R8G8B8_SSCALED = 0x195,
R8G8B8_USCALED = 0x196,
BC4_SNORM = 0x199,
BC5_SNORM = 0x19A,
R16G16B16_FLOAT = 0x19B,
R16G16B16_UNORM = 0x19C,
R16G16B16_SNORM = 0x19D,
R16G16B16_SSCALED = 0x19E,
R16G16B16_USCALED = 0x19F,
BC6H_SF16 = 0x1A1,
BC7_UNORM = 0x1A2,
BC7_UNORM_SRGB = 0x1A3,
BC6H_UF16 = 0x1A4,
R8G8B8_UNORM_SRGB = 0x1A8,
R16G16B16_UINT = 0x1B0,
R16G16B16_SINT = 0x1B1,
R10G10B10A2_SNORM = 0x1B3,
R10G10B10A2_USCALED = 0x1B4,
R10G10B10A2_SSCALED = 0x1B5,
R10G10B10A2_SINT = 0x1B6,
B10G10R10A2_SNORM = 0x1B7,
B10G10R10A2_USCALED = 0x1B8,
B10G10R10A2_SSCALED = 0x1B9,
B10G10R10A2_UINT = 0x1BA,
B10G10R10A2_SINT = 0x1BB,
R8G8B8_UINT = 0x1C8,
R8G8B8_SINT = 0x1C9,
NUM_SWR_FORMATS = 0x1CA,
};
//////////////////////////////////////////////////////////////////////////
/// SWR_FORMAT_INFO - Format information
//////////////////////////////////////////////////////////////////////////
struct SWR_FORMAT_INFO
{
const char* name;
SWR_TYPE type[4];
uint32_t defaults[4];
uint32_t swizzle[4]; ///< swizzle per component
uint32_t bpc[4]; ///< bits per component
uint32_t bpp; ///< bits per pixel
uint32_t Bpp; ///< bytes per pixel
uint32_t numComps; ///< number of components
bool isSRGB;
bool isBC;
bool isSubsampled;
bool isNormalized[4];
float toFloat[4];
uint32_t bcWidth;
uint32_t bcHeight;
bool isLuminance;
};
extern const SWR_FORMAT_INFO gFormatInfo[];
//////////////////////////////////////////////////////////////////////////
/// @brief Retrieves format info struct for given format.
/// @param format - SWR format
INLINE const SWR_FORMAT_INFO& GetFormatInfo(SWR_FORMAT format)
{
return gFormatInfo[format];
}
// lookup table for unorm8 srgb -> float conversion
extern const uint32_t srgb8Table[256];
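A minimal lookup sketch (editor's illustration; relies only on the declarations above, and the function name is hypothetical):
// Query per-format properties for a render target format.
void format_info_example()
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(R8G8B8A8_UNORM);
    uint32_t bytesPerPixel = info.Bpp;          // 4 bytes for R8G8B8A8_UNORM
    bool fourComponents = (info.numComps == 4); // R, G, B, A
    (void)bytesPerPixel;
    (void)fourComponents;
}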


@@ -0,0 +1,235 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once
#include <iostream>
#include <vector>
#include <bitset>
#include <array>
#include <string>
#include <algorithm>
#if defined(_WIN32)
#include <intrin.h>
#else
#include <string.h>
#include <cpuid.h>
#endif
class InstructionSet
{
public:
InstructionSet() : CPU_Rep() {};
// getters
std::string Vendor(void) { return CPU_Rep.vendor_; }
std::string Brand(void) { return CPU_Rep.brand_; }
bool SSE3(void) { return CPU_Rep.f_1_ECX_[0]; }
bool PCLMULQDQ(void) { return CPU_Rep.f_1_ECX_[1]; }
bool MONITOR(void) { return CPU_Rep.f_1_ECX_[3]; }
bool SSSE3(void) { return CPU_Rep.f_1_ECX_[9]; }
bool FMA(void) { return CPU_Rep.f_1_ECX_[12]; }
bool CMPXCHG16B(void) { return CPU_Rep.f_1_ECX_[13]; }
bool SSE41(void) { return CPU_Rep.f_1_ECX_[19]; }
bool SSE42(void) { return CPU_Rep.f_1_ECX_[20]; }
bool MOVBE(void) { return CPU_Rep.f_1_ECX_[22]; }
bool POPCNT(void) { return CPU_Rep.f_1_ECX_[23]; }
bool AES(void) { return CPU_Rep.f_1_ECX_[25]; }
bool XSAVE(void) { return CPU_Rep.f_1_ECX_[26]; }
bool OSXSAVE(void) { return CPU_Rep.f_1_ECX_[27]; }
bool RDRAND(void) { return CPU_Rep.f_1_ECX_[30]; }
bool MSR(void) { return CPU_Rep.f_1_EDX_[5]; }
bool CX8(void) { return CPU_Rep.f_1_EDX_[8]; }
bool SEP(void) { return CPU_Rep.f_1_EDX_[11]; }
bool CMOV(void) { return CPU_Rep.f_1_EDX_[15]; }
bool CLFSH(void) { return CPU_Rep.f_1_EDX_[19]; }
bool MMX(void) { return CPU_Rep.f_1_EDX_[23]; }
bool FXSR(void) { return CPU_Rep.f_1_EDX_[24]; }
bool SSE(void) { return CPU_Rep.f_1_EDX_[25]; }
bool SSE2(void) { return CPU_Rep.f_1_EDX_[26]; }
bool FSGSBASE(void) { return CPU_Rep.f_7_EBX_[0]; }
bool BMI1(void) { return CPU_Rep.f_7_EBX_[3]; }
bool HLE(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[4]; }
bool BMI2(void) { return CPU_Rep.f_7_EBX_[8]; }
bool ERMS(void) { return CPU_Rep.f_7_EBX_[9]; }
bool INVPCID(void) { return CPU_Rep.f_7_EBX_[10]; }
bool RTM(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[11]; }
bool RDSEED(void) { return CPU_Rep.f_7_EBX_[18]; }
bool ADX(void) { return CPU_Rep.f_7_EBX_[19]; }
bool SHA(void) { return CPU_Rep.f_7_EBX_[29]; }
bool PREFETCHWT1(void) { return CPU_Rep.f_7_ECX_[0]; }
bool LAHF(void) { return CPU_Rep.f_81_ECX_[0]; }
bool LZCNT(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_ECX_[5]; }
bool ABM(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[5]; }
bool SSE4a(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[6]; }
bool XOP(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[11]; }
bool TBM(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[21]; }
bool SYSCALL(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[11]; }
bool MMXEXT(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[22]; }
bool RDTSCP(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[27]; }
bool _3DNOWEXT(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[30]; }
bool _3DNOW(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[31]; }
bool AVX(void) { return CPU_Rep.f_1_ECX_[28]; }
bool F16C(void) { return CPU_Rep.f_1_ECX_[29]; }
bool AVX2(void) { return CPU_Rep.f_7_EBX_[5]; }
bool AVX512F(void) { return CPU_Rep.f_7_EBX_[16]; }
bool AVX512PF(void) { return CPU_Rep.f_7_EBX_[26]; }
bool AVX512ER(void) { return CPU_Rep.f_7_EBX_[27]; }
bool AVX512CD(void) { return CPU_Rep.f_7_EBX_[28]; }
private:
class InstructionSet_Internal
{
public:
InstructionSet_Internal()
: nIds_{ 0 },
nExIds_{ 0 },
isIntel_{ false },
isAMD_{ false },
f_1_ECX_{ 0 },
f_1_EDX_{ 0 },
f_7_EBX_{ 0 },
f_7_ECX_{ 0 },
f_81_ECX_{ 0 },
f_81_EDX_{ 0 },
data_{},
extdata_{}
{
//int cpuInfo[4] = {-1};
std::array<int, 4> cpui;
// Calling __cpuid with 0x0 as the function_id argument
// gets the number of the highest valid function ID.
#if defined(_WIN32)
__cpuid(cpui.data(), 0);
nIds_ = cpui[0];
#else
nIds_ = __get_cpuid_max(0, NULL);
#endif
for (int i = 0; i <= nIds_; ++i)
{
#if defined(_WIN32)
__cpuidex(cpui.data(), i, 0);
#else
int *data = cpui.data();
__cpuid_count(i, 0, data[0], data[1], data[2], data[3]);
#endif
data_.push_back(cpui);
}
// Capture vendor string
char vendor[0x20];
memset(vendor, 0, sizeof(vendor));
*reinterpret_cast<int*>(vendor) = data_[0][1];
*reinterpret_cast<int*>(vendor + 4) = data_[0][3];
*reinterpret_cast<int*>(vendor + 8) = data_[0][2];
vendor_ = vendor;
if (vendor_ == "GenuineIntel")
{
isIntel_ = true;
}
else if (vendor_ == "AuthenticAMD")
{
isAMD_ = true;
}
// load bitset with flags for function 0x00000001
if (nIds_ >= 1)
{
f_1_ECX_ = data_[1][2];
f_1_EDX_ = data_[1][3];
}
// load bitset with flags for function 0x00000007
if (nIds_ >= 7)
{
f_7_EBX_ = data_[7][1];
f_7_ECX_ = data_[7][2];
}
// Calling __cpuid with 0x80000000 as the function_id argument
// gets the number of the highest valid extended ID.
#if defined(_WIN32)
__cpuid(cpui.data(), 0x80000000);
nExIds_ = cpui[0];
#else
nExIds_ = __get_cpuid_max(0x80000000, NULL);
#endif
char brand[0x40];
memset(brand, 0, sizeof(brand));
for (unsigned i = 0x80000000; i <= nExIds_; ++i)
{
#if defined(_WIN32)
__cpuidex(cpui.data(), i, 0);
#else
int *data = cpui.data();
__cpuid_count(i, 0, data[0], data[1], data[2], data[3]);
#endif
extdata_.push_back(cpui);
}
// load bitset with flags for function 0x80000001
if (nExIds_ >= 0x80000001)
{
f_81_ECX_ = extdata_[1][2];
f_81_EDX_ = extdata_[1][3];
}
// Interpret CPU brand string if reported
if (nExIds_ >= 0x80000004)
{
memcpy(brand, extdata_[2].data(), sizeof(cpui));
memcpy(brand + 16, extdata_[3].data(), sizeof(cpui));
memcpy(brand + 32, extdata_[4].data(), sizeof(cpui));
brand_ = brand;
}
};
int nIds_;
unsigned nExIds_;
std::string vendor_;
std::string brand_;
bool isIntel_;
bool isAMD_;
std::bitset<32> f_1_ECX_;
std::bitset<32> f_1_EDX_;
std::bitset<32> f_7_EBX_;
std::bitset<32> f_7_ECX_;
std::bitset<32> f_81_ECX_;
std::bitset<32> f_81_EDX_;
std::vector<std::array<int, 4>> data_;
std::vector<std::array<int, 4>> extdata_;
};
const InstructionSet_Internal CPU_Rep;
};
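Usage sketch (editor's illustration; function name is hypothetical): callers can gate code paths on these queries, e.g. falling back to split 128-bit emulation of 256-bit integer ops when AVX2 is absent, as simdintrin.h below does.
#include <cstdio>
void report_simd_support()
{
    InstructionSet cpu;
    printf("CPU: %s %s\n", cpu.Vendor().c_str(), cpu.Brand().c_str());
    if (cpu.AVX2())
        printf("AVX2: native 256-bit integer SIMD available\n");
    else if (cpu.AVX())
        printf("AVX only: 256-bit integer ops must be emulated in 128-bit halves\n");
}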


@@ -0,0 +1,221 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#ifndef __SWR_OS_H__
#define __SWR_OS_H__
#include "core/knobs.h"
#if (defined(FORCE_WINDOWS) || defined(_WIN32)) && !defined(FORCE_LINUX)
#define SWR_API __cdecl
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS
#endif
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include "Windows.h"
#include <intrin.h>
#include <cstdint>
#define OSALIGN(RWORD, WIDTH) __declspec(align(WIDTH)) RWORD
#define THREAD __declspec(thread)
#define INLINE __forceinline
#define DEBUGBREAK __debugbreak()
#define PRAGMA_WARNING_PUSH_DISABLE(...) \
__pragma(warning(push));\
__pragma(warning(disable:__VA_ARGS__));
#define PRAGMA_WARNING_POP() __pragma(warning(pop))
#if defined(_WIN32)
#if defined(_WIN64)
#define BitScanForwardSizeT BitScanForward64
#define _mm_popcount_sizeT _mm_popcnt_u64
#else
#define BitScanForwardSizeT BitScanForward
#define _mm_popcount_sizeT _mm_popcnt_u32
#endif
#endif
#elif defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
#define SWR_API
#include <stdlib.h>
#include <string.h>
#include <X11/Xmd.h>
#include <x86intrin.h>
#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/stat.h>
typedef void VOID;
typedef void* LPVOID;
typedef CARD8 BOOL;
typedef wchar_t WCHAR;
typedef uint16_t UINT16;
typedef int INT;
typedef int INT32;
typedef unsigned int UINT;
typedef uint32_t UINT32;
typedef uint64_t UINT64;
typedef int64_t INT64;
typedef void* HANDLE;
typedef float FLOAT;
typedef int LONG;
typedef CARD8 BYTE;
typedef unsigned char UCHAR;
typedef unsigned int DWORD;
#undef FALSE
#define FALSE 0
#undef TRUE
#define TRUE 1
#define OSALIGN(RWORD, WIDTH) RWORD __attribute__((aligned(WIDTH)))
#define THREAD __thread
#ifndef INLINE
#define INLINE __inline
#endif
#define DEBUGBREAK asm ("int $3")
#define __cdecl
#define __declspec(X)
#define GCC_VERSION (__GNUC__ * 10000 \
+ __GNUC_MINOR__ * 100 \
+ __GNUC_PATCHLEVEL__)
#if !defined(__clang__) && (__GNUC__) && (GCC_VERSION < 40500)
inline
uint64_t __rdtsc()
{
uint32_t low, high;
asm volatile("rdtsc" : "=a"(low), "=d"(high));
return ((uint64_t)high << 32) | low;
}
#endif
#ifndef __clang__
// Intrinsic not defined in gcc
static INLINE
void _mm256_storeu2_m128i(__m128i *hi, __m128i *lo, __m256i a)
{
_mm_storeu_si128((__m128i*)lo, _mm256_castsi256_si128(a));
_mm_storeu_si128((__m128i*)hi, _mm256_extractf128_si256(a, 0x1));
}
#endif
inline
unsigned char _BitScanForward(unsigned long *Index, unsigned long Mask)
{
*Index = __builtin_ctzl(Mask);
return (Mask != 0);
}
inline
unsigned char _BitScanForward(unsigned int *Index, unsigned int Mask)
{
*Index = __builtin_ctz(Mask);
return (Mask != 0);
}
inline
unsigned char _BitScanReverse(unsigned long *Index, unsigned long Mask)
{
*Index = (8 * sizeof(Mask) - 1) - __builtin_clzl(Mask);
return (Mask != 0);
}
inline
unsigned char _BitScanReverse(unsigned int *Index, unsigned int Mask)
{
*Index = 31 - __builtin_clz(Mask);
return (Mask != 0);
}
inline
void *_aligned_malloc(unsigned int size, unsigned int alignment)
{
void *ret;
if (posix_memalign(&ret, alignment, size))
{
return NULL;
}
return ret;
}
inline
unsigned char _bittest(const LONG *a, LONG b)
{
return ((*(unsigned *)(a) & (1 << b)) != 0);
}
#define GetCurrentProcessId getpid
#define CreateDirectory(name, pSecurity) mkdir(name, 0777)
#if defined(_WIN32)
static inline
unsigned int _mm_popcnt_u32(unsigned int v)
{
return __builtin_popcount(v);
}
#endif
#define _aligned_free free
#define InterlockedCompareExchange(Dest, Exchange, Comparand) __sync_val_compare_and_swap(Dest, Comparand, Exchange)
#define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value)
#define InterlockedDecrement(Addend) __sync_sub_and_fetch(Addend, 1)
#define InterlockedIncrement(Addend) __sync_add_and_fetch(Addend, 1)
#define _ReadWriteBarrier() asm volatile("" ::: "memory")
#define __stdcall
#define PRAGMA_WARNING_PUSH_DISABLE(...)
#define PRAGMA_WARNING_POP()
#else
#error Unsupported OS/system.
#endif
// Universal types
typedef BYTE KILOBYTE[1024];
typedef KILOBYTE MEGABYTE[1024];
typedef MEGABYTE GIGABYTE[1024];
#define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64)
#if KNOB_SIMD_WIDTH == 8
#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, 32)
#endif
#include "common/swr_assert.h"
#endif//__SWR_OS_H__
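A short sketch of these shims in action (editor's illustration; names are hypothetical, not part of the commit):
// 64-byte-aligned type via OSALIGNLINE, valid on both compilers.
OSALIGNLINE(struct) CacheLineBlock { BYTE data[64]; };
void os_shim_example()
{
    volatile LONG counter = 0;
    InterlockedIncrement(&counter);      // __sync_add_and_fetch on Linux
    unsigned long index;
    if (_BitScanForward(&index, 0x10ul)) // finds lowest set bit: index == 4
    {
        InterlockedDecrement(&counter);
    }
}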


@@ -0,0 +1,188 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file rdtsc_buckets.cpp
*
* @brief implementation of rdtsc buckets.
*
* Notes:
*
******************************************************************************/
#include "rdtsc_buckets.h"
#include <inttypes.h>
THREAD UINT tlsThreadId = 0;
void BucketManager::RegisterThread(const std::string& name)
{
BUCKET_THREAD newThread;
newThread.name = name;
newThread.root.children.reserve(mBuckets.size());
newThread.root.id = 0;
newThread.root.pParent = nullptr;
newThread.pCurrent = &newThread.root;
mThreadMutex.lock();
// assign unique thread id for this thread
size_t id = mThreads.size();
newThread.id = (UINT)id;
tlsThreadId = (UINT)id;
// open threadviz file if enabled
if (mThreadViz)
{
std::stringstream ss;
ss << mThreadVizDir << "/threadviz_thread." << newThread.id << ".dat";
newThread.vizFile = fopen(ss.str().c_str(), "wb");
}
// store new thread
mThreads.push_back(newThread);
mThreadMutex.unlock();
}
UINT BucketManager::RegisterBucket(const BUCKET_DESC& desc)
{
size_t id = mBuckets.size();
mBuckets.push_back(desc);
return (UINT)id;
}
void BucketManager::PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket)
{
const char *arrows[] = {
"",
"|-> ",
" |-> ",
" |-> ",
" |-> ",
" |-> ",
" |-> "
};
// compute percent of total cycles used by this bucket
float percentTotal = (float)((double)bucket.elapsed / (double)threadCycles * 100.0);
// compute percent of parent cycles used by this bucket
float percentParent = (float)((double)bucket.elapsed / (double)parentCycles * 100.0);
// compute average cycle count per invocation
UINT64 CPE = bucket.elapsed / bucket.count;
BUCKET_DESC &desc = mBuckets[bucket.id];
// construct hierarchy visualization
char hier[80];
strcpy(hier, arrows[level]);
strcat(hier, desc.name.c_str());
// print out
fprintf(f, "%6.2f %6.2f %-10" PRIu64 " %-10" PRIu64 " %-10u %-10lu %-10u %s\n",
percentTotal,
percentParent,
bucket.elapsed,
CPE,
bucket.count,
(unsigned long)0,
(uint32_t)0,
hier
);
// dump all children of this bucket
for (const BUCKET& child : bucket.children)
{
if (child.count)
{
PrintBucket(f, level + 1, threadCycles, bucket.elapsed, child);
}
}
}
void BucketManager::PrintThread(FILE* f, const BUCKET_THREAD& thread)
{
// print header
fprintf(f, "\nThread %u (%s)\n", thread.id, thread.name.c_str());
fprintf(f, " %%Tot %%Par Cycles CPE NumEvent CPE2 NumEvent2 Bucket\n");
// compute thread level total cycle counts across all buckets from root
const BUCKET& root = thread.root;
UINT64 totalCycles = 0;
for (const BUCKET& child : root.children)
{
totalCycles += child.elapsed;
}
for (const BUCKET& child : root.children)
{
if (child.count)
{
PrintBucket(f, 0, totalCycles, totalCycles, child);
}
}
}
void BucketManager::DumpThreadViz()
{
// ensure all thread data is flushed
mThreadMutex.lock();
for (auto& thread : mThreads)
{
fflush(thread.vizFile);
fclose(thread.vizFile);
}
mThreadMutex.unlock();
// dump bucket descriptions
std::stringstream ss;
ss << mThreadVizDir << "/threadviz_buckets.dat";
FILE* f = fopen(ss.str().c_str(), "wb");
for (auto& bucket : mBuckets)
{
Serialize(f, bucket);
}
fclose(f);
}
void BucketManager::PrintReport(const std::string& filename)
{
if (mThreadViz)
{
DumpThreadViz();
}
else
{
FILE* f = fopen(filename.c_str(), "w");
mThreadMutex.lock();
for (const BUCKET_THREAD& thread : mThreads)
{
PrintThread(f, thread);
fprintf(f, "\n");
}
mThreadMutex.unlock();
fclose(f);
}
}


@@ -0,0 +1,229 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file rdtsc_buckets.h
*
* @brief declaration for rdtsc buckets.
*
* Notes:
*
******************************************************************************/
#pragma once
#include "os.h"
#include <vector>
#include <mutex>
#include <sstream>
#include "rdtsc_buckets_shared.h"
// unique thread id stored in thread local storage
extern THREAD UINT tlsThreadId;
//////////////////////////////////////////////////////////////////////////
/// @brief BucketManager encapsulates a single instance of the buckets
/// functionality. There can be one or many bucket managers active
/// at any time. The manager owns all the threads and
/// bucket information that have been registered to it.
class BucketManager
{
public:
BucketManager(bool enableThreadViz) : mThreadViz(enableThreadViz)
{
if (mThreadViz)
{
uint32_t pid = GetCurrentProcessId();
std::stringstream str;
str << "threadviz." << pid;
mThreadVizDir = str.str();
CreateDirectory(mThreadVizDir.c_str(), NULL);
}
}
// removes all registered thread data
void ClearThreads()
{
mThreadMutex.lock();
mThreads.clear();
mThreadMutex.unlock();
}
// removes all registered buckets
void ClearBuckets()
{
mBuckets.clear();
}
/// Registers a new thread with the manager.
/// @param name - name of thread, used for labels in reports and threadviz
void RegisterThread(const std::string& name);
/// Registers a new bucket type with the manager. Returns a unique
/// id which should be used in subsequent calls to start/stop the bucket
/// @param desc - description of the bucket
/// @return unique id
UINT RegisterBucket(const BUCKET_DESC& desc);
// dump threadviz data
void DumpThreadViz();
// print report
void PrintReport(const std::string& filename);
// start capturing
INLINE void StartCapture()
{
mCapturing = true;
}
// stop capturing
INLINE void StopCapture()
{
mCapturing = false;
// wait for all threads to pop back to root bucket
bool stillCapturing = true;
while (stillCapturing)
{
stillCapturing = false;
for (const BUCKET_THREAD& t : mThreads)
{
if (t.pCurrent != &t.root)
{
stillCapturing = true;
continue;
}
}
}
}
// start a bucket
// @param id generated by RegisterBucket
INLINE void StartBucket(UINT id)
{
if (!mCapturing) return;
SWR_ASSERT(tlsThreadId < mThreads.size());
BUCKET_THREAD& bt = mThreads[tlsThreadId];
// if threadviz is enabled, we only need to dump start info to this thread's viz file
if (mThreadViz)
{
SWR_ASSERT(bt.vizFile != nullptr);
if (mBuckets[id].enableThreadViz)
{
VIZ_START_DATA data{ VIZ_START, id, __rdtsc() };
Serialize(bt.vizFile, data);
}
}
else
{
if (bt.pCurrent->children.size() < mBuckets.size())
{
bt.pCurrent->children.resize(mBuckets.size());
}
BUCKET &child = bt.pCurrent->children[id];
child.pParent = bt.pCurrent;
child.id = id;
child.start = __rdtsc();
// update thread's currently executing bucket
bt.pCurrent = &child;
}
bt.level++;
}
// stop the currently executing bucket
INLINE void StopBucket(UINT id)
{
SWR_ASSERT(tlsThreadId < mThreads.size());
BUCKET_THREAD &bt = mThreads[tlsThreadId];
if (bt.level == 0) return;
if (mThreadViz)
{
SWR_ASSERT(bt.vizFile != nullptr);
if (mBuckets[id].enableThreadViz)
{
VIZ_STOP_DATA data{ VIZ_STOP, __rdtsc() };
Serialize(bt.vizFile, data);
}
}
else
{
if (bt.pCurrent->start == 0) return;
SWR_ASSERT(bt.pCurrent->id == id, "Mismatched buckets detected");
bt.pCurrent->elapsed += (__rdtsc() - bt.pCurrent->start);
bt.pCurrent->count++;
// pop to parent
bt.pCurrent = bt.pCurrent->pParent;
}
bt.level--;
}
INLINE void AddEvent(uint32_t id, uint32_t count)
{
if (!mCapturing) return;
SWR_ASSERT(tlsThreadId < mThreads.size());
BUCKET_THREAD& bt = mThreads[tlsThreadId];
// don't record events for threadviz
if (!mThreadViz)
{
if (bt.pCurrent->children.size() < mBuckets.size())
{
bt.pCurrent->children.resize(mBuckets.size());
}
BUCKET &child = bt.pCurrent->children[id];
child.pParent = bt.pCurrent;
child.id = id;
child.count += count;
}
}
private:
void PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket);
void PrintThread(FILE* f, const BUCKET_THREAD& thread);
// list of active threads that have registered with this manager
std::vector<BUCKET_THREAD> mThreads;
// list of buckets registered with this manager
std::vector<BUCKET_DESC> mBuckets;
// is capturing currently enabled
volatile bool mCapturing{ false };
std::mutex mThreadMutex;
// enable threadviz
bool mThreadViz{ false };
std::string mThreadVizDir;
};
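Usage sketch (editor's illustration; names and values are hypothetical): a client registers buckets and threads once, then brackets the code to be timed.
void profile_example(BucketManager& mgr)
{
    // name, description, enableThreadViz, RGBA8 color
    BUCKET_DESC desc{ "Rasterize", "Time spent rasterizing", false, 0xFF0000FF };
    UINT rastBucket = mgr.RegisterBucket(desc);
    mgr.RegisterThread("worker0"); // assigns this thread's tlsThreadId
    mgr.StartCapture();
    mgr.StartBucket(rastBucket);
    // ... timed work ...
    mgr.StopBucket(rastBucket);
    mgr.StopCapture();
    mgr.PrintReport("rdtsc_report.txt");
}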


@@ -0,0 +1,167 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file rdtsc_buckets_shared.h
*
* @brief shared structures and serialization helpers for rdtsc buckets.
*
* Notes:
*
******************************************************************************/
#pragma once
#include <vector>
#include <cassert>
struct BUCKET
{
uint32_t id{ 0 };
uint64_t start{ 0 };
uint64_t elapsed{ 0 };
uint32_t count{ 0 };
BUCKET* pParent{ nullptr };
std::vector<BUCKET> children;
};
struct BUCKET_DESC
{
// name of bucket, used in reports
std::string name;
// description of bucket, used in threadviz
std::string description;
// enable for threadviz dumping
bool enableThreadViz;
// threadviz color of bucket, in RGBA8_UNORM format
uint32_t color;
};
struct BUCKET_THREAD
{
// name of thread, used in reports
std::string name;
// id for this thread, assigned by the thread manager
uint32_t id;
// root of the bucket hierarchy for this thread
BUCKET root;
// currently executing bucket somewhere in the hierarchy
BUCKET* pCurrent;
// currently executing hierarchy level
uint32_t level{ 0 };
// threadviz file object
FILE* vizFile{ nullptr };
BUCKET_THREAD() {}
BUCKET_THREAD(const BUCKET_THREAD& that)
{
name = that.name;
id = that.id;
root = that.root;
pCurrent = &root;
vizFile = that.vizFile;
}
};
enum VIZ_TYPE
{
VIZ_START = 0,
VIZ_STOP = 1,
VIZ_DATA = 2
};
struct VIZ_START_DATA
{
uint8_t type;
uint32_t bucketId;
uint64_t timestamp;
};
struct VIZ_STOP_DATA
{
uint8_t type;
uint64_t timestamp;
};
inline void Serialize(FILE* f, const VIZ_START_DATA& data)
{
fwrite(&data, sizeof(VIZ_START_DATA), 1, f);
}
inline void Deserialize(FILE* f, VIZ_START_DATA& data)
{
fread(&data, sizeof(VIZ_START_DATA), 1, f);
assert(data.type == VIZ_START);
}
inline void Serialize(FILE* f, const VIZ_STOP_DATA& data)
{
fwrite(&data, sizeof(VIZ_STOP_DATA), 1, f);
}
inline void Deserialize(FILE* f, VIZ_STOP_DATA& data)
{
fread(&data, sizeof(VIZ_STOP_DATA), 1, f);
assert(data.type == VIZ_STOP);
}
inline void Serialize(FILE* f, const std::string& string)
{
assert(string.size() < 256);
uint8_t length = (uint8_t)string.size();
fwrite(&length, sizeof(length), 1, f);
fwrite(string.c_str(), string.size(), 1, f);
}
inline void Deserialize(FILE* f, std::string& string)
{
char cstr[256];
uint8_t length;
fread(&length, sizeof(length), 1, f);
fread(cstr, length, 1, f);
cstr[length] = 0;
string.assign(cstr);
}
inline void Serialize(FILE* f, const BUCKET_DESC& desc)
{
Serialize(f, desc.name);
Serialize(f, desc.description);
fwrite(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f);
fwrite(&desc.color, sizeof(desc.color), 1, f);
}
inline void Deserialize(FILE* f, BUCKET_DESC& desc)
{
Deserialize(f, desc.name);
Deserialize(f, desc.description);
fread(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f);
fread(&desc.color, sizeof(desc.color), 1, f);
}
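A round-trip sketch of the helpers above (editor's illustration; like the header itself, it assumes <cstdio> and <string> are visible, and the function name is hypothetical):
inline bool RoundTripBucketDesc(const BUCKET_DESC& in, const char* path)
{
    FILE* f = fopen(path, "wb");
    if (!f) return false;
    Serialize(f, in); // writes length-prefixed strings, then the POD fields
    fclose(f);
    BUCKET_DESC out;
    f = fopen(path, "rb");
    if (!f) return false;
    Deserialize(f, out);
    fclose(f);
    return in.name == out.name && in.description == out.description &&
           in.color == out.color;
}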


@@ -0,0 +1,787 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#ifndef __SWR_SIMDINTRIN_H__
#define __SWR_SIMDINTRIN_H__
#include "os.h"
#include <cassert>
#include <emmintrin.h>
#include <immintrin.h>
#include <xmmintrin.h>
#if KNOB_SIMD_WIDTH == 8
typedef __m256 simdscalar;
typedef __m256i simdscalari;
typedef uint8_t simdmask;
#else
#error Unsupported vector width
#endif
// simd vector
OSALIGNSIMD(union) simdvector
{
simdscalar v[4];
struct
{
simdscalar x, y, z, w;
};
simdscalar& operator[] (const int i) { return v[i]; }
const simdscalar& operator[] (const int i) const { return v[i]; }
};
#if KNOB_SIMD_WIDTH == 8
#define _simd128_maskstore_ps _mm_maskstore_ps
#define _simd_load_ps _mm256_load_ps
#define _simd_load1_ps _mm256_broadcast_ss
#define _simd_loadu_ps _mm256_loadu_ps
#define _simd_setzero_ps _mm256_setzero_ps
#define _simd_set1_ps _mm256_set1_ps
#define _simd_blend_ps _mm256_blend_ps
#define _simd_blendv_ps _mm256_blendv_ps
#define _simd_store_ps _mm256_store_ps
#define _simd_mul_ps _mm256_mul_ps
#define _simd_add_ps _mm256_add_ps
#define _simd_sub_ps _mm256_sub_ps
#define _simd_rsqrt_ps _mm256_rsqrt_ps
#define _simd_min_ps _mm256_min_ps
#define _simd_max_ps _mm256_max_ps
#define _simd_movemask_ps _mm256_movemask_ps
#define _simd_cvtps_epi32 _mm256_cvtps_epi32
#define _simd_cvttps_epi32 _mm256_cvttps_epi32
#define _simd_cvtepi32_ps _mm256_cvtepi32_ps
#define _simd_cmplt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
#define _simd_cmpgt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
#define _simd_cmpneq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_NEQ_OQ)
#define _simd_cmpeq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
#define _simd_cmpge_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
#define _simd_cmple_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
#define _simd_cmp_ps(a, b, imm) _mm256_cmp_ps(a, b, imm)
#define _simd_and_ps _mm256_and_ps
#define _simd_or_ps _mm256_or_ps
#define _simd_rcp_ps _mm256_rcp_ps
#define _simd_div_ps _mm256_div_ps
#define _simd_castsi_ps _mm256_castsi256_ps
#define _simd_andnot_ps _mm256_andnot_ps
#define _simd_round_ps _mm256_round_ps
#define _simd_castpd_ps _mm256_castpd_ps
#define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a))
#define _simd_load_sd _mm256_load_sd
#define _simd_movemask_pd _mm256_movemask_pd
#define _simd_castsi_pd _mm256_castsi256_pd
// emulated integer simd
#define SIMD_EMU_EPI(func, intrin) \
INLINE \
__m256i func(__m256i a, __m256i b)\
{\
__m128i aHi = _mm256_extractf128_si256(a, 1);\
__m128i bHi = _mm256_extractf128_si256(b, 1);\
__m128i aLo = _mm256_castsi256_si128(a);\
__m128i bLo = _mm256_castsi256_si128(b);\
\
__m128i subLo = intrin(aLo, bLo);\
__m128i subHi = intrin(aHi, bHi);\
\
__m256i result = _mm256_castsi128_si256(subLo);\
result = _mm256_insertf128_si256(result, subHi, 1);\
\
return result;\
}
#if (KNOB_ARCH == KNOB_ARCH_AVX)
#define _simd_mul_epi32 _simdemu_mul_epi32
#define _simd_mullo_epi32 _simdemu_mullo_epi32
#define _simd_sub_epi32 _simdemu_sub_epi32
#define _simd_sub_epi64 _simdemu_sub_epi64
#define _simd_min_epi32 _simdemu_min_epi32
#define _simd_min_epu32 _simdemu_min_epu32
#define _simd_max_epi32 _simdemu_max_epi32
#define _simd_max_epu32 _simdemu_max_epu32
#define _simd_add_epi32 _simdemu_add_epi32
#define _simd_and_si _simdemu_and_si
#define _simd_andnot_si _simdemu_andnot_si
#define _simd_cmpeq_epi32 _simdemu_cmpeq_epi32
#define _simd_cmplt_epi32 _simdemu_cmplt_epi32
#define _simd_cmpgt_epi32 _simdemu_cmpgt_epi32
#define _simd_or_si _simdemu_or_si
#define _simd_castps_si _mm256_castps_si256
#define _simd_adds_epu8 _simdemu_adds_epu8
#define _simd_subs_epu8 _simdemu_subs_epu8
#define _simd_add_epi8 _simdemu_add_epi8
#define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64
#define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64
#define _simd_movemask_epi8 _simdemu_movemask_epi8
SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32)
SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32)
SIMD_EMU_EPI(_simdemu_sub_epi32, _mm_sub_epi32)
SIMD_EMU_EPI(_simdemu_sub_epi64, _mm_sub_epi64)
SIMD_EMU_EPI(_simdemu_min_epi32, _mm_min_epi32)
SIMD_EMU_EPI(_simdemu_min_epu32, _mm_min_epu32)
SIMD_EMU_EPI(_simdemu_max_epi32, _mm_max_epi32)
SIMD_EMU_EPI(_simdemu_max_epu32, _mm_max_epu32)
SIMD_EMU_EPI(_simdemu_add_epi32, _mm_add_epi32)
SIMD_EMU_EPI(_simdemu_and_si, _mm_and_si128)
SIMD_EMU_EPI(_simdemu_andnot_si, _mm_andnot_si128)
SIMD_EMU_EPI(_simdemu_cmpeq_epi32, _mm_cmpeq_epi32)
SIMD_EMU_EPI(_simdemu_cmplt_epi32, _mm_cmplt_epi32)
SIMD_EMU_EPI(_simdemu_cmpgt_epi32, _mm_cmpgt_epi32)
SIMD_EMU_EPI(_simdemu_or_si, _mm_or_si128)
SIMD_EMU_EPI(_simdemu_adds_epu8, _mm_adds_epu8)
SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8)
SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8)
SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64)
SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64)
#define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
#define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
#define _simd_slli_epi32(a,i) _simdemu_slli_epi32(a,i)
#define _simd_srai_epi32(a,i) _simdemu_srai_epi32(a,i)
#define _simd_srli_epi32(a,i) _simdemu_srli_epi32(a,i)
#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
#define _simd128_fmadd_ps _mm_fmaddemu_ps
#define _simd_fmadd_ps _mm_fmaddemu256_ps
#define _simd_fmsub_ps _mm_fmsubemu256_ps
#define _simd_shuffle_epi8 _simdemu_shuffle_epi8
SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8)
INLINE
__m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c)
{
__m128 res = _mm_mul_ps(a, b);
res = _mm_add_ps(res, c);
return res;
}
INLINE
__m256 _mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c)
{
__m256 res = _mm256_mul_ps(a, b);
res = _mm256_add_ps(res, c);
return res;
}
INLINE
__m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c)
{
__m256 res = _mm256_mul_ps(a, b);
res = _mm256_sub_ps(res, c);
return res;
}
INLINE
__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale)
{
uint32_t *pOffsets = (uint32_t*)&vOffsets;
simdscalar vResult;
float* pResult = (float*)&vResult;
for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
{
uint32_t offset = pOffsets[i];
offset = offset * scale;
pResult[i] = *(float*)(((const uint8_t*)pBase + offset));
}
return vResult;
}
INLINE
__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale)
{
uint32_t *pOffsets = (uint32_t*)&vOffsets;
simdscalar vResult = vSrc;
float* pResult = (float*)&vResult;
DWORD index;
uint32_t mask = _simd_movemask_ps(vMask);
while (_BitScanForward(&index, mask))
{
mask &= ~(1 << index);
uint32_t offset = pOffsets[index];
offset = offset * scale;
pResult[index] = *(float*)(((const uint8_t*)pBase + offset));
}
return vResult;
}
INLINE
__m256i _simd_abs_epi32(__m256i a)
{
__m128i aHi = _mm256_extractf128_si256(a, 1);
__m128i aLo = _mm256_castsi256_si128(a);
__m128i absLo = _mm_abs_epi32(aLo);
__m128i absHi = _mm_abs_epi32(aHi);
__m256i result = _mm256_castsi128_si256(absLo);
result = _mm256_insertf128_si256(result, absHi, 1);
return result;
}
INLINE
int _simdemu_movemask_epi8(__m256i a)
{
__m128i aHi = _mm256_extractf128_si256(a, 1);
__m128i aLo = _mm256_castsi256_si128(a);
int resHi = _mm_movemask_epi8(aHi);
int resLo = _mm_movemask_epi8(aLo);
return (resHi << 16) | resLo;
}
#else
#define _simd_mul_epi32 _mm256_mul_epi32
#define _simd_mullo_epi32 _mm256_mullo_epi32
#define _simd_sub_epi32 _mm256_sub_epi32
#define _simd_sub_epi64 _mm256_sub_epi64
#define _simd_min_epi32 _mm256_min_epi32
#define _simd_max_epi32 _mm256_max_epi32
#define _simd_min_epu32 _mm256_min_epu32
#define _simd_max_epu32 _mm256_max_epu32
#define _simd_add_epi32 _mm256_add_epi32
#define _simd_and_si _mm256_and_si256
#define _simd_andnot_si _mm256_andnot_si256
#define _simd_cmpeq_epi32 _mm256_cmpeq_epi32
#define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a)
#define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b)
#define _simd_or_si _mm256_or_si256
#define _simd_castps_si _mm256_castps_si256
#define _simd_unpacklo_epi32 _mm256_unpacklo_epi32
#define _simd_unpackhi_epi32 _mm256_unpackhi_epi32
#define _simd_srli_si(a,i) _simdemu_srli_si128<i>(a)
#define _simd_slli_epi32 _mm256_slli_epi32
#define _simd_srai_epi32 _mm256_srai_epi32
#define _simd_srli_epi32 _mm256_srli_epi32
#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
#define _simd128_fmadd_ps _mm_fmadd_ps
#define _simd_fmadd_ps _mm256_fmadd_ps
#define _simd_fmsub_ps _mm256_fmsub_ps
#define _simd_shuffle_epi8 _mm256_shuffle_epi8
#define _simd_adds_epu8 _mm256_adds_epu8
#define _simd_subs_epu8 _mm256_subs_epu8
#define _simd_add_epi8 _mm256_add_epi8
#define _simd_i32gather_ps _mm256_i32gather_ps
#define _simd_mask_i32gather_ps _mm256_mask_i32gather_ps
#define _simd_abs_epi32 _mm256_abs_epi32
#define _simd_cmpeq_epi64 _mm256_cmpeq_epi64
#define _simd_cmpgt_epi64 _mm256_cmpgt_epi64
#define _simd_movemask_epi8 _mm256_movemask_epi8
#endif
#define _simd_shuffleps_epi32(vA, vB, imm) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(vA), _mm256_castsi256_ps(vB), imm))
#define _simd_shuffle_ps _mm256_shuffle_ps
#define _simd_set1_epi32 _mm256_set1_epi32
#define _simd_set1_epi8 _mm256_set1_epi8
#define _simd_setzero_si _mm256_setzero_si256
#define _simd_cvttps_epi32 _mm256_cvttps_epi32
#define _simd_store_si _mm256_store_si256
#define _simd_broadcast_ss _mm256_broadcast_ss
#define _simd_maskstore_ps _mm256_maskstore_ps
#define _simd_load_si _mm256_load_si256
#define _simd_loadu_si _mm256_loadu_si256
#define _simd_sub_ps _mm256_sub_ps
#define _simd_testz_ps _mm256_testz_ps
#define _simd_xor_ps _mm256_xor_ps
INLINE
simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask)
{
return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask));
}
// convert bitmask to vector mask
INLINE
simdscalar vMask(int32_t mask)
{
__m256i vec = _mm256_set1_epi32(mask);
const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
vec = _simd_and_si(vec, bit);
vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
return _simd_castsi_ps(vec);
}
INLINE
void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane)
{
OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH];
_mm256_store_ps(rArray, r);
_mm256_store_ps(sArray, s);
rArray[rlane] = sArray[slane];
r = _mm256_load_ps(rArray);
}
INLINE __m256i _simdemu_slli_epi32(__m256i a, uint32_t i)
{
__m128i aHi = _mm256_extractf128_si256(a, 1);
__m128i aLo = _mm256_castsi256_si128(a);
__m128i resHi = _mm_slli_epi32(aHi, i);
__m128i resLo = _mm_slli_epi32(aLo, i);
__m256i result = _mm256_castsi128_si256(resLo);
result = _mm256_insertf128_si256(result, resHi, 1);
return result;
}
INLINE __m256i _simdemu_srai_epi32(__m256i a, uint32_t i)
{
__m128i aHi = _mm256_extractf128_si256(a, 1);
__m128i aLo = _mm256_castsi256_si128(a);
__m128i resHi = _mm_srai_epi32(aHi, i);
__m128i resLo = _mm_srai_epi32(aLo, i);
__m256i result = _mm256_castsi128_si256(resLo);
result = _mm256_insertf128_si256(result, resHi, 1);
return result;
}
INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i)
{
__m128i aHi = _mm256_extractf128_si256(a, 1);
__m128i aLo = _mm256_castsi256_si128(a);
__m128i resHi = _mm_srli_epi32(aHi, i);
__m128i resLo = _mm_srli_epi32(aLo, i);
__m256i result = _mm256_castsi128_si256(resLo);
result = _mm256_insertf128_si256(result, resHi, 1);
return result;
}
INLINE
void _simdvec_transpose(simdvector &v)
{
SWR_ASSERT(false, "Need to implement 8 wide version");
}
#else
#error Unsupported vector width
#endif
// Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
INLINE
void _simdvec_load_ps(simdvector& r, const float *p)
{
r[0] = _simd_set1_ps(p[0]);
r[1] = _simd_set1_ps(p[1]);
r[2] = _simd_set1_ps(p[2]);
r[3] = _simd_set1_ps(p[3]);
}
INLINE
void _simdvec_mov(simdvector& r, const simdscalar& s)
{
r[0] = s;
r[1] = s;
r[2] = s;
r[3] = s;
}
INLINE
void _simdvec_mov(simdvector& r, const simdvector& v)
{
r[0] = v[0];
r[1] = v[1];
r[2] = v[2];
r[3] = v[3];
}
// just move a lane from the source simdvector to dest simdvector
INLINE
void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane)
{
_simd_mov(r[0], rlane, s[0], slane);
_simd_mov(r[1], rlane, s[1], slane);
_simd_mov(r[2], rlane, s[2], slane);
_simd_mov(r[3], rlane, s[3], slane);
}
INLINE
void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
{
simdscalar tmp;
r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
}
INLINE
void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
{
simdscalar tmp;
r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
tmp = _simd_mul_ps(v0[3], v1[3]); // (v0.w*v1.w)
r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + (v0.w*v1.w)
}
INLINE
simdscalar _simdvec_rcp_length_ps(const simdvector& v)
{
simdscalar length;
_simdvec_dp4_ps(length, v, v);
return _simd_rsqrt_ps(length);
}
INLINE
void _simdvec_normalize_ps(simdvector& r, const simdvector& v)
{
simdscalar vecLength;
vecLength = _simdvec_rcp_length_ps(v);
r[0] = _simd_mul_ps(v[0], vecLength);
r[1] = _simd_mul_ps(v[1], vecLength);
r[2] = _simd_mul_ps(v[2], vecLength);
r[3] = _simd_mul_ps(v[3], vecLength);
}
INLINE
void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s)
{
r[0] = _simd_mul_ps(v[0], s);
r[1] = _simd_mul_ps(v[1], s);
r[2] = _simd_mul_ps(v[2], s);
r[3] = _simd_mul_ps(v[3], s);
}
INLINE
void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
{
r[0] = _simd_mul_ps(v0[0], v1[0]);
r[1] = _simd_mul_ps(v0[1], v1[1]);
r[2] = _simd_mul_ps(v0[2], v1[2]);
r[3] = _simd_mul_ps(v0[3], v1[3]);
}
INLINE
void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
{
r[0] = _simd_add_ps(v0[0], v1[0]);
r[1] = _simd_add_ps(v0[1], v1[1]);
r[2] = _simd_add_ps(v0[2], v1[2]);
r[3] = _simd_add_ps(v0[3], v1[3]);
}
INLINE
void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
{
r[0] = _simd_min_ps(v0[0], s);
r[1] = _simd_min_ps(v0[1], s);
r[2] = _simd_min_ps(v0[2], s);
r[3] = _simd_min_ps(v0[3], s);
}
INLINE
void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
{
r[0] = _simd_max_ps(v0[0], s);
r[1] = _simd_max_ps(v0[1], s);
r[2] = _simd_max_ps(v0[2], s);
r[3] = _simd_max_ps(v0[3], s);
}
// Matrix4x4 * Vector4
// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
INLINE
void _simd_mat4x4_vec4_multiply(
simdvector& result,
const float *pMatrix,
const simdvector& v)
{
simdscalar m;
simdscalar r0;
simdscalar r1;
m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3]
r1 = _simd_mul_ps(m, v[3]); // (m3 * v.w)
r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
result[0] = r0;
m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3]
r1 = _simd_mul_ps(m, v[3]); // (m3 * v.w)
r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
result[1] = r0;
m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3]
r1 = _simd_mul_ps(m, v[3]); // (m3 * v.w)
r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
result[2] = r0;
m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0]
r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1]
r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2]
r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3]
r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z)
r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
result[3] = r0;
}
// Matrix4x4 (upper 3x3 used) * Vector3 - Direction Vector where w = 0.
// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
// outVec.w = 0 (the last matrix row is not applied)
INLINE
void _simd_mat3x3_vec3_w0_multiply(
simdvector& result,
const float *pMatrix,
const simdvector& v)
{
simdscalar m;
simdscalar r0;
simdscalar r1;
m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[0][0]
r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[0][1]
r1 = _simd_mul_ps(m, v[1]); // (m01 * v.y)
r0 = _simd_add_ps(r0, r1); // (m00 * v.x) + (m01 * v.y)
m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[0][2]
r1 = _simd_mul_ps(m, v[2]); // (m02 * v.z)
r0 = _simd_add_ps(r0, r1); // (m00 * v.x) + (m01 * v.y) + (m02 * v.z)
result[0] = r0;
m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[1][0]
r0 = _simd_mul_ps(m, v[0]); // (m10 * v.x)
m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[1][1]
r1 = _simd_mul_ps(m, v[1]); // (m11 * v.y)
r0 = _simd_add_ps(r0, r1); // (m10 * v.x) + (m11 * v.y)
m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[1][2]
r1 = _simd_mul_ps(m, v[2]); // (m12 * v.z)
r0 = _simd_add_ps(r0, r1); // (m10 * v.x) + (m11 * v.y) + (m12 * v.z)
result[1] = r0;
m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[2][0]
r0 = _simd_mul_ps(m, v[0]); // (m20 * v.x)
m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[2][1]
r1 = _simd_mul_ps(m, v[1]); // (m21 * v.y)
r0 = _simd_add_ps(r0, r1); // (m20 * v.x) + (m21 * v.y)
m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[2][2]
r1 = _simd_mul_ps(m, v[2]); // (m22 * v.z)
r0 = _simd_add_ps(r0, r1); // (m20 * v.x) + (m21 * v.y) + (m22 * v.z)
result[2] = r0;
result[3] = _simd_setzero_ps();
}
// Matrix4x4 * Vector3 - Position vector where w = 1.
// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
INLINE
void _simd_mat4x4_vec3_w1_multiply(
simdvector& result,
const float *pMatrix,
const simdvector& v)
{
simdscalar m;
simdscalar r0;
simdscalar r1;
m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[0][0]
r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[0][1]
r1 = _simd_mul_ps(m, v[1]); // (m01 * v.y)
r0 = _simd_add_ps(r0, r1); // (m00 * v.x) + (m01 * v.y)
m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[0][2]
r1 = _simd_mul_ps(m, v[2]); // (m02 * v.z)
r0 = _simd_add_ps(r0, r1); // (m00 * v.x) + (m01 * v.y) + (m02 * v.z)
m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[0][3]
r0 = _simd_add_ps(r0, m); // (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
result[0] = r0;
m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[1][0]
r0 = _simd_mul_ps(m, v[0]); // (m10 * v.x)
m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[1][1]
r1 = _simd_mul_ps(m, v[1]); // (m11 * v.y)
r0 = _simd_add_ps(r0, r1); // (m10 * v.x) + (m11 * v.y)
m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[1][2]
r1 = _simd_mul_ps(m, v[2]); // (m12 * v.z)
r0 = _simd_add_ps(r0, r1); // (m10 * v.x) + (m11 * v.y) + (m12 * v.z)
m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[1][3]
r0 = _simd_add_ps(r0, m); // (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
result[1] = r0;
m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[2][0]
r0 = _simd_mul_ps(m, v[0]); // (m20 * v.x)
m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[2][1]
r1 = _simd_mul_ps(m, v[1]); // (m21 * v.y)
r0 = _simd_add_ps(r0, r1); // (m20 * v.x) + (m21 * v.y)
m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[2][2]
r1 = _simd_mul_ps(m, v[2]); // (m22 * v.z)
r0 = _simd_add_ps(r0, r1); // (m20 * v.x) + (m21 * v.y) + (m22 * v.z)
m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[2][3]
r0 = _simd_add_ps(r0, m); // (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
result[2] = r0;
m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[3][0]
r0 = _simd_mul_ps(m, v[0]); // (m30 * v.x)
m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[3][1]
r1 = _simd_mul_ps(m, v[1]); // (m31 * v.y)
r0 = _simd_add_ps(r0, r1); // (m30 * v.x) + (m31 * v.y)
m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[3][2]
r1 = _simd_mul_ps(m, v[2]); // (m32 * v.z)
r0 = _simd_add_ps(r0, r1); // (m30 * v.x) + (m31 * v.y) + (m32 * v.z)
m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[3][3]
result[3] = _simd_add_ps(r0, m); // (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
}
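// Matrix4x3 (the first three rows of a row-major 4x4) * Vector3 -
// Position vector where w = 1.
// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
// outVec.w = 1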
INLINE
void _simd_mat4x3_vec3_w1_multiply(
simdvector& result,
const float *pMatrix,
const simdvector& v)
{
simdscalar m;
simdscalar r0;
simdscalar r1;
m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[0][0]
r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[0][1]
r1 = _simd_mul_ps(m, v[1]); // (m01 * v.y)
r0 = _simd_add_ps(r0, r1); // (m00 * v.x) + (m01 * v.y)
m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[0][2]
r1 = _simd_mul_ps(m, v[2]); // (m02 * v.z)
r0 = _simd_add_ps(r0, r1); // (m00 * v.x) + (m01 * v.y) + (m02 * v.z)
m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[0][3]
r0 = _simd_add_ps(r0, m); // (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
result[0] = r0;
m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[1][0]
r0 = _simd_mul_ps(m, v[0]); // (m10 * v.x)
m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[1][1]
r1 = _simd_mul_ps(m, v[1]); // (m11 * v.y)
r0 = _simd_add_ps(r0, r1); // (m10 * v.x) + (m11 * v.y)
m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[1][2]
r1 = _simd_mul_ps(m, v[2]); // (m12 * v.z)
r0 = _simd_add_ps(r0, r1); // (m10 * v.x) + (m11 * v.y) + (m12 * v.z)
m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[1][3]
r0 = _simd_add_ps(r0, m); // (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
result[1] = r0;
m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[2][0]
r0 = _simd_mul_ps(m, v[0]); // (m20 * v.x)
m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[2][1]
r1 = _simd_mul_ps(m, v[1]); // (m21 * v.y)
r0 = _simd_add_ps(r0, r1); // (m20 * v.x) + (m21 * v.y)
m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[2][2]
r1 = _simd_mul_ps(m, v[2]); // (m22 * v.z)
r0 = _simd_add_ps(r0, r1); // (m20 * v.x) + (m21 * v.y) + (m22 * v.z)
m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[2][3]
r0 = _simd_add_ps(r0, m); // (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
result[2] = r0;
result[3] = _simd_set1_ps(1.0f);
}
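//////////////////////////////////////////////////////////////////////////
/// Example usage (illustrative sketch): transforming a SIMD group of
/// vertex positions by a row-major 4x4 matrix; pMVP and vPosition are
/// hypothetical names assumed to be set up by the caller.
///
///     simdvector vClip;
///     _simd_mat4x4_vec3_w1_multiply(vClip, pMVP, vPosition);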
//////////////////////////////////////////////////////////////////////////
/// @brief Compute plane equation vA * vX + vB * vY + vC
INLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY)
{
simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
vOut = _simd_fmadd_ps(vB, vY, vOut);
return vOut;
}
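//////////////////////////////////////////////////////////////////////////
/// Example (illustrative sketch, not used by the rasterizer): evaluating a
/// depth plane z = A*x + B*y + C over a SIMD group of pixel coordinates
/// with two fused multiply-adds via vplaneps.
INLINE simdscalar ExamplePlaneEval(float A, float B, float C, simdscalar &vX, simdscalar &vY)
{
simdscalar vA = _simd_set1_ps(A);
simdscalar vB = _simd_set1_ps(B);
simdscalar vC = _simd_set1_ps(C);
return vplaneps(vA, vB, vC, vX, vY); // A*x + B*y + C per lane
}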
//////////////////////////////////////////////////////////////////////////
/// @brief Interpolates a single component.
/// @param vI - barycentric I
/// @param vJ - barycentric J
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
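/// @return Interpolated component: A*i + B*j + C*(1 - i - j) per SIMD lane.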
template<UINT Attrib, UINT Comp>
static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
{
const float *pInterpA = &pInterpBuffer[Attrib * 12 + 0 + Comp];
const float *pInterpB = &pInterpBuffer[Attrib * 12 + 4 + Comp];
const float *pInterpC = &pInterpBuffer[Attrib * 12 + 8 + Comp];
simdscalar vA = _simd_broadcast_ss(pInterpA);
simdscalar vB = _simd_broadcast_ss(pInterpB);
simdscalar vC = _simd_broadcast_ss(pInterpC);
simdscalar vk = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ);
vC = _simd_mul_ps(vk, vC);
return vplaneps(vA, vB, vC, vI, vJ);
}
#endif//__SWR_SIMDINTRIN_H__

View File

@ -0,0 +1,238 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#include "common/os.h"
#include <stdarg.h>
#include <stdio.h>
#include <assert.h>
#if SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
#if defined(_WIN32)
#pragma comment(lib, "user32.lib")
#endif // _WIN32
enum TextColor
{
TEXT_BLACK = 0,
TEXT_RED = 1,
TEXT_GREEN = 2,
TEXT_BLUE = 4,
TEXT_PURPLE = TEXT_RED | TEXT_BLUE,
TEXT_CYAN = TEXT_GREEN | TEXT_BLUE,
TEXT_YELLOW = TEXT_RED | TEXT_GREEN,
TEXT_WHITE = TEXT_RED | TEXT_GREEN | TEXT_BLUE,
};
enum TextStyle
{
TEXT_NORMAL = 0,
TEXT_INTENSITY = 1,
};
void SetTextColor(FILE* stream, TextColor color = TEXT_WHITE, TextStyle style = TEXT_NORMAL)
{
#if defined(_WIN32)
HANDLE hConsoleHandle = nullptr;
if (stream == stderr)
{
hConsoleHandle = GetStdHandle(STD_ERROR_HANDLE);
}
else if (stream == stdout)
{
hConsoleHandle = GetStdHandle(STD_OUTPUT_HANDLE);
}
else
{
// Not a console stream, do nothing
return;
}
WORD textAttributes = 0;
if (color & TEXT_RED)
{
textAttributes |= FOREGROUND_RED;
}
if (color & TEXT_GREEN)
{
textAttributes |= FOREGROUND_GREEN;
}
if (color & TEXT_BLUE)
{
textAttributes |= FOREGROUND_BLUE;
}
if (style & TEXT_INTENSITY)
{
textAttributes |= FOREGROUND_INTENSITY;
}
SetConsoleTextAttribute(hConsoleHandle, textAttributes);
#else // !_WIN32
// Print ANSI codes
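// TextColor bit values are chosen so that 30 + color is the matching ANSI
// SGR color code (e.g. 30 + TEXT_RED = 31); a non-normal style selects the
// bright 90-97 range and is also emitted as the SGR attribute below.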
uint32_t cc = 30 + (style ? 60 : 0) + color;
fprintf(stream, "\033[0m\033[%d;%dm", style, cc);
#endif
}
void ResetTextColor(FILE* stream)
{
#if defined(_WIN32)
SetTextColor(stream);
#else // !_WIN32
// Print ANSI codes
fprintf(stream, "\033[0m");
#endif
}
bool SwrAssert(
bool chkDebugger,
bool& enabled,
const char* pExpression,
const char* pFileName,
uint32_t lineNum,
const char* pFunction,
const char* pFmtString /* = nullptr */,
...)
{
if (!enabled) return false;
SetTextColor(stderr, TEXT_CYAN, TEXT_NORMAL);
fprintf(stderr, "%s(%d): ", pFileName, lineNum);
SetTextColor(stderr, TEXT_RED, TEXT_INTENSITY);
fprintf(stderr, "ASSERT: %s\n", pExpression);
SetTextColor(stderr, TEXT_CYAN, TEXT_INTENSITY);
fprintf(stderr, "\t%s\n", pFunction);
if (pFmtString)
{
SetTextColor(stderr, TEXT_YELLOW, TEXT_INTENSITY);
fprintf(stderr, "\t");
va_list args;
va_start(args, pFmtString);
vfprintf(stderr, pFmtString, args);
va_end(args);
fprintf(stderr, "\n");
}
ResetTextColor(stderr);
fflush(stderr);
#if defined(_WIN32)
static const int MAX_MESSAGE_LEN = 2048;
char msgBuf[MAX_MESSAGE_LEN];
sprintf_s(msgBuf, "%s(%d): ASSERT: %s\n", pFileName, lineNum, pExpression);
msgBuf[MAX_MESSAGE_LEN - 2] = '\n';
msgBuf[MAX_MESSAGE_LEN - 1] = 0;
OutputDebugStringA(msgBuf);
sprintf_s(msgBuf, "\t%s\n", pFunction);
msgBuf[MAX_MESSAGE_LEN - 2] = '\n';
msgBuf[MAX_MESSAGE_LEN - 1] = 0;
OutputDebugStringA(msgBuf);
int offset = 0;
if (pFmtString)
{
va_list args;
va_start(args, pFmtString);
offset = _vsnprintf_s(
msgBuf,
sizeof(msgBuf),
sizeof(msgBuf),
pFmtString,
args);
va_end(args);
if (offset < 0) { return true; }
OutputDebugStringA("\t");
OutputDebugStringA(msgBuf);
OutputDebugStringA("\n");
}
if (KNOB_ENABLE_ASSERT_DIALOGS)
{
int retval = sprintf_s(
&msgBuf[offset],
MAX_MESSAGE_LEN - offset,
"\n\n"
"File: %s\n"
"Line: %d\n"
"\n"
"Expression: %s\n\n"
"Cancel: Disable this assert for the remainder of the process\n"
"Try Again: Break into the debugger\n"
"Continue: Continue execution (but leave assert enabled)",
pFileName,
lineNum,
pExpression);
if (retval < 0) { return true; }
offset += retval;
if (!IsDebuggerPresent())
{
sprintf_s(
&msgBuf[offset],
MAX_MESSAGE_LEN - offset,
"\n\n*** NO DEBUGGER DETECTED ***\n\nPressing \"Try Again\" will cause a program crash!");
}
retval = MessageBoxA(nullptr, msgBuf, "Assert Failed", MB_CANCELTRYCONTINUE | MB_ICONEXCLAMATION);
switch (retval)
{
case IDCANCEL:
enabled = false;
return false;
case IDTRYAGAIN:
return true;
case IDCONTINUE:
return false;
}
}
else
{
return IsDebuggerPresent() || !chkDebugger;
}
#endif // _WIN32
return true;
}
#endif // SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS

View File

@ -0,0 +1,109 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#ifndef __SWR_ASSERT_H__
#define __SWR_ASSERT_H__
#if !defined(__SWR_OS_H__)
#error swr_assert.h should not be included directly, please include "common/os.h" instead.
#endif
#if !defined(SWR_ENABLE_ASSERTS)
#if !defined(NDEBUG)
#define SWR_ENABLE_ASSERTS 1
#else
#define SWR_ENABLE_ASSERTS 0
#endif // NDEBUG
#endif // SWR_ENABLE_ASSERTS
#if !defined(SWR_ENABLE_REL_ASSERTS)
#define SWR_ENABLE_REL_ASSERTS 1
#endif
#if SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
#include "assert.h"
#if !defined(__cplusplus)
#pragma message("C++ is required for SWR Asserts, falling back to assert.h")
#if SWR_ENABLE_ASSERTS
#define SWR_ASSERT(e, ...) assert(e)
#endif
#if SWR_ENABLE_REL_ASSERTS
#define SWR_REL_ASSERT(e, ...) assert(e)
#endif
#else
#if SWR_ENABLE_ASSERTS
#if defined(assert)
#undef assert
#endif
#define assert(exp) SWR_ASSERT(exp)
#endif
bool SwrAssert(
bool chkDebugger,
bool& enabled,
const char* pExpression,
const char* pFileName,
uint32_t lineNum,
const char* function,
const char* pFmtString = nullptr,
...);
#define _SWR_ASSERT(chkDebugger, e, ...) {\
bool expFailed = !(e);\
if (expFailed) {\
static bool swrAssertEnabled = true;\
expFailed = SwrAssert(chkDebugger, swrAssertEnabled, #e, __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__);\
if (expFailed) { DEBUGBREAK; }\
}\
}
#if SWR_ENABLE_ASSERTS
#define SWR_ASSERT(e, ...) _SWR_ASSERT(true, e, ##__VA_ARGS__)
#endif
#if SWR_ENABLE_REL_ASSERTS
#define SWR_REL_ASSERT(e, ...) _SWR_ASSERT(false, e, ##__VA_ARGS__)
#endif
#endif // C++
#endif // SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
#if !SWR_ENABLE_ASSERTS
#define SWR_ASSERT(e, ...)
#endif
#if !SWR_ENABLE_REL_ASSERTS
#define SWR_REL_ASSERT(e, ...)
#endif
#define SWR_NOT_IMPL SWR_ASSERT(0, "%s not implemented", __FUNCTION__)
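// Example usage (illustrative; pBuffer and slot are hypothetical names).
// The format string and arguments are optional and are only evaluated when
// the expression fails:
//     SWR_ASSERT(pBuffer != nullptr, "null vertex buffer in slot %d", slot);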
#endif//__SWR_ASSERT_H__

File diff suppressed because it is too large

View File

@ -0,0 +1,500 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file api.h
*
* @brief API definitions
*
******************************************************************************/
#ifndef __SWR_API_H__
#define __SWR_API_H__
#include "common/os.h"
#include <assert.h>
#include <vector>
#include "common/simdintrin.h"
#include "common/formats.h"
#include "core/utils.h"
#include "core/state.h"
///@todo place all the API functions into the 'swr' namespace.
typedef void(SWR_API *PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2, uint64_t data3);
//////////////////////////////////////////////////////////////////////////
/// @brief Function signature for load hot tiles
/// @param hPrivateContext - handle to private data
/// @param dstFormat - format of the hot tile
/// @param renderTargetIndex - render target to load; can be color, depth or stencil
/// @param x - destination x coordinate
/// @param y - destination y coordinate
/// @param pDstHotTile - pointer to the hot tile surface
typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstFormat,
SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pDstHotTile);
//////////////////////////////////////////////////////////////////////////
/// @brief Function signature for store hot tiles
/// @param hPrivateContext - handle to private data
/// @param srcFormat - format of the hot tile
/// @param renderTargetIndex - render target to store, can be color, depth or stencil
/// @param x - destination x coordinate
/// @param y - destination y coordinate
/// @param pSrcHotTile - pointer to the hot tile surface
typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, SWR_FORMAT srcFormat,
SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pSrcHotTile);
/// @brief Function signature for clearing from the hot tiles clear value
/// @param hPrivateContext - handle to private data
/// @param renderTargetIndex - render target to store, can be color, depth or stencil
/// @param x - destination x coordinate
/// @param y - destination y coordinate
/// @param pClearColor - pointer to the hot tile's clear value
typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext,
SWR_RENDERTARGET_ATTACHMENT rtIndex,
uint32_t x, uint32_t y, const float* pClearColor);
//////////////////////////////////////////////////////////////////////////
/// SWR_CREATECONTEXT_INFO
/////////////////////////////////////////////////////////////////////////
struct SWR_CREATECONTEXT_INFO
{
DRIVER_TYPE driver;
// External functions (e.g. sampler) need per draw context state.
// Use SwrGetPrivateContextState() to access private state.
uint32_t privateStateSize;
// Each SWR context can have multiple sets of active state
uint32_t maxSubContexts;
// tile manipulation functions
PFN_LOAD_TILE pfnLoadTile;
PFN_STORE_TILE pfnStoreTile;
PFN_CLEAR_TILE pfnClearTile;
};
//////////////////////////////////////////////////////////////////////////
/// SWR_RECT
/////////////////////////////////////////////////////////////////////////
struct SWR_RECT
{
uint32_t left;
uint32_t right;
uint32_t top;
uint32_t bottom;
};
//////////////////////////////////////////////////////////////////////////
/// @brief Create SWR Context.
/// @param pCreateInfo - pointer to creation info.
HANDLE SWR_API SwrCreateContext(
const SWR_CREATECONTEXT_INFO* pCreateInfo);
//////////////////////////////////////////////////////////////////////////
/// @brief Destroys SWR Context.
/// @param hContext - Handle passed back from SwrCreateContext
void SWR_API SwrDestroyContext(
HANDLE hContext);
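//////////////////////////////////////////////////////////////////////////
/// Example usage (illustrative sketch; MyPrivateState and the My*Tile
/// callbacks are hypothetical, caller-provided names):
///
///     SWR_CREATECONTEXT_INFO info = {};
///     info.privateStateSize = sizeof(MyPrivateState);
///     info.pfnLoadTile  = MyLoadTile;
///     info.pfnStoreTile = MyStoreTile;
///     info.pfnClearTile = MyClearTile;
///     HANDLE hContext = SwrCreateContext(&info);
///     // ... bind state, draw ...
///     SwrDestroyContext(hContext);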
//////////////////////////////////////////////////////////////////////////
/// @brief Set currently active state context
/// @param subContextIndex - value from 0 to
/// SWR_CREATECONTEXT_INFO.maxSubContexts. Defaults to 0.
void SWR_API SwrSetActiveSubContext(
HANDLE hContext,
uint32_t subContextIndex);
//////////////////////////////////////////////////////////////////////////
/// @brief Sync cmd. Executes the callback func when all rendering up to this sync
/// has been completed
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnFunc - pointer to callback function,
/// @param userData - user data to pass back
void SWR_API SwrSync(
HANDLE hContext,
PFN_CALLBACK_FUNC pfnFunc,
uint64_t userData,
uint64_t userData2,
uint64_t userData3 = 0);
//////////////////////////////////////////////////////////////////////////
/// @brief Blocks until all rendering has been completed.
/// @param hContext - Handle passed back from SwrCreateContext
void SWR_API SwrWaitForIdle(
HANDLE hContext);
//////////////////////////////////////////////////////////////////////////
/// @brief Set vertex buffer state.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param numBuffers - Number of vertex buffer state descriptors.
/// @param pVertexBuffers - Array of vertex buffer state descriptors.
void SWR_API SwrSetVertexBuffers(
HANDLE hContext,
uint32_t numBuffers,
const SWR_VERTEX_BUFFER_STATE* pVertexBuffers);
//////////////////////////////////////////////////////////////////////////
/// @brief Set index buffer
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pIndexBuffer - Index buffer.
void SWR_API SwrSetIndexBuffer(
HANDLE hContext,
const SWR_INDEX_BUFFER_STATE* pIndexBuffer);
//////////////////////////////////////////////////////////////////////////
/// @brief Set fetch shader pointer.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnFetchFunc - Pointer to shader.
void SWR_API SwrSetFetchFunc(
HANDLE hContext,
PFN_FETCH_FUNC pfnFetchFunc);
//////////////////////////////////////////////////////////////////////////
/// @brief Set streamout shader pointer.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnSoFunc - Pointer to shader.
/// @param streamIndex - specifies stream
void SWR_API SwrSetSoFunc(
HANDLE hContext,
PFN_SO_FUNC pfnSoFunc,
uint32_t streamIndex);
//////////////////////////////////////////////////////////////////////////
/// @brief Set streamout state
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pSoState - Pointer to streamout state.
void SWR_API SwrSetSoState(
HANDLE hContext,
SWR_STREAMOUT_STATE* pSoState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set streamout buffer state
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pSoBuffer - Pointer to streamout buffer.
/// @param slot - Slot to bind SO buffer to.
void SWR_API SwrSetSoBuffers(
HANDLE hContext,
SWR_STREAMOUT_BUFFER* pSoBuffer,
uint32_t slot);
//////////////////////////////////////////////////////////////////////////
/// @brief Set vertex shader pointer.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnVertexFunc - Pointer to shader.
void SWR_API SwrSetVertexFunc(
HANDLE hContext,
PFN_VERTEX_FUNC pfnVertexFunc);
//////////////////////////////////////////////////////////////////////////
/// @brief Set frontend state.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state
void SWR_API SwrSetFrontendState(
HANDLE hContext,
SWR_FRONTEND_STATE *pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set geometry shader state.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state
void SWR_API SwrSetGsState(
HANDLE hContext,
SWR_GS_STATE *pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set geometry shader
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to geometry shader function
void SWR_API SwrSetGsFunc(
HANDLE hContext,
PFN_GS_FUNC pfnGsFunc);
//////////////////////////////////////////////////////////////////////////
/// @brief Set compute shader
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to compute shader function
/// @param totalThreadsInGroup - product of thread group dimensions.
void SWR_API SwrSetCsFunc(
HANDLE hContext,
PFN_CS_FUNC pfnCsFunc,
uint32_t totalThreadsInGroup);
//////////////////////////////////////////////////////////////////////////
/// @brief Set tessellation state.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state
void SWR_API SwrSetTsState(
HANDLE hContext,
SWR_TS_STATE *pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set hull shader
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnFunc - Pointer to shader function
void SWR_API SwrSetHsFunc(
HANDLE hContext,
PFN_HS_FUNC pfnFunc);
//////////////////////////////////////////////////////////////////////////
/// @brief Set domain shader
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnFunc - Pointer to shader function
void SWR_API SwrSetDsFunc(
HANDLE hContext,
PFN_DS_FUNC pfnFunc);
//////////////////////////////////////////////////////////////////////////
/// @brief Set depth stencil state
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state.
void SWR_API SwrSetDepthStencilState(
HANDLE hContext,
SWR_DEPTH_STENCIL_STATE *pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set backend state
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state.
void SWR_API SwrSetBackendState(
HANDLE hContext,
SWR_BACKEND_STATE *pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set pixel shader state
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state.
void SWR_API SwrSetPixelShaderState(
HANDLE hContext,
SWR_PS_STATE *pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set blend state
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state.
void SWR_API SwrSetBlendState(
HANDLE hContext,
SWR_BLEND_STATE *pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set blend function
/// @param hContext - Handle passed back from SwrCreateContext
/// @param renderTarget - render target index
/// @param pfnBlendFunc - function pointer
void SWR_API SwrSetBlendFunc(
HANDLE hContext,
uint32_t renderTarget,
PFN_BLEND_JIT_FUNC pfnBlendFunc);
//////////////////////////////////////////////////////////////////////////
/// @brief Set linkage mask
/// @param hContext - Handle passed back from SwrCreateContext
/// @param mask - Specifies which vertex outputs are needed by PS.
/// @param pMap - (Optional)Linkage map to specify where FE attributes are
/// gathered from to supply PS attribute values. The length
/// of the map buffer needs to match the number of set bits
/// in "mask".
void SWR_API SwrSetLinkage(
HANDLE hContext,
uint32_t mask,
const uint8_t* pMap);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrDraw
/// @param hContext - Handle passed back from SwrCreateContext
/// @param topology - Specifies topology for draw.
/// @param startVertex - Specifies start vertex in vertex buffer for draw.
/// @param primCount - Number of vertices.
void SWR_API SwrDraw(
HANDLE hContext,
PRIMITIVE_TOPOLOGY topology,
uint32_t startVertex,
uint32_t primCount);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrDrawInstanced
/// @param hContext - Handle passed back from SwrCreateContext
/// @param topology - Specifies topology for draw.
/// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
/// @param numInstances - How many instances to render.
/// @param startVertex - Specifies start vertex for draw. (vertex data)
/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
void SWR_API SwrDrawInstanced(
HANDLE hContext,
PRIMITIVE_TOPOLOGY topology,
uint32_t numVertsPerInstance,
uint32_t numInstances,
uint32_t startVertex,
uint32_t startInstance);
//////////////////////////////////////////////////////////////////////////
/// @brief DrawIndexed
/// @param hContext - Handle passed back from SwrCreateContext
/// @param topology - Specifies topology for draw.
/// @param numIndices - Number of indices to read sequentially from index buffer.
/// @param indexOffset - Starting index into index buffer.
/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
void SWR_API SwrDrawIndexed(
HANDLE hContext,
PRIMITIVE_TOPOLOGY topology,
uint32_t numIndices,
uint32_t indexOffset,
int32_t baseVertex);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrDrawIndexedInstanced
/// @param hContext - Handle passed back from SwrCreateContext
/// @param topology - Specifies topology for draw.
/// @param numIndices - Number of indices to read sequentially from index buffer.
/// @param numInstances - Number of instances to render.
/// @param indexOffset - Starting index into index buffer.
/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
void SWR_API SwrDrawIndexedInstanced(
HANDLE hContext,
PRIMITIVE_TOPOLOGY topology,
uint32_t numIndices,
uint32_t numInstances,
uint32_t indexOffset,
int32_t baseVertex,
uint32_t startInstance);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrInvalidateTiles
/// @param hContext - Handle passed back from SwrCreateContext
/// @param attachmentMask - Mask specifying which of the attached hot-tile surfaces to invalidate.
void SWR_API SwrInvalidateTiles(
HANDLE hContext,
uint32_t attachmentMask);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrDispatch
/// @param hContext - Handle passed back from SwrCreateContext
/// @param threadGroupCountX - Number of thread groups dispatched in X direction
/// @param threadGroupCountY - Number of thread groups dispatched in Y direction
/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
void SWR_API SwrDispatch(
HANDLE hContext,
uint32_t threadGroupCountX,
uint32_t threadGroupCountY,
uint32_t threadGroupCountZ);
enum SWR_TILE_STATE
{
SWR_TILE_INVALID = 0, // tile is in uninitialized state and should be loaded with surface contents before rendering
SWR_TILE_DIRTY = 2, // tile contains newer data than the surface it represents
SWR_TILE_RESOLVED = 3, // tile is in sync with the surface it represents
};
/// @todo Add a good description for what attachments are and when and why you would use the different SWR_TILE_STATEs.
void SWR_API SwrStoreTiles(
HANDLE hContext,
SWR_RENDERTARGET_ATTACHMENT attachment,
SWR_TILE_STATE postStoreTileState);
void SWR_API SwrClearRenderTarget(
HANDLE hContext,
uint32_t clearMask,
const FLOAT clearColor[4],
float z,
BYTE stencil);
void SWR_API SwrSetRastState(
HANDLE hContext,
const SWR_RASTSTATE *pRastState);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrSetViewports
/// @param hContext - Handle passed back from SwrCreateContext
/// @param numViewports - number of viewports passed in
/// @param pViewports - Specifies extents of viewport.
/// @param pMatrices - If not specified then SWR computes a default one.
void SWR_API SwrSetViewports(
HANDLE hContext,
uint32_t numViewports,
const SWR_VIEWPORT* pViewports,
const SWR_VIEWPORT_MATRIX* pMatrices);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrSetScissorRects
/// @param hContext - Handle passed back from SwrCreateContext
/// @param numScissors - number of scissors passed in
/// @param pScissors - array of scissors
void SWR_API SwrSetScissorRects(
HANDLE hContext,
uint32_t numScissors,
const BBOX* pScissors);
//////////////////////////////////////////////////////////////////////////
/// @brief Returns a pointer to the private context state for the current
/// draw operation. This is used for external components such as the
/// sampler.
///
/// @note Client needs to resend private state prior to each draw call.
/// Also, SWR is responsible for the private state memory.
/// @param hContext - Handle passed back from SwrCreateContext
VOID* SWR_API SwrGetPrivateContextState(
HANDLE hContext);
//////////////////////////////////////////////////////////////////////////
/// @brief Clients can use this to allocate memory for draw/dispatch
/// operations. The memory will automatically be freed once operation
/// has completed. Client can use this to allocate binding tables,
/// etc. needed for shader execution.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param size - Size of allocation
/// @param align - Alignment needed for allocation.
VOID* SWR_API SwrAllocDrawContextMemory(
HANDLE hContext,
uint32_t size,
uint32_t align);
//////////////////////////////////////////////////////////////////////////
/// @brief Returns pointer to SWR stats.
/// @note The counters are incremented by multiple threads.
/// When calling this, you need to ensure all previous operations
/// have completed.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pStats - SWR will fill this out for caller.
void SWR_API SwrGetStats(
HANDLE hContext,
SWR_STATS* pStats);
//////////////////////////////////////////////////////////////////////////
/// @brief Enables stats counting
/// @param hContext - Handle passed back from SwrCreateContext
/// @param enable - If true then counts are incremented.
void SWR_API SwrEnableStats(
HANDLE hContext,
bool enable);
//////////////////////////////////////////////////////////////////////////
/// @brief Mark end of frame - used for performance profiling
/// @param hContext - Handle passed back from SwrCreateContext
void SWR_API SwrEndFrame(
HANDLE hContext);
#endif//__SWR_API_H__

View File

@ -0,0 +1,166 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file arena.cpp
*
* @brief Arena memory manager
* The arena is convenient and fast for managing allocations that are
* associated with an operation and can all be freed once the operation has
* completed. Allocations are cheap since most of the time they are simply
* an increment of an offset. There is also no need to free individual
* allocations; all of the arena memory can be freed at once.
*
******************************************************************************/
#include "context.h"
#include "arena.h"
#include <cmath>
#include <algorithm> // std::max
Arena::Arena()
: m_pCurBlock(nullptr), m_size(0)
{
m_pMutex = new std::mutex();
}
Arena::~Arena()
{
Reset(); // Reset just in case to avoid leaking memory.
if (m_pCurBlock)
{
_aligned_free(m_pCurBlock->pMem);
delete m_pCurBlock;
}
delete m_pMutex;
}
///@todo Remove this when all users have stopped using this.
void Arena::Init()
{
m_size = 0;
m_pCurBlock = nullptr;
m_pMutex = new std::mutex();
}
void* Arena::AllocAligned(size_t size, size_t align)
{
if (m_pCurBlock)
{
ArenaBlock* pCurBlock = m_pCurBlock;
pCurBlock->offset = AlignUp(pCurBlock->offset, align);
if ((pCurBlock->offset + size) <= pCurBlock->blockSize)
{
void* pMem = PtrAdd(pCurBlock->pMem, pCurBlock->offset);
pCurBlock->offset += size;
m_size += size;
return pMem;
}
// Not enough memory in this block, fall through to allocate
// a new block
}
static const size_t ArenaBlockSize = 1024*1024;
size_t blockSize = std::max(m_size + ArenaBlockSize, std::max(size, ArenaBlockSize));
blockSize = AlignUp(blockSize, KNOB_SIMD_WIDTH*4);
void *pMem = _aligned_malloc(blockSize, KNOB_SIMD_WIDTH*4); // Arena blocks are always simd byte aligned.
SWR_ASSERT(pMem != nullptr);
ArenaBlock* pNewBlock = new (std::nothrow) ArenaBlock();
SWR_ASSERT(pNewBlock != nullptr);
if (pNewBlock != nullptr)
{
pNewBlock->pNext = m_pCurBlock;
m_pCurBlock = pNewBlock;
m_pCurBlock->pMem = pMem;
m_pCurBlock->blockSize = blockSize;
}
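// Allocate out of the block just added; the new block is sized to satisfy
// this request, so the recursion terminates after one level.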
return AllocAligned(size, align);
}
void* Arena::Alloc(size_t size)
{
return AllocAligned(size, 1);
}
void* Arena::AllocAlignedSync(size_t size, size_t align)
{
void* pAlloc = nullptr;
SWR_ASSERT(m_pMutex != nullptr);
m_pMutex->lock();
pAlloc = AllocAligned(size, align);
m_pMutex->unlock();
return pAlloc;
}
void* Arena::AllocSync(size_t size)
{
void* pAlloc = nullptr;
SWR_ASSERT(m_pMutex != nullptr);
m_pMutex->lock();
pAlloc = Alloc(size);
m_pMutex->unlock();
return pAlloc;
}
void Arena::Reset(bool removeAll)
{
if (m_pCurBlock)
{
m_pCurBlock->offset = 0;
ArenaBlock *pUsedBlocks = m_pCurBlock->pNext;
m_pCurBlock->pNext = nullptr;
while(pUsedBlocks)
{
ArenaBlock* pBlock = pUsedBlocks;
pUsedBlocks = pBlock->pNext;
_aligned_free(pBlock->pMem);
delete pBlock;
}
if (removeAll)
{
_aligned_free(m_pCurBlock->pMem);
delete m_pCurBlock;
m_pCurBlock = nullptr;
}
}
m_size = 0;
}

View File

@ -0,0 +1,69 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file arena.h
*
* @brief Arena memory manager
* The arena is convenient and fast for managing allocations that are
* associated with an operation and can all be freed once the operation has
* completed. Allocations are cheap since most of the time they are simply
* an increment of an offset. There is also no need to free individual
* allocations; all of the arena memory can be freed at once.
*
******************************************************************************/
#pragma once
#include <mutex>
class Arena
{
public:
Arena();
~Arena();
void Init();
void* AllocAligned(size_t size, size_t align);
void* Alloc(size_t size);
void* AllocAlignedSync(size_t size, size_t align);
void* AllocSync(size_t size);
void Reset(bool removeAll = false);
size_t Size() { return m_size; }
private:
struct ArenaBlock
{
void* pMem = nullptr;
size_t blockSize = 0;
size_t offset = 0;
ArenaBlock* pNext = nullptr;
};
ArenaBlock* m_pCurBlock = nullptr;
size_t m_size = 0;
/// @note Mutex is only used by sync allocation functions.
std::mutex* m_pMutex;
};
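// Example usage (illustrative sketch):
//     Arena arena;
//     void* pMem = arena.AllocAligned(256, 64); // cheap bump-pointer allocation
//     // ... pMem stays valid for the lifetime of the operation ...
//     arena.Reset(); // releases every allocation made above at once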

File diff suppressed because it is too large

View File

@ -0,0 +1,59 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file backend.h
*
* @brief Backend handles rasterization, pixel shading and output merger
* operations.
*
******************************************************************************/
#pragma once
#include "common/os.h"
#include "core/context.h"
void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId);
void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers);
void InitClearTilesTable();
enum SWR_BACKEND_FUNCS
{
SWR_BACKEND_SINGLE_SAMPLE,
SWR_BACKEND_MSAA_PIXEL_RATE,
SWR_BACKEND_MSAA_SAMPLE_RATE,
SWR_BACKEND_FUNCS_MAX,
};
void InitBackendFuncTables();
extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
extern PFN_BACKEND_FUNC gBackendSingleSample[2][2];
extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2];
extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX];
extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2];

View File

@ -0,0 +1,318 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file blend.cpp
*
* @brief Implementation for blending operations.
*
******************************************************************************/
#include "state.h"
template<bool Color, bool Alpha>
INLINE
void GenerateBlendFactor(SWR_BLEND_FACTOR func, simdvector &constantColor, simdvector &src, simdvector &src1, simdvector &dst, simdvector &out)
{
simdvector result;
switch (func)
{
case BLENDFACTOR_ZERO:
result.x = _simd_setzero_ps();
result.y = _simd_setzero_ps();
result.z = _simd_setzero_ps();
result.w = _simd_setzero_ps();
break;
case BLENDFACTOR_ONE:
result.x = _simd_set1_ps(1.0);
result.y = _simd_set1_ps(1.0);
result.z = _simd_set1_ps(1.0);
result.w = _simd_set1_ps(1.0);
break;
case BLENDFACTOR_SRC_COLOR:
result = src;
break;
case BLENDFACTOR_DST_COLOR:
result = dst;
break;
case BLENDFACTOR_INV_SRC_COLOR:
result.x = _simd_sub_ps(_simd_set1_ps(1.0), src.x);
result.y = _simd_sub_ps(_simd_set1_ps(1.0), src.y);
result.z = _simd_sub_ps(_simd_set1_ps(1.0), src.z);
result.w = _simd_sub_ps(_simd_set1_ps(1.0), src.w);
break;
case BLENDFACTOR_INV_DST_COLOR:
result.x = _simd_sub_ps(_simd_set1_ps(1.0), dst.x);
result.y = _simd_sub_ps(_simd_set1_ps(1.0), dst.y);
result.z = _simd_sub_ps(_simd_set1_ps(1.0), dst.z);
result.w = _simd_sub_ps(_simd_set1_ps(1.0), dst.w);
break;
case BLENDFACTOR_SRC_ALPHA:
result.x = src.w;
result.y = src.w;
result.z = src.w;
result.w = src.w;
break;
case BLENDFACTOR_INV_SRC_ALPHA:
{
simdscalar oneMinusSrcA = _simd_sub_ps(_simd_set1_ps(1.0), src.w);
result.x = oneMinusSrcA;
result.y = oneMinusSrcA;
result.z = oneMinusSrcA;
result.w = oneMinusSrcA;
break;
}
case BLENDFACTOR_DST_ALPHA:
result.x = dst.w;
result.y = dst.w;
result.z = dst.w;
result.w = dst.w;
break;
case BLENDFACTOR_INV_DST_ALPHA:
{
simdscalar oneMinusDstA = _simd_sub_ps(_simd_set1_ps(1.0), dst.w);
result.x = oneMinusDstA;
result.y = oneMinusDstA;
result.z = oneMinusDstA;
result.w = oneMinusDstA;
break;
}
case BLENDFACTOR_SRC_ALPHA_SATURATE:
{
simdscalar sat = _simd_min_ps(src.w, _simd_sub_ps(_simd_set1_ps(1.0), dst.w));
result.x = sat;
result.y = sat;
result.z = sat;
result.w = _simd_set1_ps(1.0);
break;
}
case BLENDFACTOR_CONST_COLOR:
result.x = constantColor[0];
result.y = constantColor[1];
result.z = constantColor[2];
result.w = constantColor[3];
break;
case BLENDFACTOR_CONST_ALPHA:
result.x = result.y = result.z = result.w = constantColor[3];
break;
case BLENDFACTOR_INV_CONST_COLOR:
{
result.x = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[0]);
result.y = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[1]);
result.z = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[2]);
result.w = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]);
break;
}
case BLENDFACTOR_INV_CONST_ALPHA:
{
result.x = result.y = result.z = result.w = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]);
break;
}
case BLENDFACTOR_SRC1_COLOR:
result.x = src1.x;
result.y = src1.y;
result.z = src1.z;
result.w = src1.w;
break;
case BLENDFACTOR_SRC1_ALPHA:
result.x = result.y = result.z = result.w = src1.w;
break;
case BLENDFACTOR_INV_SRC1_COLOR:
result.x = _simd_sub_ps(_simd_set1_ps(1.0f), src1.x);
result.y = _simd_sub_ps(_simd_set1_ps(1.0f), src1.y);
result.z = _simd_sub_ps(_simd_set1_ps(1.0f), src1.z);
result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w);
break;
case BLENDFACTOR_INV_SRC1_ALPHA:
result.x = result.y = result.z = result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w);
break;
default: SWR_ASSERT(false, "Unimplemented blend factor: %d", func);
}
if (Color)
{
out.x = result.x;
out.y = result.y;
out.z = result.z;
}
if (Alpha)
{
out.w = result.w;
}
}
template<bool Color, bool Alpha>
INLINE void BlendFunc(SWR_BLEND_OP blendOp, simdvector &src, simdvector &srcFactor, simdvector &dst, simdvector &dstFactor, simdvector &out)
{
simdvector result;
switch (blendOp)
{
case BLENDOP_ADD:
result.x = _simd_fmadd_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x));
result.y = _simd_fmadd_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y));
result.z = _simd_fmadd_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z));
result.w = _simd_fmadd_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w));
break;
case BLENDOP_SUBTRACT:
result.x = _simd_fmsub_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x));
result.y = _simd_fmsub_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y));
result.z = _simd_fmsub_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z));
result.w = _simd_fmsub_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w));
break;
case BLENDOP_REVSUBTRACT:
result.x = _simd_fmsub_ps(dstFactor.x, dst.x, _simd_mul_ps(srcFactor.x, src.x));
result.y = _simd_fmsub_ps(dstFactor.y, dst.y, _simd_mul_ps(srcFactor.y, src.y));
result.z = _simd_fmsub_ps(dstFactor.z, dst.z, _simd_mul_ps(srcFactor.z, src.z));
result.w = _simd_fmsub_ps(dstFactor.w, dst.w, _simd_mul_ps(srcFactor.w, src.w));
break;
case BLENDOP_MIN:
result.x = _simd_min_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x));
result.y = _simd_min_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y));
result.z = _simd_min_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z));
result.w = _simd_min_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w));
break;
case BLENDOP_MAX:
result.x = _simd_max_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x));
result.y = _simd_max_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y));
result.z = _simd_max_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z));
result.w = _simd_max_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w));
break;
default:
SWR_ASSERT(false, "Unimplemented blend function: %d", blendOp);
}
if (Color)
{
out.x = result.x;
out.y = result.y;
out.z = result.z;
}
if (Alpha)
{
out.w = result.w;
}
}
template<SWR_TYPE type>
INLINE void Clamp(simdvector &src)
{
switch (type)
{
case SWR_TYPE_FLOAT:
break;
case SWR_TYPE_UNORM:
src.x = _simd_max_ps(src.x, _simd_setzero_ps());
src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f));
src.y = _simd_max_ps(src.y, _simd_setzero_ps());
src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f));
src.z = _simd_max_ps(src.z, _simd_setzero_ps());
src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f));
src.w = _simd_max_ps(src.w, _simd_setzero_ps());
src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f));
break;
case SWR_TYPE_SNORM:
src.x = _simd_max_ps(src.x, _simd_set1_ps(-1.0f));
src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f));
src.y = _simd_max_ps(src.y, _simd_set1_ps(-1.0f));
src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f));
src.z = _simd_max_ps(src.z, _simd_set1_ps(-1.0f));
src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f));
src.w = _simd_max_ps(src.w, _simd_set1_ps(-1.0f));
src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f));
break;
default:
SWR_ASSERT(false, "Unimplemented clamp: %d", type);
break;
}
}
template<SWR_TYPE type>
void Blend(const SWR_BLEND_STATE *pBlendState, const SWR_RENDER_TARGET_BLEND_STATE *pState, simdvector &src, simdvector& src1, BYTE *pDst, simdvector &result)
{
// load render target
simdvector dst;
LoadSOA<KNOB_COLOR_HOT_TILE_FORMAT>(pDst, dst);
simdvector constColor;
constColor.x = _simd_broadcast_ss(&pBlendState->constantColor[0]);
constColor.y = _simd_broadcast_ss(&pBlendState->constantColor[1]);
constColor.z = _simd_broadcast_ss(&pBlendState->constantColor[2]);
constColor.w = _simd_broadcast_ss(&pBlendState->constantColor[3]);
// clamp src/dst/constant
Clamp<type>(src);
Clamp<type>(src1);
Clamp<type>(dst);
Clamp<type>(constColor);
simdvector srcFactor, dstFactor;
if (pBlendState->independentAlphaBlendEnable)
{
GenerateBlendFactor<true, false>((SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor);
GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)pState->sourceAlphaBlendFactor, constColor, src, src1, dst, srcFactor);
GenerateBlendFactor<true, false>((SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor);
GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)pState->destAlphaBlendFactor, constColor, src, src1, dst, dstFactor);
BlendFunc<true, false>((SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result);
BlendFunc<false, true>((SWR_BLEND_OP)pState->alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
}
else
{
GenerateBlendFactor<true, true>((SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor);
GenerateBlendFactor<true, true>((SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor);
BlendFunc<true, true>((SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result);
}
}
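// Example (illustrative): classic "source over" alpha blending corresponds to
//     sourceBlendFactor = BLENDFACTOR_SRC_ALPHA
//     destBlendFactor   = BLENDFACTOR_INV_SRC_ALPHA
//     colorBlendFunc    = BLENDOP_ADD
// which yields result = src * src.a + dst * (1 - src.a) for each channel.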

View File

@ -0,0 +1,201 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file clip.cpp
*
* @brief Implementation for clipping
*
******************************************************************************/
#include <assert.h>
#include "common/os.h"
#include "core/clip.h"
float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1)
{
return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1));
}
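// Illustrative example (not part of the original source): using
// ComputeInterpFactor to clip one edge against the DX near plane z = 0,
// where the boundary coordinate is simply z. With v1.z = -1.0f (outside)
// and v2.z = 3.0f (inside), t = -1 / (-1 - 3) = 0.25, so the interpolated
// point lands exactly on z = 0.
static inline void ExampleClipEdgeToNearDX(const float v1[4], const float v2[4], float out[4])
{
    float t = ComputeInterpFactor(v1[2], v2[2]);
    for (int j = 0; j < 4; ++j)
    {
        out[j] = v1[j] + (v2[j] - v1[j]) * t;
    }
}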
template<SWR_CLIPCODES ClippingPlane>
inline void intersect(
int s, // index to first edge vertex v0 in pInPts.
int p, // index to second edge vertex v1 in pInPts.
const float *pInPts, // array of all the input positions.
const float *pInAttribs, // array of all attributes for all vertices. The attributes for each vertex are contiguous.
int numInAttribs, // number of attributes per vertex.
int i, // output index.
float *pOutPts, // array of output positions. We'll write our new intersection point at i*4.
float *pOutAttribs) // array of output attributes. We'll write our new attributes at i*numInAttribs.
{
float t;
// Find the parameter of the intersection.
// t = (v1.w - v1.x) / ((v2.x - v1.x) - (v2.w - v1.w)) for x = w (RIGHT) plane, etc.
const float *v1 = &pInPts[s*4];
const float *v2 = &pInPts[p*4];
switch (ClippingPlane)
{
case FRUSTUM_LEFT: t = ComputeInterpFactor(v1[3] + v1[0], v2[3] + v2[0]); break;
case FRUSTUM_RIGHT: t = ComputeInterpFactor(v1[3] - v1[0], v2[3] - v2[0]); break;
case FRUSTUM_TOP: t = ComputeInterpFactor(v1[3] + v1[1], v2[3] + v2[1]); break;
case FRUSTUM_BOTTOM: t = ComputeInterpFactor(v1[3] - v1[1], v2[3] - v2[1]); break;
case FRUSTUM_NEAR: t = ComputeInterpFactor(v1[2], v2[2]); break;
case FRUSTUM_FAR: t = ComputeInterpFactor(v1[3] - v1[2], v2[3] - v2[2]); break;
default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
};
const float *a1 = &pInAttribs[s*numInAttribs];
const float *a2 = &pInAttribs[p*numInAttribs];
float *pOutP = &pOutPts[i*4];
float *pOutA = &pOutAttribs[i*numInAttribs];
// Interpolate new position.
for(int j = 0; j < 4; ++j)
{
pOutP[j] = v1[j] + (v2[j]-v1[j])*t;
}
// Interpolate Attributes
for(int attr = 0; attr < numInAttribs; ++attr)
{
pOutA[attr] = a1[attr] + (a2[attr]-a1[attr])*t;
}
}
// Checks whether vertex v lies on the inside of the given clipping plane.
// In homogeneous coordinates this checks -w <= {x,y} <= w and 0 <= z <= w
// (the near plane uses the DX convention z >= 0).
//
template<SWR_CLIPCODES ClippingPlane>
inline int inside(const float v[4])
{
switch (ClippingPlane)
{
case FRUSTUM_LEFT : return (v[0]>=-v[3]);
case FRUSTUM_RIGHT : return (v[0]<= v[3]);
case FRUSTUM_TOP : return (v[1]>=-v[3]);
case FRUSTUM_BOTTOM : return (v[1]<= v[3]);
case FRUSTUM_NEAR : return (v[2]>=0.0f);
case FRUSTUM_FAR : return (v[2]<= v[3]);
default:
SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
return 0;
}
}
// Clips a polygon in homogeneous coordinates to a particular clipping plane.
// Takes in vertices of the polygon (InPts) and the clipping plane
// Puts the vertices of the clipped polygon in OutPts
// Returns number of points in clipped polygon
//
template<SWR_CLIPCODES ClippingPlane>
int ClipTriToPlane( const float *pInPts, int numInPts,
const float *pInAttribs, int numInAttribs,
float *pOutPts, float *pOutAttribs)
{
int i=0; // number of vertices written to OutPts so far (i*4 position floats).
for (int j = 0; j < numInPts; ++j)
{
int s = j;
int p = (j + 1) % numInPts;
int s_in = inside<ClippingPlane>(&pInPts[s*4]);
int p_in = inside<ClippingPlane>(&pInPts[p*4]);
// test if vertex is to be added to output vertices
if (s_in != p_in) // edge crosses clipping plane
{
// find point of intersection
intersect<ClippingPlane>(s, p, pInPts, pInAttribs, numInAttribs, i, pOutPts, pOutAttribs);
i++;
}
if (p_in) // 2nd vertex is inside clipping volume, add it to output
{
// Copy 2nd vertex position of edge over to output.
for(int k = 0; k < 4; ++k)
{
pOutPts[i*4 + k] = pInPts[p*4 + k];
}
// Copy 2nd vertex attributes of edge over to output.
for(int attr = 0; attr < numInAttribs; ++attr)
{
pOutAttribs[i*numInAttribs+attr] = pInAttribs[p*numInAttribs+attr];
}
i++;
}
// edge does not cross clipping plane and vertex outside clipping volume
// => do not add vertex
}
return i;
}
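// Worked example (illustrative, not part of the original source): clipping a
// triangle (v0, v1, v2) where only v1 lies outside the plane. The loop above
// visits edges (v0,v1), (v1,v2), (v2,v0):
//   v0->v1 crosses       -> emit intersection point
//   v1->v2 crosses       -> emit intersection point, then emit v2 (inside)
//   v2->v0 stays inside  -> emit v0
// yielding the expected 4-vertex polygon for a triangle that loses one corner.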
void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles, int *numVerts, float *pOutAttribs)
{
// temp storage to hold at least 6 vertices (positions plus attributes), the max number that can be created during clipping
OSALIGN(float, 16) tempPts[6 * 4];
OSALIGN(float, 16) tempAttribs[6 * KNOB_NUM_ATTRIBUTES * 4];
// we opt to clip to viewport frustum to produce smaller triangles for rasterization precision
int NumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pTriangle, 3, pAttribs, numAttribs, tempPts, tempAttribs);
NumOutPts = ClipTriToPlane<FRUSTUM_FAR>(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs);
NumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pOutTriangles, NumOutPts, pOutAttribs, numAttribs, tempPts, tempAttribs);
NumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs);
NumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pOutTriangles, NumOutPts, pOutAttribs, numAttribs, tempPts, tempAttribs);
NumOutPts = ClipTriToPlane<FRUSTUM_TOP>(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs);
SWR_ASSERT(NumOutPts <= 6);
*numVerts = NumOutPts;
return;
}
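// Illustrative usage sketch (hypothetical, not part of the original source):
// clipping a single homogeneous-space triangle. Clip() emits at most 6
// vertices describing a convex polygon, which a caller would fan into
// triangles (0, i, i+1) for i in [1, numVerts - 2].
static void ExampleClipOneTriangle(const float* pPos, const float* pAttribs, int numAttribs)
{
    OSALIGN(float, 16) outPos[6 * 4];
    OSALIGN(float, 16) outAttribs[6 * KNOB_NUM_ATTRIBUTES * 4];
    int numVerts = 0;
    Clip(pPos, pAttribs, numAttribs, outPos, &numVerts, outAttribs);
    SWR_ASSERT(numVerts <= 6);
}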
void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
{
RDTSC_START(FEClipTriangles);
Clipper<3> clipper(workerId, pDC);
clipper.ExecuteStage(pa, prims, primMask, primId);
RDTSC_STOP(FEClipTriangles, 1, 0);
}
void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
{
RDTSC_START(FEClipLines);
Clipper<2> clipper(workerId, pDC);
clipper.ExecuteStage(pa, prims, primMask, primId);
RDTSC_STOP(FEClipLines, 1, 0);
}
void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
{
RDTSC_START(FEClipPoints);
Clipper<1> clipper(workerId, pDC);
clipper.ExecuteStage(pa, prims, primMask, primId);
RDTSC_STOP(FEClipPoints, 1, 0);
}

View File

@ -0,0 +1,868 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file clip.h
*
* @brief Definitions for clipping
*
******************************************************************************/
#pragma once
#include "common/simdintrin.h"
#include "core/context.h"
#include "core/pa.h"
#include "rdtsc_core.h"
enum SWR_CLIPCODES
{
// Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
// Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes.
#define CLIPCODE_SHIFT 23
FRUSTUM_LEFT = (0x01 << CLIPCODE_SHIFT),
FRUSTUM_TOP = (0x02 << CLIPCODE_SHIFT),
FRUSTUM_RIGHT = (0x04 << CLIPCODE_SHIFT),
FRUSTUM_BOTTOM = (0x08 << CLIPCODE_SHIFT),
FRUSTUM_NEAR = (0x10 << CLIPCODE_SHIFT),
FRUSTUM_FAR = (0x20 << CLIPCODE_SHIFT),
NEGW = (0x40 << CLIPCODE_SHIFT),
GUARDBAND_LEFT = (0x80 << CLIPCODE_SHIFT | 0x1),
GUARDBAND_TOP = (0x80 << CLIPCODE_SHIFT | 0x2),
GUARDBAND_RIGHT = (0x80 << CLIPCODE_SHIFT | 0x4),
GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8)
};
#define FRUSTUM_CLIP_MASK (FRUSTUM_LEFT|FRUSTUM_TOP|FRUSTUM_RIGHT|FRUSTUM_BOTTOM|FRUSTUM_NEAR|FRUSTUM_FAR)
#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
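// Illustrative check (not part of the original source): with CLIPCODE_SHIFT
// of 23 each frustum code occupies the float exponent field, so codes OR'd
// together in float registers (see ComputeClipCodes below) can never form a
// denormalized value. FRUSTUM_LEFT reinterpreted as a float is 0x00800000,
// the smallest normalized single-precision number.
static_assert(FRUSTUM_LEFT == 0x00800000, "clip codes must sit at or above the float mantissa");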
void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles,
int *numVerts, float *pOutAttribs);
INLINE
void ComputeClipCodes(DRIVER_TYPE type, const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes)
{
clipCodes = _simd_setzero_ps();
// -w
simdscalar vNegW = _simd_mul_ps(vertex.w, _simd_set1_ps(-1.0f));
// FRUSTUM_LEFT
simdscalar vRes = _simd_cmplt_ps(vertex.x, vNegW);
clipCodes = _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT)));
// FRUSTUM_TOP
vRes = _simd_cmplt_ps(vertex.y, vNegW);
clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP))));
// FRUSTUM_RIGHT
vRes = _simd_cmpgt_ps(vertex.x, vertex.w);
clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_RIGHT))));
// FRUSTUM_BOTTOM
vRes = _simd_cmpgt_ps(vertex.y, vertex.w);
clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_BOTTOM))));
if (state.rastState.depthClipEnable)
{
// FRUSTUM_NEAR
// DX clips depth [0..w], GL clips [-w..w]
if (type == DX)
{
vRes = _simd_cmplt_ps(vertex.z, _simd_setzero_ps());
}
else
{
vRes = _simd_cmplt_ps(vertex.z, vNegW);
}
clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_NEAR))));
// FRUSTUM_FAR
vRes = _simd_cmpgt_ps(vertex.z, vertex.w);
clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_FAR))));
}
// NEGW
vRes = _simd_cmple_ps(vertex.w, _simd_setzero_ps());
clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(NEGW))));
// GUARDBAND_LEFT
simdscalar gbMult = _simd_mul_ps(vNegW, _simd_set1_ps(state.gbState.left));
vRes = _simd_cmplt_ps(vertex.x, gbMult);
clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_LEFT))));
// GUARDBAND_TOP
gbMult = _simd_mul_ps(vNegW, _simd_set1_ps(state.gbState.top));
vRes = _simd_cmplt_ps(vertex.y, gbMult);
clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_TOP))));
// GUARDBAND_RIGHT
gbMult = _simd_mul_ps(vertex.w, _simd_set1_ps(state.gbState.right));
vRes = _simd_cmpgt_ps(vertex.x, gbMult);
clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_RIGHT))));
// GUARDBAND_BOTTOM
gbMult = _simd_mul_ps(vertex.w, _simd_set1_ps(state.gbState.bottom));
vRes = _simd_cmpgt_ps(vertex.y, gbMult);
clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM))));
}
template<uint32_t NumVertsPerPrim>
class Clipper
{
public:
Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
workerId(in_workerId), driverType(in_pDC->pContext->driverType), pDC(in_pDC), state(GetApiState(in_pDC))
{
static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
}
void ComputeClipCodes(simdvector vertex[])
{
for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
{
::ComputeClipCodes(this->driverType, this->state, vertex[i], this->clipCodes[i]);
}
}
simdscalar ComputeClipCodeIntersection()
{
simdscalar result = this->clipCodes[0];
for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
{
result = _simd_and_ps(result, this->clipCodes[i]);
}
return result;
}
simdscalar ComputeClipCodeUnion()
{
simdscalar result = this->clipCodes[0];
for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
{
result = _simd_or_ps(result, this->clipCodes[i]);
}
return result;
}
int ComputeNegWMask()
{
simdscalar clipCodeUnion = ComputeClipCodeUnion();
clipCodeUnion = _simd_and_ps(clipCodeUnion, _simd_castsi_ps(_simd_set1_epi32(NEGW)));
return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion, _simd_setzero_ps()));
}
int ComputeClipMask()
{
simdscalar clipUnion = ComputeClipCodeUnion();
clipUnion = _simd_and_ps(clipUnion, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK)));
return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion, _simd_setzero_ps()));
}
// clipper is responsible for culling any prims with NAN coordinates
int ComputeNaNMask(simdvector prim[])
{
simdscalar vNanMask = _simd_setzero_ps();
for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
{
simdscalar vNan01 = _simd_cmp_ps(prim[e].v[0], prim[e].v[1], _CMP_UNORD_Q);
vNanMask = _simd_or_ps(vNanMask, vNan01);
simdscalar vNan23 = _simd_cmp_ps(prim[e].v[2], prim[e].v[3], _CMP_UNORD_Q);
vNanMask = _simd_or_ps(vNanMask, vNan23);
}
return _simd_movemask_ps(vNanMask);
}
int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[])
{
uint8_t cullMask = this->state.rastState.cullDistanceMask;
simdscalar vClipCullMask = _simd_setzero_ps();
DWORD index;
simdvector vClipCullDistLo[3];
simdvector vClipCullDistHi[3];
pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);
while (_BitScanForward(&index, cullMask))
{
cullMask &= ~(1 << index);
uint32_t slot = index >> 2;
uint32_t component = index & 0x3;
simdscalar vCullMaskElem = _simd_set1_ps(-1.0f);
for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
{
simdscalar vCullComp;
if (slot == 0)
{
vCullComp = vClipCullDistLo[e][component];
}
else
{
vCullComp = vClipCullDistHi[e][component];
}
// cull if cull distance < 0 || NAN
simdscalar vCull = _simd_cmp_ps(_mm256_setzero_ps(), vCullComp, _CMP_NLE_UQ);
vCullMaskElem = _simd_and_ps(vCullMaskElem, vCull);
}
vClipCullMask = _simd_or_ps(vClipCullMask, vCullMaskElem);
}
// clipper should also discard any primitive with NAN clip distance
uint8_t clipMask = this->state.rastState.clipDistanceMask;
while (_BitScanForward(&index, clipMask))
{
clipMask &= ~(1 << index);
uint32_t slot = index >> 2;
uint32_t component = index & 0x3;
for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
{
simdscalar vClipComp;
if (slot == 0)
{
vClipComp = vClipCullDistLo[e][component];
}
else
{
vClipComp = vClipCullDistHi[e][component];
}
simdscalar vClip = _simd_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q);
vClipCullMask = _simd_or_ps(vClipCullMask, vClip);
}
}
return _simd_movemask_ps(vClipCullMask);
}
// clip a single primitive
int ClipScalar(PA_STATE& pa, uint32_t primIndex, float* pOutPos, float* pOutAttribs)
{
OSALIGN(float, 16) inVerts[3 * 4];
OSALIGN(float, 16) inAttribs[3 * KNOB_NUM_ATTRIBUTES * 4];
// transpose primitive position
__m128 verts[3];
pa.AssembleSingle(VERTEX_POSITION_SLOT, primIndex, verts);
_mm_store_ps(&inVerts[0], verts[0]);
_mm_store_ps(&inVerts[4], verts[1]);
_mm_store_ps(&inVerts[8], verts[2]);
// transpose attribs
uint32_t numScalarAttribs = this->state.linkageCount * 4;
int idx = 0;
DWORD slot = 0;
uint32_t mapIdx = 0;
uint32_t tmpLinkage = uint32_t(this->state.linkageMask);
while (_BitScanForward(&slot, tmpLinkage))
{
tmpLinkage &= ~(1 << slot);
// Compute absolute attrib slot in vertex array
uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + this->state.linkageMap[mapIdx++];
__m128 attrib[3]; // triangle attribs (always 4 wide)
pa.AssembleSingle(inputSlot, primIndex, attrib);
_mm_store_ps(&inAttribs[idx], attrib[0]);
_mm_store_ps(&inAttribs[idx + numScalarAttribs], attrib[1]);
_mm_store_ps(&inAttribs[idx + numScalarAttribs * 2], attrib[2]);
idx += 4;
}
int numVerts;
Clip(inVerts, inAttribs, numScalarAttribs, pOutPos, &numVerts, pOutAttribs);
return numVerts;
}
// clip SIMD primitives
void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId)
{
// input/output vertex store for clipper
simdvertex vertices[7]; // maximum 7 verts generated per triangle
LONG constantInterpMask = this->state.backendState.constantInterpolationMask;
uint32_t provokingVertex = 0;
if(pa.binTopology == TOP_TRIANGLE_FAN)
{
provokingVertex = this->state.frontendState.provokingVertex.triFan;
}
///@todo: line topology for wireframe?
// assemble pos
simdvector tmpVector[NumVertsPerPrim];
pa.Assemble(VERTEX_POSITION_SLOT, tmpVector);
for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
{
vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i];
}
// assemble attribs
DWORD slot = 0;
uint32_t mapIdx = 0;
uint32_t tmpLinkage = this->state.linkageMask;
int32_t maxSlot = -1;
while (_BitScanForward(&slot, tmpLinkage))
{
tmpLinkage &= ~(1 << slot);
// Compute absolute attrib slot in vertex array
uint32_t mapSlot = this->state.linkageMap[mapIdx++];
maxSlot = std::max<int32_t>(maxSlot, mapSlot);
uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + mapSlot;
pa.Assemble(inputSlot, tmpVector);
// if constant interpolation enabled for this attribute, assign the provoking
// vertex values to all edges
if (_bittest(&constantInterpMask, slot))
{
for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
{
vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
}
}
else
{
for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
{
vertices[i].attrib[inputSlot] = tmpVector[i];
}
}
}
uint32_t numAttribs = maxSlot + 1;
simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
// set up new PA for binning clipped primitives
PFN_PROCESS_PRIMS pfnBinFunc = nullptr;
PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
if (NumVertsPerPrim == 3)
{
pfnBinFunc = BinTriangles;
clipTopology = TOP_TRIANGLE_FAN;
// so that the binner knows to bloat wide points later
if (pa.binTopology == TOP_POINT_LIST)
clipTopology = TOP_POINT_LIST;
}
else if (NumVertsPerPrim == 2)
{
pfnBinFunc = BinLines;
clipTopology = TOP_LINE_LIST;
}
else
{
SWR_ASSERT(0 && "Unexpected points in clipper.");
}
uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts;
uint32_t* pPrimitiveId = (uint32_t*)&vPrimId;
const simdscalari vOffsets = _mm256_set_epi32(
0 * sizeof(simdvertex), // unused lane
6 * sizeof(simdvertex),
5 * sizeof(simdvertex),
4 * sizeof(simdvertex),
3 * sizeof(simdvertex),
2 * sizeof(simdvertex),
1 * sizeof(simdvertex),
0 * sizeof(simdvertex));
// only need to gather 7 verts
// @todo dynamic mask based on actual # of verts generated per lane
const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);
uint32_t numClippedPrims = 0;
for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
{
uint32_t numEmittedVerts = pVertexCount[inputPrim];
if (numEmittedVerts < NumVertsPerPrim)
{
continue;
}
SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
numClippedPrims += numEmittedPrims;
// transpose clipper output so that each lane's vertices are in SIMD order
// set aside space for 2 simdvertex's worth of data, as the PA will try to
// read up to 16 verts for a triangle fan
simdvertex transposedPrims[2];
// transpose pos
uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
for (uint32_t c = 0; c < 4; ++c)
{
transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
pBase += sizeof(simdscalar);
}
// transpose attribs
pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_ATTRIB_START_SLOT]) + sizeof(float) * inputPrim;
for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
{
uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib;
for (uint32_t c = 0; c < 4; ++c)
{
transposedPrims[0].attrib[attribSlot][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
pBase += sizeof(simdscalar);
}
}
PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology);
while (clipPa.GetNextStreamOutput())
{
do
{
simdvector attrib[NumVertsPerPrim];
bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib);
if (assemble)
{
static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]));
}
} while (clipPa.NextPrim());
}
}
// update global pipeline stat
SWR_CONTEXT* pContext = this->pDC->pContext;
UPDATE_STAT(CPrimitives, numClippedPrims);
}
// execute the clipper stage
void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId)
{
// set up binner based on PA state
PFN_PROCESS_PRIMS pfnBinner;
switch (pa.binTopology)
{
case TOP_POINT_LIST:
pfnBinner = BinPoints;
break;
case TOP_LINE_LIST:
case TOP_LINE_STRIP:
case TOP_LINE_LOOP:
case TOP_LINE_LIST_ADJ:
case TOP_LISTSTRIP_ADJ:
pfnBinner = BinLines;
break;
default:
pfnBinner = BinTriangles;
break;
};
// update clipper invocations pipeline stat
SWR_CONTEXT* pContext = this->pDC->pContext;
uint32_t numInvoc = _mm_popcnt_u32(primMask);
UPDATE_STAT(CInvocations, numInvoc);
ComputeClipCodes(prim);
// cull prims with NAN coords
primMask &= ~ComputeNaNMask(prim);
// user cull distance cull
if (this->state.rastState.cullDistanceMask)
{
primMask &= ~ComputeUserClipCullMask(pa, prim);
}
// cull prims outside view frustum
simdscalar clipIntersection = ComputeClipCodeIntersection();
int validMask = primMask & _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection, _simd_setzero_ps()));
// skip clipping for points
uint32_t clipMask = 0;
if (NumVertsPerPrim != 1)
{
clipMask = primMask & ComputeClipMask();
}
if (clipMask)
{
RDTSC_START(FEGuardbandClip);
// we have to clip tris, execute the clipper, which will also
// call the binner
ClipSimd(vMask(primMask), vMask(clipMask), pa, primId);
RDTSC_STOP(FEGuardbandClip, 1, 0);
}
else if (validMask)
{
// update CPrimitives pipeline state
SWR_CONTEXT* pContext = this->pDC->pContext;
UPDATE_STAT(CPrimitives, _mm_popcnt_u32(validMask));
// forward valid prims directly to binner
pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId);
}
}
private:
inline simdscalar ComputeInterpFactor(simdscalar boundaryCoord0, simdscalar boundaryCoord1)
{
return _simd_div_ps(boundaryCoord0, _simd_sub_ps(boundaryCoord0, boundaryCoord1));
}
inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari vIndices, uint32_t component)
{
const uint32_t simdVertexStride = sizeof(simdvertex);
const uint32_t componentStride = sizeof(simdscalar);
const uint32_t attribStride = sizeof(simdvector);
const __m256i vElemOffset = _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float),
3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float));
// step to the simdvertex
simdscalari vOffsets = _simd_mullo_epi32(vIndices, _simd_set1_epi32(simdVertexStride));
// step to the attribute and component
vOffsets = _simd_add_epi32(vOffsets, _simd_set1_epi32(attribStride * attrib + componentStride * component));
// step to the lane
vOffsets = _simd_add_epi32(vOffsets, vElemOffset);
return vOffsets;
}
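// Worked example (illustrative, not part of the original source): for
// attrib 1, component 2, and a lane whose vertex index is 3, the offset
// computed above is
//   3 * sizeof(simdvertex) + 1 * sizeof(simdvector) + 2 * sizeof(simdscalar)
// plus lane * sizeof(float) from vElemOffset to select the lane's slot;
// this is exactly what the masked gathers below consume with a scale of 1.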
// gathers a single component for a given attribute for each SIMD lane
inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component)
{
simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
simdscalar vSrc = _mm256_undefined_ps();
return _simd_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1);
}
inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component, simdscalar vSrc)
{
simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
uint32_t* pOffsets = (uint32_t*)&vOffsets;
float* pSrc = (float*)&vSrc;
uint32_t mask = _simd_movemask_ps(vMask);
DWORD lane;
while (_BitScanForward(&lane, mask))
{
mask &= ~(1 << lane);
uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane];
*(float*)pBuf = pSrc[lane];
}
}
template<SWR_CLIPCODES ClippingPlane>
inline void intersect(
const simdscalar& vActiveMask, // active lanes to operate on
const simdscalari& s, // index to first edge vertex v0 in pInPts.
const simdscalari& p, // index to second edge vertex v1 in pInPts.
const simdvector& v1, // vertex 0 position
const simdvector& v2, // vertex 1 position
simdscalari& outIndex, // output index.
const float *pInVerts, // array of all the input positions.
uint32_t numInAttribs, // number of attributes per vertex.
float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4.
{
// compute interpolation factor
simdscalar t;
switch (ClippingPlane)
{
case FRUSTUM_LEFT: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[0]), _simd_add_ps(v2[3], v2[0])); break;
case FRUSTUM_RIGHT: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[0]), _simd_sub_ps(v2[3], v2[0])); break;
case FRUSTUM_TOP: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[1]), _simd_add_ps(v2[3], v2[1])); break;
case FRUSTUM_BOTTOM: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[1]), _simd_sub_ps(v2[3], v2[1])); break;
case FRUSTUM_NEAR:
// DX Znear plane is 0, GL is -w
if (this->driverType == DX)
{
t = ComputeInterpFactor(v1[2], v2[2]);
}
else
{
t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[2]), _simd_add_ps(v2[3], v2[2]));
}
break;
case FRUSTUM_FAR: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[2]), _simd_sub_ps(v2[3], v2[2])); break;
default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
};
// interpolate position and store
for (uint32_t c = 0; c < 4; ++c)
{
simdscalar vOutPos = _simd_fmadd_ps(_simd_sub_ps(v2[c], v1[c]), t, v1[c]);
ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
}
// interpolate attributes and store
for (uint32_t a = 0; a < numInAttribs; ++a)
{
uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
for (uint32_t c = 0; c < 4; ++c)
{
simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
}
}
}
template<SWR_CLIPCODES ClippingPlane>
inline simdscalar inside(const simdvector& v)
{
switch (ClippingPlane)
{
case FRUSTUM_LEFT: return _simd_cmpge_ps(v[0], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
case FRUSTUM_RIGHT: return _simd_cmple_ps(v[0], v[3]);
case FRUSTUM_TOP: return _simd_cmpge_ps(v[1], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
case FRUSTUM_BOTTOM: return _simd_cmple_ps(v[1], v[3]);
case FRUSTUM_NEAR: return _simd_cmpge_ps(v[2], this->driverType == DX ? _simd_setzero_ps() : _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
case FRUSTUM_FAR: return _simd_cmple_ps(v[2], v[3]);
default:
SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
return _simd_setzero_ps();
}
}
template<SWR_CLIPCODES ClippingPlane>
simdscalari ClipTriToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
{
simdscalari vCurIndex = _simd_setzero_si();
simdscalari vOutIndex = _simd_setzero_si();
simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
while (!_simd_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
{
simdscalari s = vCurIndex;
simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
simdscalari underFlowMask = _simd_cmpgt_epi32(vNumInPts, p);
// wrap the second edge index back to vertex 0 once it runs past the last input vertex
p = _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p), _simd_castsi_ps(underFlowMask)));
// gather position
simdvector vInPos0, vInPos1;
for (uint32_t c = 0; c < 4; ++c)
{
vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
}
// compute inside mask
simdscalar s_in = inside<ClippingPlane>(vInPos0);
simdscalar p_in = inside<ClippingPlane>(vInPos1);
// compute intersection mask (s_in != p_in)
simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
intersectMask = _simd_and_ps(intersectMask, vActiveMask);
// store s if inside
s_in = _simd_and_ps(s_in, vActiveMask);
if (!_simd_testz_ps(s_in, s_in))
{
// store position
for (uint32_t c = 0; c < 4; ++c)
{
ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
}
// store attribs
for (uint32_t a = 0; a < numInAttribs; ++a)
{
uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
for (uint32_t c = 0; c < 4; ++c)
{
simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
}
}
// increment outIndex
vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
}
// compute and store intersection
if (!_simd_testz_ps(intersectMask, intersectMask))
{
intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
// increment outIndex for active lanes
vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
}
// increment loop index and update active mask
vCurIndex = _simd_add_epi32(vCurIndex, _simd_set1_epi32(1));
vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
}
return vOutIndex;
}
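// Illustrative note (not part of the original source): the loop above is a
// "vertical" Sutherland-Hodgman pass -- each SIMD lane walks the edges of
// its own polygon, and lanes that run out of vertices simply drop out of
// vActiveMask while the longest polygon finishes.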
template<SWR_CLIPCODES ClippingPlane>
simdscalari ClipLineToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
{
simdscalari vCurIndex = _simd_setzero_si();
simdscalari vOutIndex = _simd_setzero_si();
simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
if (!_simd_testz_ps(vActiveMask, vActiveMask))
{
simdscalari s = vCurIndex;
simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
// gather position
simdvector vInPos0, vInPos1;
for (uint32_t c = 0; c < 4; ++c)
{
vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
}
// compute inside mask
simdscalar s_in = inside<ClippingPlane>(vInPos0);
simdscalar p_in = inside<ClippingPlane>(vInPos1);
// compute intersection mask (s_in != p_in)
simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
intersectMask = _simd_and_ps(intersectMask, vActiveMask);
// store s if inside
s_in = _simd_and_ps(s_in, vActiveMask);
if (!_simd_testz_ps(s_in, s_in))
{
for (uint32_t c = 0; c < 4; ++c)
{
ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
}
// interpolate attributes and store
for (uint32_t a = 0; a < numInAttribs; ++a)
{
uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
for (uint32_t c = 0; c < 4; ++c)
{
simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
}
}
// increment outIndex
vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
}
// compute and store intersection
if (!_simd_testz_ps(intersectMask, intersectMask))
{
intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
// increment outIndex for active lanes
vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
}
// store p if inside
p_in = _simd_and_ps(p_in, vActiveMask);
if (!_simd_testz_ps(p_in, p_in))
{
for (uint32_t c = 0; c < 4; ++c)
{
ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
}
// interpolate attributes and store
for (uint32_t a = 0; a < numInAttribs; ++a)
{
uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
for (uint32_t c = 0; c < 4; ++c)
{
simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
}
}
// increment outIndex
vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), p_in);
}
}
return vOutIndex;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Vertical clipper. Clips a full SIMD width of primitives at a time.
/// @param pVertices - pointer to vertices in SOA form. Clipper will read input and write results to this buffer
/// @param vPrimMask - mask of valid input primitives, including non-clipped prims
/// @param vClipMask - mask of primitives that require clipping
/// @param numAttribs - number of valid input attribs, not including position
simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs)
{
// temp storage
simdvertex tempVertices[7];
float* pTempVerts = (float*)&tempVertices[0];
// zero out num input verts for non-active lanes
simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim);
vNumInPts = _simd_blendv_epi32(_simd_setzero_si(), vNumInPts, vClipMask);
// clip prims to frustum
simdscalari vNumOutPts;
if (NumVertsPerPrim == 3)
{
vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
}
else
{
SWR_ASSERT(NumVertsPerPrim == 2);
vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
}
// restore num verts for non-clipped, active lanes
simdscalar vNonClippedMask = _simd_andnot_ps(vClipMask, vPrimMask);
vNumOutPts = _simd_blendv_epi32(vNumOutPts, _simd_set1_epi32(NumVertsPerPrim), vNonClippedMask);
return vNumOutPts;
}
const uint32_t workerId;
const DRIVER_TYPE driverType;
DRAW_CONTEXT* pDC;
const API_STATE& state;
simdscalar clipCodes[NumVertsPerPrim];
};
// pipeline stage functions
void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);

View File

@ -0,0 +1,495 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file context.h
*
* @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT
* The SWR_CONTEXT is our global context and contains the DC ring,
* thread state, etc.
*
* The DRAW_CONTEXT contains all state associated with a draw operation.
*
******************************************************************************/
#pragma once
#include <condition_variable>
#include <algorithm>
#include "core/api.h"
#include "core/utils.h"
#include "core/arena.h"
#include "core/fifo.hpp"
#include "core/knobs.h"
#include "common/simdintrin.h"
#include "core/threads.h"
// x.8 fixed point precision values
#define FIXED_POINT_SHIFT 8
#define FIXED_POINT_SCALE 256
// x.16 fixed point precision values
#define FIXED_POINT16_SHIFT 16
#define FIXED_POINT16_SCALE 65536
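// Illustrative helpers (not part of the original source) showing the x.8
// fixed-point convention above: snapping a non-negative position gives
// 1/256-pixel precision, e.g. 1.5f -> 384 == (1 << FIXED_POINT_SHIFT) + 128.
static inline int32_t ExampleToFixed8(float v) { return (int32_t)(v * FIXED_POINT_SCALE + 0.5f); }
static inline float ExampleFromFixed8(int32_t v) { return (float)v / FIXED_POINT_SCALE; }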
struct SWR_CONTEXT;
struct DRAW_CONTEXT;
struct TRI_FLAGS
{
uint32_t frontFacing : 1;
uint32_t yMajor : 1;
uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
float pointSize;
uint32_t primID;
uint32_t renderTargetArrayIndex;
};
//////////////////////////////////////////////////////////////////////////
/// SWR_TRIANGLE_DESC
/////////////////////////////////////////////////////////////////////////
struct SWR_TRIANGLE_DESC
{
float I[3];
float J[3];
float Z[3];
float OneOverW[3];
float recipDet;
float *pRecipW;
float *pAttribs;
float *pPerspAttribs;
float *pSamplePos;
float *pUserClipBuffer;
uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
TRI_FLAGS triFlags;
};
struct TRIANGLE_WORK_DESC
{
float *pTriBuffer;
float *pAttribs;
float *pUserClipBuffer;
uint32_t numAttribs;
TRI_FLAGS triFlags;
};
union CLEAR_FLAGS
{
struct
{
uint32_t mask : 3;
};
uint32_t bits;
};
struct CLEAR_DESC
{
CLEAR_FLAGS flags;
float clearRTColor[4]; // RGBA_32F
float clearDepth; // [0..1]
BYTE clearStencil;
};
struct INVALIDATE_TILES_DESC
{
uint32_t attachmentMask;
};
struct SYNC_DESC
{
PFN_CALLBACK_FUNC pfnCallbackFunc;
uint64_t userData;
uint64_t userData2;
uint64_t userData3;
};
struct QUERY_DESC
{
SWR_STATS* pStats;
};
struct STORE_TILES_DESC
{
SWR_RENDERTARGET_ATTACHMENT attachment;
SWR_TILE_STATE postStoreTileState;
};
struct COMPUTE_DESC
{
uint32_t threadGroupCountX;
uint32_t threadGroupCountY;
uint32_t threadGroupCountZ;
};
typedef void(*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc);
enum WORK_TYPE
{
SYNC,
DRAW,
CLEAR,
INVALIDATETILES,
STORETILES,
QUERYSTATS,
};
struct BE_WORK
{
WORK_TYPE type;
PFN_WORK_FUNC pfnWork;
union
{
SYNC_DESC sync;
TRIANGLE_WORK_DESC tri;
CLEAR_DESC clear;
INVALIDATE_TILES_DESC invalidateTiles;
STORE_TILES_DESC storeTiles;
QUERY_DESC queryStats;
} desc;
};
struct DRAW_WORK
{
DRAW_CONTEXT* pDC;
union
{
uint32_t numIndices; // DrawIndexed: Number of indices for draw.
uint32_t numVerts; // Draw: Number of verts to draw (non-indexed triangles, lines, etc)
};
union
{
const int32_t* pIB; // DrawIndexed: App supplied indices
uint32_t startVertex; // Draw: Starting vertex in VB to render from.
};
int32_t baseVertex;
uint32_t numInstances; // Number of instances
uint32_t startInstance; // Instance offset
uint32_t startPrimID; // starting primitiveID for this draw batch
uint32_t startVertexID; // starting VertexID for this draw batch (only needed for non-indexed draws)
SWR_FORMAT type; // index buffer type
};
typedef void(*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pDesc);
struct FE_WORK
{
WORK_TYPE type;
PFN_FE_WORK_FUNC pfnWork;
union
{
SYNC_DESC sync;
DRAW_WORK draw;
CLEAR_DESC clear;
INVALIDATE_TILES_DESC invalidateTiles;
STORE_TILES_DESC storeTiles;
QUERY_DESC queryStats;
} desc;
};
struct GUARDBAND
{
float left, right, top, bottom;
};
struct PA_STATE;
// function signature for pipeline stages that execute after primitive assembly
typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[],
uint32_t primMask, simdscalari primID);
OSALIGNLINE(struct) API_STATE
{
// Vertex Buffers
SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS];
// Index Buffer
SWR_INDEX_BUFFER_STATE indexBuffer;
// FS - Fetch Shader State
PFN_FETCH_FUNC pfnFetchFunc;
// VS - Vertex Shader State
PFN_VERTEX_FUNC pfnVertexFunc;
// GS - Geometry Shader State
PFN_GS_FUNC pfnGsFunc;
SWR_GS_STATE gsState;
// CS - Compute Shader
PFN_CS_FUNC pfnCsFunc;
uint32_t totalThreadsInGroup;
// FE - Frontend State
SWR_FRONTEND_STATE frontendState;
// SOS - Streamout Shader State
PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS];
// Streamout state
SWR_STREAMOUT_STATE soState;
mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS];
// Tessellation State
PFN_HS_FUNC pfnHsFunc;
PFN_DS_FUNC pfnDsFunc;
SWR_TS_STATE tsState;
// Specifies which VS outputs are sent to PS.
// Does not include position
uint32_t linkageMask;
uint32_t linkageCount;
uint8_t linkageMap[MAX_ATTRIBUTES];
// attrib mask, specifies the total set of attributes used
// by the frontend (vs, so, gs)
uint32_t feAttribMask;
PRIMITIVE_TOPOLOGY topology;
bool forceFront;
// RS - Rasterizer State
SWR_RASTSTATE rastState;
// floating point multisample offsets
float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2];
GUARDBAND gbState;
SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS];
SWR_VIEWPORT_MATRIX vpMatrix[KNOB_NUM_VIEWPORTS_SCISSORS];
BBOX scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
BBOX scissorInFixedPoint;
// Backend state
SWR_BACKEND_STATE backendState;
// PS - Pixel shader state
SWR_PS_STATE psState;
SWR_DEPTH_STENCIL_STATE depthStencilState;
// OM - Output Merger State
SWR_BLEND_STATE blendState;
PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS];
// Stats are incremented when this is true.
bool enableStats;
struct
{
uint32_t colorHottileEnable : 8;
uint32_t depthHottileEnable : 1;
uint32_t stencilHottileEnable : 1;
};
};
class MacroTileMgr;
class DispatchQueue;
struct RenderOutputBuffers
{
uint8_t* pColor[SWR_NUM_RENDERTARGETS];
uint8_t* pDepth;
uint8_t* pStencil;
};
// Plane equation A/B/C coeffs used to evaluate I/J barycentric coords
struct BarycentricCoeffs
{
simdscalar vIa;
simdscalar vIb;
simdscalar vIc;
simdscalar vJa;
simdscalar vJb;
simdscalar vJc;
simdscalar vZa;
simdscalar vZb;
simdscalar vZc;
simdscalar vRecipDet;
simdscalar vAOneOverW;
simdscalar vBOneOverW;
simdscalar vCOneOverW;
};
// pipeline function pointer types
typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
typedef void(*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT &, uint8_t* (&)[SWR_NUM_RENDERTARGETS], uint32_t, const SWR_BLEND_STATE*,
const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], simdscalar&, simdscalar);
typedef void(*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &);
typedef void(*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
typedef void(*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &, const uint64_t *const, const uint32_t,
const simdscalar, const simdscalar);
struct BACKEND_FUNCS
{
PFN_BACKEND_FUNC pfnBackend;
PFN_CALC_PIXEL_BARYCENTRICS pfnCalcPixelBarycentrics;
PFN_CALC_SAMPLE_BARYCENTRICS pfnCalcSampleBarycentrics;
PFN_CALC_CENTROID_BARYCENTRICS pfnCalcCentroidBarycentrics;
PFN_OUTPUT_MERGER pfnOutputMerger;
};
// Draw State
struct DRAW_STATE
{
API_STATE state;
void* pPrivateState; // The driver is required to set this up for each draw.
// pipeline function pointers, filled in by API thread when setting up the draw
BACKEND_FUNCS backendFuncs;
PFN_PROCESS_PRIMS pfnProcessPrims;
Arena* pArena; // This should only be used by API thread.
};
// Draw Context
// The api thread sets up a draw context that exists for the life of the draw.
// This draw context maintains all of the state needed for the draw operation.
struct DRAW_CONTEXT
{
SWR_CONTEXT *pContext;
uint64_t drawId;
bool isCompute; // Is this DC a compute context?
FE_WORK FeWork;
volatile OSALIGNLINE(uint32_t) FeLock;
volatile OSALIGNLINE(bool) inUse;
volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw?
// Have all worker threads moved past draw in DC ring?
volatile OSALIGNLINE(uint32_t) threadsDoneFE;
volatile OSALIGNLINE(uint32_t) threadsDoneBE;
uint64_t dependency;
MacroTileMgr* pTileMgr;
// The following fields are valid if isCompute is true.
volatile OSALIGNLINE(bool) doneCompute; // Is this dispatch done? (isCompute)
DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
DRAW_STATE* pState;
Arena* pArena;
uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS]; // Scratch space used for spill fills.
};
INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
{
SWR_ASSERT(pDC != nullptr);
SWR_ASSERT(pDC->pState != nullptr);
return pDC->pState->state;
}
INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC)
{
SWR_ASSERT(pDC != nullptr);
SWR_ASSERT(pDC->pState != nullptr);
return pDC->pState->pPrivateState;
}
class HotTileMgr;
struct SWR_CONTEXT
{
// Draw Context Ring
// Each draw needs its own state in order to support multiple draws in flight across multiple threads.
// We maintain N draw contexts configured as a ring. The size of the ring limits the maximum number
// of draws that can be in flight at any given time.
//
// Description:
// 1. State - When an application first sets state we'll request a new draw context to use.
// a. If there are no available draw contexts then we'll have to wait until one becomes free.
// b. If one is available then set pCurDrawContext to point to it and mark it in use.
// c. All state calls set state on pCurDrawContext.
// 2. Draw - Creates and submits a work item associated with the current draw context.
// a. Set pPrevDrawContext = pCurDrawContext
// b. Set pCurDrawContext to NULL.
// 3. State - When an application sets state after a draw
// a. Same as step 1.
// b. State is copied from prev draw context to current.
DRAW_CONTEXT* dcRing;
DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw.
DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from.
// Draw State Ring
// When draws are very large (lots of primitives), the API thread will break them up.
// These split draws all have identical state, so instead of storing the state directly
// in the Draw Context (DC) we store it in a Draw State (DS). This allows multiple DCs
// to reference a single entry in the DS ring.
DRAW_STATE* dsRing;
uint32_t curStateId; // Current index to the next available entry in the DS ring.
DRAW_STATE* subCtxSave; // Save area for inactive contexts.
uint32_t curSubCtxId; // Current index for active state subcontext.
uint32_t numSubContexts; // Number of available subcontexts
uint32_t NumWorkerThreads;
THREAD_POOL threadPool; // Thread pool associated with this context
std::condition_variable FifosNotEmpty;
std::mutex WaitLock;
// Draw Contexts will get a unique drawId generated from this
uint64_t nextDrawId;
// most recent draw id enqueued by the API thread
// written by api thread, read by multiple workers
OSALIGNLINE(volatile uint64_t) DrawEnqueued;
DRIVER_TYPE driverType;
uint32_t privateStateSize;
HotTileMgr *pHotTileMgr;
// tile load/store functions, passed in at create context time
PFN_LOAD_TILE pfnLoadTile;
PFN_STORE_TILE pfnStoreTile;
PFN_CLEAR_TILE pfnClearTile;
// Global Stats
SWR_STATS stats[KNOB_MAX_NUM_THREADS];
// Scratch space for workers.
uint8_t* pScratch[KNOB_MAX_NUM_THREADS];
};
void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId);
void WakeAllThreads(SWR_CONTEXT *pContext);
#define UPDATE_STAT(name, count) if (GetApiState(pDC).enableStats) { pContext->stats[workerId].name += count; }
#define SET_STAT(name, count) if (GetApiState(pDC).enableStats) { pContext->stats[workerId].name = count; }

View File

@ -0,0 +1,245 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file depthstencil.h
*
* @brief Implements depth/stencil functionality
*
******************************************************************************/
#pragma once
#include "common/os.h"
#include "format_conversion.h"
INLINE
void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simdscalar &stencilps)
{
simdscalari stencil = _simd_castps_si(stencilps);
switch (op)
{
case STENCILOP_KEEP:
break;
case STENCILOP_ZERO:
stencilps = _simd_blendv_ps(stencilps, _simd_setzero_ps(), mask);
break;
case STENCILOP_REPLACE:
stencilps = _simd_blendv_ps(stencilps, stencilRefps, mask);
break;
case STENCILOP_INCRSAT:
{
simdscalari stencilincr = _simd_adds_epu8(stencil, _simd_set1_epi32(1));
stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask);
break;
}
case STENCILOP_DECRSAT:
{
simdscalari stencildecr = _simd_subs_epu8(stencil, _simd_set1_epi32(1));
stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask);
break;
}
case STENCILOP_INCR:
{
simdscalari stencilincr = _simd_add_epi8(stencil, _simd_set1_epi32(1));
stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask);
break;
}
case STENCILOP_DECR:
{
simdscalari stencildecr = _simd_add_epi8(stencil, _simd_set1_epi32((-1) & 0xff));
stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask);
break;
}
case STENCILOP_INVERT:
{
simdscalar stencilinvert = _simd_andnot_ps(stencilps, _simd_cmpeq_ps(_simd_setzero_ps(), _simd_setzero_ps()));
stencilps = _simd_blendv_ps(stencilps, stencilinvert, mask);
break;
}
default:
break;
}
}
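// Illustrative note (not part of the original source): the byte-wise ops
// above work because each 32-bit stencil lane holds an 8-bit value. For
// STENCILOP_DECR, adding 0xFF byte-wise subtracts 1 mod 256, so 0 wraps to
// 255 as required, while STENCILOP_DECRSAT's saturating subtract clamps at 0.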
INLINE
simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState,
bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, simdscalar coverageMask, BYTE *pStencilBase,
simdscalar* pStencilMask)
{
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format");
simdscalar depthResult = _simd_set1_ps(-1.0f);
simdscalar zbuf;
// clamp Z to viewport [minZ..maxZ]
simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ);
simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ);
interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, interpZ));
if (pDSState->depthTestEnable)
{
switch (pDSState->depthTestFunc)
{
case ZFUNC_NEVER: depthResult = _simd_setzero_ps(); break;
case ZFUNC_ALWAYS: break;
default:
zbuf = _simd_load_ps((const float*)pDepthBase);
}
switch (pDSState->depthTestFunc)
{
case ZFUNC_LE: depthResult = _simd_cmple_ps(interpZ, zbuf); break;
case ZFUNC_LT: depthResult = _simd_cmplt_ps(interpZ, zbuf); break;
case ZFUNC_GT: depthResult = _simd_cmpgt_ps(interpZ, zbuf); break;
case ZFUNC_GE: depthResult = _simd_cmpge_ps(interpZ, zbuf); break;
case ZFUNC_EQ: depthResult = _simd_cmpeq_ps(interpZ, zbuf); break;
case ZFUNC_NE: depthResult = _simd_cmpneq_ps(interpZ, zbuf); break;
}
}
simdscalar stencilMask = _simd_set1_ps(-1.0f);
if (pDSState->stencilTestEnable)
{
uint8_t stencilRefValue;
uint32_t stencilTestFunc;
uint8_t stencilTestMask;
if (frontFacing || !pDSState->doubleSidedStencilTestEnable)
{
stencilRefValue = pDSState->stencilRefValue;
stencilTestFunc = pDSState->stencilTestFunc;
stencilTestMask = pDSState->stencilTestMask;
}
else
{
stencilRefValue = pDSState->backfaceStencilRefValue;
stencilTestFunc = pDSState->backfaceStencilTestFunc;
stencilTestMask = pDSState->backfaceStencilTestMask;
}
simdvector sbuf;
simdscalar stencilWithMask;
simdscalar stencilRef;
switch(stencilTestFunc)
{
case ZFUNC_NEVER: stencilMask = _simd_setzero_ps(); break;
case ZFUNC_ALWAYS: break;
default:
LoadSOA<R8_UINT>(pStencilBase, sbuf);
// apply stencil read mask
stencilWithMask = _simd_castsi_ps(_simd_and_si(_simd_castps_si(sbuf.v[0]), _simd_set1_epi32(stencilTestMask)));
// do stencil compare in float to avoid simd integer emulation in AVX1
stencilWithMask = _simd_cvtepi32_ps(_simd_castps_si(stencilWithMask));
stencilRef = _simd_set1_ps((float)(stencilRefValue & stencilTestMask));
break;
}
switch(stencilTestFunc)
{
case ZFUNC_LE: stencilMask = _simd_cmple_ps(stencilRef, stencilWithMask); break;
case ZFUNC_LT: stencilMask = _simd_cmplt_ps(stencilRef, stencilWithMask); break;
case ZFUNC_GT: stencilMask = _simd_cmpgt_ps(stencilRef, stencilWithMask); break;
case ZFUNC_GE: stencilMask = _simd_cmpge_ps(stencilRef, stencilWithMask); break;
case ZFUNC_EQ: stencilMask = _simd_cmpeq_ps(stencilRef, stencilWithMask); break;
case ZFUNC_NE: stencilMask = _simd_cmpneq_ps(stencilRef, stencilWithMask); break;
}
}
simdscalar depthWriteMask = _simd_and_ps(depthResult, stencilMask);
depthWriteMask = _simd_and_ps(depthWriteMask, coverageMask);
*pStencilMask = stencilMask;
return depthWriteMask;
}
INLINE
void DepthStencilWrite(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState,
bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask,
BYTE *pStencilBase, const simdscalar& stencilMask)
{
if (pDSState->depthWriteEnable)
{
// clamp Z to viewport [minZ..maxZ]
simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ);
simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ);
interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, interpZ));
simdscalar vMask = _simd_and_ps(depthMask, coverageMask);
_simd_maskstore_ps((float*)pDepthBase, _simd_castps_si(vMask), interpZ);
}
if (pDSState->stencilWriteEnable)
{
simdvector sbuf;
LoadSOA<R8_UINT>(pStencilBase, sbuf);
simdscalar stencilbuf = sbuf.v[0];
uint8_t stencilRefValue;
uint32_t stencilFailOp;
uint32_t stencilPassDepthPassOp;
uint32_t stencilPassDepthFailOp;
uint8_t stencilWriteMask;
if (frontFacing || !pDSState->doubleSidedStencilTestEnable)
{
stencilRefValue = pDSState->stencilRefValue;
stencilFailOp = pDSState->stencilFailOp;
stencilPassDepthPassOp = pDSState->stencilPassDepthPassOp;
stencilPassDepthFailOp = pDSState->stencilPassDepthFailOp;
stencilWriteMask = pDSState->stencilWriteMask;
}
else
{
stencilRefValue = pDSState->backfaceStencilRefValue;
stencilFailOp = pDSState->backfaceStencilFailOp;
stencilPassDepthPassOp = pDSState->backfaceStencilPassDepthPassOp;
stencilPassDepthFailOp = pDSState->backfaceStencilPassDepthFailOp;
stencilWriteMask = pDSState->backfaceStencilWriteMask;
}
simdscalar stencilps = stencilbuf;
simdscalar stencilRefps = _simd_castsi_ps(_simd_set1_epi32(stencilRefValue));
simdscalar stencilFailMask = _simd_andnot_ps(stencilMask, coverageMask);
simdscalar stencilPassDepthPassMask = _simd_and_ps(stencilMask, depthMask);
simdscalar stencilPassDepthFailMask = _simd_and_ps(stencilMask, _simd_andnot_ps(depthMask, _simd_set1_ps(-1)));
simdscalar origStencil = stencilps;
StencilOp((SWR_STENCILOP)stencilFailOp, stencilFailMask, stencilRefps, stencilps);
StencilOp((SWR_STENCILOP)stencilPassDepthFailOp, stencilPassDepthFailMask, stencilRefps, stencilps);
StencilOp((SWR_STENCILOP)stencilPassDepthPassOp, stencilPassDepthPassMask, stencilRefps, stencilps);
// apply stencil write mask
simdscalari vWriteMask = _simd_set1_epi32(stencilWriteMask);
stencilps = _simd_and_ps(stencilps, _simd_castsi_ps(vWriteMask));
stencilps = _simd_or_ps(_simd_andnot_ps(_simd_castsi_ps(vWriteMask), origStencil), stencilps);
simdvector stencilResult;
stencilResult.v[0] = _simd_blendv_ps(origStencil, stencilps, coverageMask);
StoreSOA<R8_UINT>(stencilResult, pStencilBase);
}
}

View File

@ -0,0 +1,136 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file fifo.hpp
*
* @brief Definitions for our fifos used for thread communication.
*
******************************************************************************/
#pragma once
#include "common/os.h"
#include "arena.h"
#include <vector>
#include <cassert>
template<class T>
struct QUEUE
{
OSALIGNLINE(volatile uint32_t) mLock{ 0 };
OSALIGNLINE(volatile uint32_t) mNumEntries{ 0 };
std::vector<T*> mBlocks;
T* mCurBlock{ nullptr };
uint32_t mHead{ 0 };
uint32_t mTail{ 0 };
uint32_t mCurBlockIdx{ 0 };
// block size must be kept a power of 2
static const uint32_t mBlockSizeShift = 6;
static const uint32_t mBlockSize = 1 << mBlockSizeShift;
void clear(Arena& arena)
{
mHead = 0;
mTail = 0;
mBlocks.clear();
T* pNewBlock = (T*)arena.Alloc(sizeof(T)*mBlockSize);
mBlocks.push_back(pNewBlock);
mCurBlock = pNewBlock;
mCurBlockIdx = 0;
mNumEntries = 0;
_ReadWriteBarrier();
mLock = 0;
}
uint32_t getNumQueued()
{
return mNumEntries;
}
bool tryLock()
{
if (mLock)
{
return false;
}
// try to lock the FIFO
LONG initial = InterlockedCompareExchange(&mLock, 1, 0);
return (initial == 0);
}
void unlock()
{
mLock = 0;
}
T* peek()
{
if (mNumEntries == 0)
{
return nullptr;
}
uint32_t block = mHead >> mBlockSizeShift;
return &mBlocks[block][mHead & (mBlockSize-1)];
}
void dequeue_noinc()
{
mHead ++;
mNumEntries --;
}
bool enqueue_try_nosync(Arena& arena, const T* entry)
{
memcpy(&mCurBlock[mTail], entry, sizeof(T));
mTail ++;
if (mTail == mBlockSize)
{
if (++mCurBlockIdx < mBlocks.size())
{
mCurBlock = mBlocks[mCurBlockIdx];
}
else
{
T* newBlock = (T*)arena.Alloc(sizeof(T)*mBlockSize);
SWR_ASSERT(newBlock);
mBlocks.push_back(newBlock);
mCurBlock = newBlock;
}
mTail = 0;
}
mNumEntries ++;
return true;
}
void destroy()
{
}
};
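// Illustrative usage sketch (hypothetical names): a producer enqueues work
// while a consumer drains under the spin lock. Assumes a valid Arena instance;
// WORK_ITEM and Process() are stand-ins for this example only.
//   QUEUE<WORK_ITEM> queue;
//   queue.clear(arena);
//   WORK_ITEM item = { /* ... */ };
//   queue.enqueue_try_nosync(arena, &item);   // producer side; caller synchronizes
//   if (queue.tryLock())
//   {
//       while (WORK_ITEM* pWork = queue.peek())
//       {
//           Process(*pWork);
//           queue.dequeue_noinc();
//       }
//       queue.unlock();
//   }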

View File

@ -0,0 +1,196 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file format_conversion.h
*
* @brief SOA pixel format conversion helpers (load, clamp, normalize, store).
*
******************************************************************************/
#include "format_types.h"
#include "format_traits.h"
//////////////////////////////////////////////////////////////////////////
/// @brief Loads SIMD packed pixels in SOA format and converts them to
///        SOA RGBA32_FLOAT format.
/// @param pSrc - source data in SOA form
/// @param dst - output data in SOA form
template<SWR_FORMAT SrcFormat>
INLINE void LoadSOA(const BYTE *pSrc, simdvector &dst)
{
// fast path for float32
if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32))
{
auto lambda = [&](int comp)
{
simdscalar vComp = _simd_load_ps((const float*)(pSrc + comp*sizeof(simdscalar)));
dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
};
UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
return;
}
auto lambda = [&](int comp)
{
// load SIMD components
simdscalar vComp = FormatTraits<SrcFormat>::loadSOA(comp, pSrc);
// unpack
vComp = FormatTraits<SrcFormat>::unpack(comp, vComp);
// convert
if (FormatTraits<SrcFormat>::isNormalized(comp))
{
vComp = _simd_cvtepi32_ps(_simd_castps_si(vComp));
vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<SrcFormat>::toFloat(comp)));
}
dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
pSrc += (FormatTraits<SrcFormat>::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8;
};
UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
}
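// Worked example for the normalized path above, assuming
// FormatTraits<SrcFormat>::toFloat(comp) returns 1/255 for an 8-bit UNORM
// component: a raw byte value of 128 unpacks to
//   128 * (1/255) ~= 0.50196f
// after the cvtepi32/mul sequence.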
//////////////////////////////////////////////////////////////////////////
/// @brief Clamps the given component based on the requirements of the
///        Format template arg.
/// @param vComp - SIMD vector of floats
/// @param Component - component
template<SWR_FORMAT Format>
INLINE simdscalar Clamp(simdscalar vComp, uint32_t Component)
{
if (FormatTraits<Format>::isNormalized(Component))
{
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UNORM)
{
vComp = _simd_max_ps(vComp, _simd_setzero_ps());
}
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SNORM)
{
vComp = _simd_max_ps(vComp, _simd_set1_ps(-1.0f));
}
vComp = _simd_min_ps(vComp, _simd_set1_ps(1.0f));
}
else if (FormatTraits<Format>::GetBPC(Component) < 32)
{
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UINT)
{
int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1;
int iMin = 0;
simdscalari vCompi = _simd_castps_si(vComp);
vCompi = _simd_max_epu32(vCompi, _simd_set1_epi32(iMin));
vCompi = _simd_min_epu32(vCompi, _simd_set1_epi32(iMax));
vComp = _simd_castsi_ps(vCompi);
}
else if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SINT)
{
int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1;
int iMin = -1 - iMax;
simdscalari vCompi = _simd_castps_si(vComp);
vCompi = _simd_max_epi32(vCompi, _simd_set1_epi32(iMin));
vCompi = _simd_min_epi32(vCompi, _simd_set1_epi32(iMax));
vComp = _simd_castsi_ps(vCompi);
}
}
return vComp;
}
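// Worked example of the integer clamp bounds above for an 8-bit component:
//   UINT: iMin = 0,              iMax = (1 << 8) - 1 = 255
//   SINT: iMax = (1 << 7) - 1 = 127, iMin = -1 - 127 = -128
// so out-of-range SIMD lanes are saturated into [0,255] or [-128,127].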
//////////////////////////////////////////////////////////////////////////
/// @brief Normalizes the given component based on the requirements of the
///        Format template arg.
/// @param vComp - SIMD vector of floats
/// @param Component - component
template<SWR_FORMAT Format>
INLINE simdscalar Normalize(simdscalar vComp, uint32_t Component)
{
if (FormatTraits<Format>::isNormalized(Component))
{
vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<Format>::fromFloat(Component)));
vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
}
return vComp;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Converts a simdvector of pixels from SOA RGBA32_FLOAT to the
///        destination SOA format and stores the result.
/// @param src - source data in SOA form
/// @param pDst - output data in SOA form
template<SWR_FORMAT DstFormat>
INLINE void StoreSOA(const simdvector &src, BYTE *pDst)
{
// fast path for float32
if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32))
{
for (uint32_t comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
{
simdscalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
// Gamma-correct
if (FormatTraits<DstFormat>::isSRGB)
{
if (comp < 3) // Input format is always RGBA32_FLOAT.
{
vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp);
}
}
_simd_store_ps((float*)(pDst + comp*sizeof(simdscalar)), vComp);
}
return;
}
auto lambda = [&](int comp)
{
simdscalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
// Gamma-correct
if (FormatTraits<DstFormat>::isSRGB)
{
if (comp < 3) // Input format is always RGBA32_FLOAT.
{
vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp);
}
}
// clamp
vComp = Clamp<DstFormat>(vComp, comp);
// normalize
vComp = Normalize<DstFormat>(vComp, comp);
// pack
vComp = FormatTraits<DstFormat>::pack(comp, vComp);
// store
FormatTraits<DstFormat>::storeSOA(comp, pDst, vComp);
pDst += (FormatTraits<DstFormat>::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8;
};
UnrollerL<0, FormatTraits<DstFormat>::numComps, 1>::step(lambda);
}
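// Illustrative usage sketch: convert one SIMD tile of RGBA32_FLOAT pixels in
// SOA layout to 8-bit UNORM. Buffer sizing assumes KNOB_SIMD_WIDTH lanes.
//   simdvector src;                          // filled with floats in [0,1]
//   BYTE dst[(32 * KNOB_SIMD_WIDTH) / 8];    // 4 comps * 8bpc * simd width / 8
//   StoreSOA<R8G8B8A8_UNORM>(src, dst);      // clamp, normalize, pack, store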

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,327 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file frontend.h
*
* @brief Definitions for Frontend which handles vertex processing,
* primitive assembly, clipping, binning, etc.
*
******************************************************************************/
#pragma once
#include "context.h"
INLINE
__m128i fpToFixedPoint(const __m128 vIn)
{
__m128 vFixed = _mm_mul_ps(vIn, _mm_set1_ps(FIXED_POINT_SCALE));
return _mm_cvtps_epi32(vFixed);
}
INLINE
simdscalari fpToFixedPointVertical(const simdscalar vIn)
{
simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(FIXED_POINT_SCALE));
return _simd_cvtps_epi32(vFixed);
}
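// Worked example, assuming FIXED_POINT_SCALE is 256 (16.8 fixed point, which
// matches the "<< 8" used for the macrotile fixed-point dimensions):
//   1.5f  -> round(1.5  * 256) = 384 (0x180)
//   0.25f -> round(0.25 * 256) =  64 (0x40)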
// Calculates the A and B coefficients for the 3 edges of the triangle
//
// maths for edge equations:
// standard form of a line in 2d
// Ax + By + C = 0
// A = y0 - y1
// B = x1 - x0
// C = x0y1 - x1y0
INLINE
void triangleSetupAB(const __m128 vX, const __m128 vY, __m128 & vA, __m128 & vB)
{
// vYsub = y1 y2 y0 dc
__m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1));
// vY = y0 y1 y2 dc
vA = _mm_sub_ps(vY, vYsub);
// Result:
// A[0] = y0 - y1
// A[1] = y1 - y2
// A[2] = y2 - y0
// vXsub = x1 x2 x0 dc
__m128 vXsub = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1));
// vX = x0 x1 x2 dc
vB = _mm_sub_ps(vXsub, vX);
// Result:
// B[0] = x1 - x0
// B[1] = x2 - x1
// B[2] = x0 - x2
}
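// Worked example for the triangle (x,y) = (0,0), (4,0), (0,4):
//   A = { y0-y1, y1-y2, y2-y0 } = {  0, -4,  4 }
//   B = { x1-x0, x2-x1, x0-x2 } = {  4, -4,  0 }
// Each (A[i], B[i]) is the normal of one edge; C (computed separately)
// completes the line equation A*x + B*y + C = 0.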
INLINE
void triangleSetupABVertical(const simdscalar vX[3], const simdscalar vY[3], simdscalar (&vA)[3], simdscalar (&vB)[3])
{
// generate edge equations
// A = y0 - y1
// B = x1 - x0
vA[0] = _simd_sub_ps(vY[0], vY[1]);
vA[1] = _simd_sub_ps(vY[1], vY[2]);
vA[2] = _simd_sub_ps(vY[2], vY[0]);
vB[0] = _simd_sub_ps(vX[1], vX[0]);
vB[1] = _simd_sub_ps(vX[2], vX[1]);
vB[2] = _simd_sub_ps(vX[0], vX[2]);
}
INLINE
void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i & vA, __m128i & vB)
{
// generate edge equations
// A = y0 - y1
// B = x1 - x0
// C = x0y1 - x1y0
__m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1));
vA = _mm_sub_epi32(vY, vYsub);
__m128i vXsub = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1));
vB = _mm_sub_epi32(vXsub, vX);
}
INLINE
void triangleSetupABIntVertical(const simdscalari vX[3], const simdscalari vY[3], simdscalari (&vA)[3], simdscalari (&vB)[3])
{
// A = y0 - y1
// B = x1 - x0
vA[0] = _simd_sub_epi32(vY[0], vY[1]);
vA[1] = _simd_sub_epi32(vY[1], vY[2]);
vA[2] = _simd_sub_epi32(vY[2], vY[0]);
vB[0] = _simd_sub_epi32(vX[1], vX[0]);
vB[1] = _simd_sub_epi32(vX[2], vX[1]);
vB[2] = _simd_sub_epi32(vX[0], vX[2]);
}
// Calculate the determinant of the triangle
// 2 vectors between the 3 points: P, Q
// Px = x0-x2, Py = y0-y2
// Qx = x1-x2, Qy = y1-y2
// |Px Qx|
// det = | | = PxQy - PyQx
// |Py Qy|
// simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2)
// try to reuse our A & B coef's already calculated. factor out a -1 from Py and Qx
// : B[2]*A[1] - (-(y2-y0))*(-(x2-x1))
// : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1)
// : B[2]*A[1] - A[2]*B[1]
INLINE
float calcDeterminantInt(const __m128i vA, const __m128i vB)
{
// vAShuf = [A1, A0, A2, A0]
__m128i vAShuf = _mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 2, 0, 1));
// vBShuf = [B2, B0, B1, B0]
__m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2));
// vMul = [A1*B2, B1*A2]
__m128i vMul = _mm_mul_epi32(vAShuf, vBShuf);
// shuffle upper to lower
// vMul2 = [B1*A2, B1*A2]
__m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2));
//vMul = [A1*B2 - B1*A2]
vMul = _mm_sub_epi64(vMul, vMul2);
// According to emmintrin.h _mm_store1_pd(), address must be 16-byte aligned
OSALIGN(int64_t, 16) result;
_mm_store1_pd((double*)&result, _mm_castsi128_pd(vMul));
double fResult = (double)result;
fResult = fResult * (1.0 / FIXED_POINT16_SCALE);
return (float)fResult;
}
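// Worked example, continuing the (0,0), (4,0), (0,4) triangle from above:
//   det = B[2]*A[1] - A[2]*B[1] = 0*(-4) - 4*(-4) = 16
// i.e. twice the signed area (area = 8), before the fixed-point rescale.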
INLINE
void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3], simdscalari *pvDet)
{
// refer to calcDeterminantInt comment for calculation explanation
// A1*B2
simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5
simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7
simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]);
simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]);
simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5
simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7
// B1*A2
simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]);
simdscalari vA2Hi = _simd_unpackhi_epi32(vA[2], vA[2]);
simdscalari vB1Lo = _simd_unpacklo_epi32(vB[1], vB[1]);
simdscalari vB1Hi = _simd_unpackhi_epi32(vB[1], vB[1]);
simdscalari vA2B1Lo = _simd_mul_epi32(vA2Lo, vB1Lo);
simdscalari vA2B1Hi = _simd_mul_epi32(vA2Hi, vB1Hi);
// A1*B2 - A2*B1
simdscalari detLo = _simd_sub_epi64(vA1B2Lo, vA2B1Lo);
simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi);
// shuffle 0 1 4 5 -> 0 1 2 3
simdscalari vResultLo = _mm256_permute2f128_si256(detLo, detHi, 0x20);
simdscalari vResultHi = _mm256_permute2f128_si256(detLo, detHi, 0x31);
pvDet[0] = vResultLo;
pvDet[1] = vResultHi;
}
INLINE
void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128 &vB, __m128 &vC)
{
// C = -Ax - By
vC = _mm_mul_ps(vA, vX);
__m128 vCy = _mm_mul_ps(vB, vY);
vC = _mm_mul_ps(vC, _mm_set1_ps(-1.0f));
vC = _mm_sub_ps(vC, vCy);
}
INLINE
void viewportTransform(__m128 &vX, __m128 &vY, __m128 &vZ, const SWR_VIEWPORT_MATRIX &vpMatrix)
{
vX = _mm_mul_ps(vX, _mm_set1_ps(vpMatrix.m00));
vX = _mm_add_ps(vX, _mm_set1_ps(vpMatrix.m30));
vY = _mm_mul_ps(vY, _mm_set1_ps(vpMatrix.m11));
vY = _mm_add_ps(vY, _mm_set1_ps(vpMatrix.m31));
vZ = _mm_mul_ps(vZ, _mm_set1_ps(vpMatrix.m22));
vZ = _mm_add_ps(vZ, _mm_set1_ps(vpMatrix.m32));
}
template<uint32_t NumVerts>
INLINE
void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRIX & vpMatrix)
{
simdscalar m00 = _simd_load1_ps(&vpMatrix.m00);
simdscalar m30 = _simd_load1_ps(&vpMatrix.m30);
simdscalar m11 = _simd_load1_ps(&vpMatrix.m11);
simdscalar m31 = _simd_load1_ps(&vpMatrix.m31);
simdscalar m22 = _simd_load1_ps(&vpMatrix.m22);
simdscalar m32 = _simd_load1_ps(&vpMatrix.m32);
for (uint32_t i = 0; i < NumVerts; ++i)
{
v[i].x = _simd_fmadd_ps(v[i].x, m00, m30);
v[i].y = _simd_fmadd_ps(v[i].y, m11, m31);
v[i].z = _simd_fmadd_ps(v[i].z, m22, m32);
}
}
INLINE
void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, BBOX &bbox)
{
// Need a horizontal integer min/max across the three vertex lanes here
__m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1));
__m128i vX2 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 1, 2));
__m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1));
__m128i vY2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2));
__m128i vMinX = _mm_min_epi32(vX, vX1);
vMinX = _mm_min_epi32(vMinX, vX2);
__m128i vMaxX = _mm_max_epi32(vX, vX1);
vMaxX = _mm_max_epi32(vMaxX, vX2);
__m128i vMinY = _mm_min_epi32(vY, vY1);
vMinY = _mm_min_epi32(vMinY, vY2);
__m128i vMaxY = _mm_max_epi32(vY, vY1);
vMaxY = _mm_max_epi32(vMaxY, vY2);
bbox.left = _mm_extract_epi32(vMinX, 0);
bbox.right = _mm_extract_epi32(vMaxX, 0);
bbox.top = _mm_extract_epi32(vMinY, 0);
bbox.bottom = _mm_extract_epi32(vMaxY, 0);
#if 0
Jacob: A = _mm_shuffle_ps(X, Y, 0 0 0 0)
B = _mm_shuffle_ps(Z, W, 0 0 0 0)
A = _mm_shuffle_epi32(A, 3 0 3 0)
A = _mm_shuffle_ps(A, B, 1 0 1 0)
#endif
}
INLINE
void calcBoundingBoxIntVertical(const simdscalari (&vX)[3], const simdscalari (&vY)[3], simdBBox &bbox)
{
simdscalari vMinX = vX[0];
vMinX = _simd_min_epi32(vMinX, vX[1]);
vMinX = _simd_min_epi32(vMinX, vX[2]);
simdscalari vMaxX = vX[0];
vMaxX = _simd_max_epi32(vMaxX, vX[1]);
vMaxX = _simd_max_epi32(vMaxX, vX[2]);
simdscalari vMinY = vY[0];
vMinY = _simd_min_epi32(vMinY, vY[1]);
vMinY = _simd_min_epi32(vMinY, vY[2]);
simdscalari vMaxY = vY[0];
vMaxY = _simd_max_epi32(vMaxY, vY[1]);
vMaxY = _simd_max_epi32(vMaxY, vY[2]);
bbox.left = vMinX;
bbox.right = vMaxX;
bbox.top = vMinY;
bbox.bottom = vMaxY;
}
INLINE
bool CanUseSimplePoints(DRAW_CONTEXT *pDC)
{
const API_STATE& state = GetApiState(pDC);
return (state.rastState.sampleCount == SWR_MULTISAMPLE_1X &&
state.rastState.pointSize == 1.0f &&
!state.rastState.pointParam &&
!state.rastState.pointSpriteEnable);
}
uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements);
uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts);
// Templated Draw front-end function. All combinations of template parameter values are available
template <bool IsIndexedT, bool HasTessellationT, bool HasGeometryShaderT, bool HasStreamOutT, bool HasRastT>
void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessQueryStats(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
struct PA_STATE_BASE; // forward decl
void BinTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector tri[3], uint32_t primMask, simdscalari primID);
void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);

View File

@ -0,0 +1,142 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file knobs.h
*
* @brief Static (Compile-Time) Knobs for Core.
*
******************************************************************************/
#pragma once
#include <stdint.h>
#include <gen_knobs.h>
#define KNOB_ARCH_AVX 0
#define KNOB_ARCH_AVX2 1
#define KNOB_ARCH_AVX512 2
///////////////////////////////////////////////////////////////////////////////
// Architecture validation
///////////////////////////////////////////////////////////////////////////////
#if !defined(KNOB_ARCH)
#define KNOB_ARCH KNOB_ARCH_AVX
#endif
#if (KNOB_ARCH == KNOB_ARCH_AVX)
#define KNOB_ARCH_ISA AVX
#define KNOB_ARCH_STR "AVX"
#define KNOB_SIMD_WIDTH 8
#elif (KNOB_ARCH == KNOB_ARCH_AVX2)
#define KNOB_ARCH_ISA AVX2
#define KNOB_ARCH_STR "AVX2"
#define KNOB_SIMD_WIDTH 8
#elif (KNOB_ARCH == KNOB_ARCH_AVX512)
#define KNOB_ARCH_ISA AVX512F
#define KNOB_ARCH_STR "AVX512"
#define KNOB_SIMD_WIDTH 16
#error "AVX512 not yet supported"
#else
#error "Unknown architecture"
#endif
#define MAX_KNOB_ARCH_STR_LEN sizeof("AVX512_PLUS_PADDING")
///////////////////////////////////////////////////////////////////////////////
// Configuration knobs
///////////////////////////////////////////////////////////////////////////////
#define KNOB_MAX_NUM_THREADS 256 // Supports up to dual-HSW-Xeon.
// Maximum supported number of active vertex buffer streams
#define KNOB_NUM_STREAMS 32
// Maximum supported number of attributes per vertex
#define KNOB_NUM_ATTRIBUTES 38
// Maximum supported active viewports and scissors
#define KNOB_NUM_VIEWPORTS_SCISSORS 16
// Guardband range used by the clipper
#define KNOB_GUARDBAND_WIDTH 32768.0f
#define KNOB_GUARDBAND_HEIGHT 32768.0f
///////////////////////////////
// Macro tile configuration
///////////////////////////////
// raster tile dimensions
#define KNOB_TILE_X_DIM 8
#define KNOB_TILE_X_DIM_SHIFT 3
#define KNOB_TILE_Y_DIM 8
#define KNOB_TILE_Y_DIM_SHIFT 3
// fixed macrotile pixel dimension for now, eventually will be
// dynamically set based on tile format and pixel size
#define KNOB_MACROTILE_X_DIM 64
#define KNOB_MACROTILE_Y_DIM 64
#define KNOB_MACROTILE_X_DIM_FIXED (KNOB_MACROTILE_X_DIM << 8)
#define KNOB_MACROTILE_Y_DIM_FIXED (KNOB_MACROTILE_Y_DIM << 8)
#define KNOB_MACROTILE_X_DIM_FIXED_SHIFT 14
#define KNOB_MACROTILE_Y_DIM_FIXED_SHIFT 14
#define KNOB_MACROTILE_X_DIM_IN_TILES (KNOB_MACROTILE_X_DIM >> KNOB_TILE_X_DIM_SHIFT)
#define KNOB_MACROTILE_Y_DIM_IN_TILES (KNOB_MACROTILE_Y_DIM >> KNOB_TILE_Y_DIM_SHIFT)
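// Consistency note: 64 << 8 = 16384 = 1 << 14, so the *_FIXED values above
// carry 8 fractional bits and the *_FIXED_SHIFT values (14) agree with them.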
// total # of hot tiles available. This should be enough to
// fully render a 16kx16k 128bpp render target
#define KNOB_NUM_HOT_TILES_X 256
#define KNOB_NUM_HOT_TILES_Y 256
#define KNOB_COLOR_HOT_TILE_FORMAT R32G32B32A32_FLOAT
#define KNOB_DEPTH_HOT_TILE_FORMAT R32_FLOAT
#define KNOB_STENCIL_HOT_TILE_FORMAT R8_UINT
// Max scissor rectangle
#define KNOB_MAX_SCISSOR_X (KNOB_NUM_HOT_TILES_X * KNOB_MACROTILE_X_DIM)
#define KNOB_MAX_SCISSOR_Y (KNOB_NUM_HOT_TILES_Y * KNOB_MACROTILE_Y_DIM)
#if KNOB_SIMD_WIDTH==8 && KNOB_TILE_X_DIM < 4
#error "incompatible width/tile dimensions"
#endif
#if KNOB_SIMD_WIDTH == 8
#define SIMD_TILE_X_DIM 4
#define SIMD_TILE_Y_DIM 2
#else
#error "Invalid simd width"
#endif
///////////////////////////////////////////////////////////////////////////////
// Optimization knobs
///////////////////////////////////////////////////////////////////////////////
#define KNOB_USE_FAST_SRGB TRUE
// enables cut-aware primitive assembler
#define KNOB_ENABLE_CUT_AWARE_PA TRUE
///////////////////////////////////////////////////////////////////////////////
// Debug knobs
///////////////////////////////////////////////////////////////////////////////
//#define KNOB_ENABLE_RDTSC
// Set to 1 to use the dynamic KNOB_TOSS_XXXX knobs.
#if !defined(KNOB_ENABLE_TOSS_POINTS)
#define KNOB_ENABLE_TOSS_POINTS 0
#endif

View File

@ -0,0 +1,98 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file knobs_init.h
*
* @brief Dynamic Knobs Initialization for Core.
*
******************************************************************************/
#pragma once
#include <core/knobs.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <stdio.h>
// Assume the type is compatible with a 32-bit integer
template <typename T>
static inline void ConvertEnvToKnob(const char* pOverride, T& knobValue)
{
uint32_t value = 0;
if (sscanf(pOverride, "%u", &value))
{
knobValue = static_cast<T>(value);
}
}
static inline void ConvertEnvToKnob(const char* pOverride, bool& knobValue)
{
size_t len = strlen(pOverride);
if (len == 1)
{
auto c = tolower(static_cast<unsigned char>(pOverride[0]));
if (c == 'y' || c == 't' || c == '1')
{
knobValue = true;
return;
}
if (c == 'n' || c == 'f' || c == '0')
{
knobValue = false;
return;
}
}
// Try converting to a number and casting to bool
uint32_t value = 0;
if (sscanf(pOverride, "%u", &value))
{
knobValue = value != 0;
return;
}
}
static inline void ConvertEnvToKnob(const char* pOverride, float& knobValue)
{
float value = knobValue;
if (sscanf(pOverride, "%f", &value))
{
knobValue = value;
}
}
template <typename T>
static inline void InitKnob(T& knob)
{
// TODO, read registry first
// Second, read environment variables
const char* pOverride = getenv(knob.Name());
if (pOverride)
{
auto knobValue = knob.Value();
ConvertEnvToKnob(pOverride, knobValue);
knob.Value(knobValue);
}
}
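// Illustrative usage sketch (hypothetical knob name): given a knob object
// whose Name() returns "KNOB_MAX_WORKER_THREADS" and whose Value() is uint32_t,
//   setenv("KNOB_MAX_WORKER_THREADS", "4", 1);   // POSIX; for illustration
//   InitKnob(KNOB_MAX_WORKER_THREADS);           // Value() now reads back 4
// Boolean knobs also accept y/n, t/f, or 0/1 via ConvertEnvToKnob above.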

View File

@ -0,0 +1,51 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file multisample.cpp
*
******************************************************************************/
#include "multisample.h"
const uint32_t MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosXi[2] {0xC0, 0x40};
const uint32_t MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosYi[2] {0xC0, 0x40};
const uint32_t MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosXi[4] {0x60, 0xE0, 0x20, 0xA0};
const uint32_t MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosYi[4] {0x20, 0x60, 0xA0, 0xE0};
const uint32_t MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosXi[8] {0x90, 0x70, 0xD0, 0x50, 0x30, 0x10, 0xB0, 0xF0};
const uint32_t MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosYi[8] {0x50, 0xB0, 0x90, 0x30, 0xD0, 0x70, 0xF0, 0x10};
const uint32_t MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosXi[16]
{0x90, 0x70, 0x50, 0xC0, 0x30, 0xA0, 0xD0, 0xB0, 0x60, 0x80, 0x40, 0x20, 0x00, 0xF0, 0xE0, 0x10};
const uint32_t MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosYi[16]
{0x90, 0x50, 0xA0, 0x70, 0x60, 0xD0, 0xB0, 0x30, 0xE0, 0x10, 0x20, 0xC0, 0x80, 0x40, 0xF0, 0x00};
const float MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosX{0.5f};
const float MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosY{0.5f};
const float MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosX[2]{0.75f, 0.25f};
const float MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosY[2]{0.75f, 0.25f};
const float MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosX[4]{0.375f, 0.875f, 0.125f, 0.625f};
const float MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosY[4]{0.125f, 0.375f, 0.625f, 0.875f};
const float MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosX[8]{0.5625f, 0.4375f, 0.8125f, 0.3125f, 0.1875f, 0.0625f, 0.6875f, 0.9375f};
const float MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosY[8]{0.3125f, 0.6875f, 0.5625f, 0.1875f, 0.8125f, 0.4375f, 0.9375f, 0.0625f};
const float MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosX[16]
{0.5625f, 0.4375f, 0.3125f, 0.7500f, 0.1875f, 0.6250f, 0.8125f, 0.6875f, 0.3750f, 0.5000f, 0.2500f, 0.1250f, 0.0000f, 0.9375f, 0.8750f, 0.0625f};
const float MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosY[16]
{0.5625f, 0.3125f, 0.6250f, 0.4375f, 0.3750f, 0.8125f, 0.6875f, 0.1875f, 0.8750f, 0.0625f, 0.1250f, 0.7500f, 0.5000f, 0.2500f, 0.9375f, 0.0000f};
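// Consistency note: the integer positions are 0.8 fixed point, so they map
// directly onto the float tables, e.g. for 2X: 0xC0/256 = 0.75, 0x40/256 = 0.25.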

View File

@ -0,0 +1,620 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file multisample.h
*
******************************************************************************/
#pragma once
#include "context.h"
#include "format_traits.h"
INLINE
uint32_t GetNumSamples(SWR_MULTISAMPLE_COUNT sampleCount)
{
static const uint32_t sampleCountLUT[SWR_MULTISAMPLE_TYPE_MAX] {1, 2, 4, 8, 16};
assert(sampleCount < SWR_MULTISAMPLE_TYPE_MAX);
return sampleCountLUT[sampleCount];
}
INLINE
SWR_MULTISAMPLE_COUNT GetSampleCount(uint32_t numSamples)
{
switch(numSamples)
{
case 1: return SWR_MULTISAMPLE_1X;
case 2: return SWR_MULTISAMPLE_2X;
case 4: return SWR_MULTISAMPLE_4X;
case 8: return SWR_MULTISAMPLE_8X;
case 16: return SWR_MULTISAMPLE_16X;
default: assert(0); return SWR_MULTISAMPLE_1X;
}
}
// hardcoded offsets based on Direct3d standard multisample positions
// 16 x 16 per-pixel grid ranging from (0, 0) to (15, 15), with (0, 0) = UL pixel corner
// coords are 0.8 fixed point offsets from (0, 0)
template<SWR_MULTISAMPLE_COUNT sampleCount>
struct MultisampleTraits
{
INLINE static __m128i vXi(uint32_t sampleNum) = delete;
INLINE static __m128i vYi(uint32_t sampleNum) = delete;
INLINE static simdscalar vX(uint32_t sampleNum) = delete;
INLINE static simdscalar vY(uint32_t sampleNum) = delete;
INLINE static float X(uint32_t sampleNum) = delete;
INLINE static float Y(uint32_t sampleNum) = delete;
INLINE static __m128i TileSampleOffsetsX() = delete;
INLINE static __m128i TileSampleOffsetsY() = delete;
INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) = delete;
INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) = delete;
INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) = delete;
INLINE static simdscalari FullSampleMask() = delete;
static const uint32_t numSamples = 0;
};
template<>
struct MultisampleTraits<SWR_MULTISAMPLE_1X>
{
INLINE static __m128i vXi(uint32_t sampleNum)
{
static const __m128i X = _mm_set1_epi32(samplePosXi);
return X;
}
INLINE static __m128i vYi(uint32_t sampleNum)
{
static const __m128i Y = _mm_set1_epi32(samplePosYi);
return Y;
}
INLINE static simdscalar vX(uint32_t sampleNum)
{
static const simdscalar X = _simd_set1_ps(0.5f);
return X;
}
INLINE static simdscalar vY(uint32_t sampleNum)
{
static const simdscalar Y = _simd_set1_ps(0.5f);
return Y;
}
INLINE static float X(uint32_t sampleNum) {return samplePosX;};
INLINE static float Y(uint32_t sampleNum) {return samplePosY;};
INLINE static __m128i TileSampleOffsetsX()
{
static const uint32_t bboxLeftEdge = 0x80;
static const uint32_t bboxRightEdge = 0x80;
// BR, BL, UR, UL
static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge);
return tileSampleOffsetX;
}
INLINE static __m128i TileSampleOffsetsY()
{
static const uint32_t bboxTopEdge = 0x80;
static const uint32_t bboxBottomEdge = 0x80;
// BR, BL, UR, UL
static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge);
return tileSampleOffsetY;
}
INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
{
return 0;
}
INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
{
return 0;
}
INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
{
return 0;
}
INLINE static simdscalari FullSampleMask(){return _simd_set1_epi32(0x1);};
static const uint32_t samplePosXi {0x80};
static const uint32_t samplePosYi {0x80};
static const float samplePosX;
static const float samplePosY;
static const uint32_t numSamples = 1;
};
template<>
struct MultisampleTraits<SWR_MULTISAMPLE_2X>
{
INLINE static __m128i vXi(uint32_t sampleNum)
{
SWR_ASSERT(sampleNum < numSamples);
static const __m128i X[numSamples] {_mm_set1_epi32(samplePosXi[0]), _mm_set1_epi32(samplePosXi[1])};
return X[sampleNum];
}
INLINE static __m128i vYi(uint32_t sampleNum)
{
SWR_ASSERT(sampleNum < numSamples);
static const __m128i Y[numSamples] {_mm_set1_epi32(samplePosYi[0]), _mm_set1_epi32(samplePosYi[1])};
return Y[sampleNum];
}
INLINE static simdscalar vX(uint32_t sampleNum)
{
static const simdscalar X[numSamples] {_simd_set1_ps(0.75f), _simd_set1_ps(0.25f)};
assert(sampleNum < numSamples);
return X[sampleNum];
}
INLINE static simdscalar vY(uint32_t sampleNum)
{
static const simdscalar Y[numSamples] {_simd_set1_ps(0.75f), _simd_set1_ps(0.25f)};
assert(sampleNum < numSamples);
return Y[sampleNum];
}
INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; };
INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; };
INLINE static __m128i TileSampleOffsetsX()
{
static const uint32_t bboxLeftEdge = 0x40;
static const uint32_t bboxRightEdge = 0xC0;
// BR, BL, UR, UL
static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge);
return tileSampleOffsetX;
}
INLINE static __m128i TileSampleOffsetsY()
{
static const uint32_t bboxTopEdge = 0x40;
static const uint32_t bboxBottomEdge = 0xC0;
// BR, BL, UR, UL
static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge);
return tileSampleOffsetY;
}
INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
{
static const uint32_t RasterTileColorOffsets[numSamples]
{ 0,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8)
};
assert(sampleNum < numSamples);
return RasterTileColorOffsets[sampleNum];
}
INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
{
static const uint32_t RasterTileDepthOffsets[numSamples]
{ 0,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8)
};
assert(sampleNum < numSamples);
return RasterTileDepthOffsets[sampleNum];
}
INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
{
static const uint32_t RasterTileStencilOffsets[numSamples]
{ 0,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8)
};
assert(sampleNum < numSamples);
return RasterTileStencilOffsets[sampleNum];
}
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask =_simd_set1_epi32(0x3);
return mask;
}
static const uint32_t samplePosXi[2];
static const uint32_t samplePosYi[2];
static const float samplePosX[2];
static const float samplePosY[2];
static const uint32_t numSamples = 2;
};
template<>
struct MultisampleTraits<SWR_MULTISAMPLE_4X>
{
INLINE static __m128i vXi(uint32_t sampleNum)
{
static const __m128i X[numSamples]
{_mm_set1_epi32(samplePosXi[0]), _mm_set1_epi32(samplePosXi[1]), _mm_set1_epi32(samplePosXi[2]), _mm_set1_epi32(samplePosXi[3])};
SWR_ASSERT(sampleNum < numSamples);
return X[sampleNum];
}
INLINE static __m128i vYi(uint32_t sampleNum)
{
static const __m128i Y[numSamples]
{_mm_set1_epi32(samplePosYi[0]), _mm_set1_epi32(samplePosYi[1]), _mm_set1_epi32(samplePosYi[2]), _mm_set1_epi32(samplePosYi[3])};
SWR_ASSERT(sampleNum < numSamples);
return Y[sampleNum];
}
INLINE static simdscalar vX(uint32_t sampleNum)
{
static const simdscalar X[numSamples]
{_simd_set1_ps(0.375f), _simd_set1_ps(0.875f), _simd_set1_ps(0.125f), _simd_set1_ps(0.625f)};
assert(sampleNum < numSamples);
return X[sampleNum];
}
INLINE static simdscalar vY(uint32_t sampleNum)
{
static const simdscalar Y[numSamples]
{_simd_set1_ps(0.125f), _simd_set1_ps(0.375f), _simd_set1_ps(0.625f), _simd_set1_ps(0.875f)};
assert(sampleNum < numSamples);
return Y[sampleNum];
}
INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; };
INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; };
INLINE static __m128i TileSampleOffsetsX()
{
static const uint32_t bboxLeftEdge = 0x20;
static const uint32_t bboxRightEdge = 0xE0;
// BR, BL, UR, UL
static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge);
return tileSampleOffsetX;
}
INLINE static __m128i TileSampleOffsetsY()
{
static const uint32_t bboxTopEdge = 0x20;
static const uint32_t bboxBottomEdge = 0xE0;
// BR, BL, UR, UL
static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge);
return tileSampleOffsetY;
}
INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
{
static const uint32_t RasterTileColorOffsets[numSamples]
{ 0,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
};
assert(sampleNum < numSamples);
return RasterTileColorOffsets[sampleNum];
}
INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
{
static const uint32_t RasterTileDepthOffsets[numSamples]
{ 0,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
};
assert(sampleNum < numSamples);
return RasterTileDepthOffsets[sampleNum];
}
INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
{
static const uint32_t RasterTileStencilOffsets[numSamples]
{ 0,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
};
assert(sampleNum < numSamples);
return RasterTileStencilOffsets[sampleNum];
}
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0xF);
return mask;
}
static const uint32_t samplePosXi[4];
static const uint32_t samplePosYi[4];
static const float samplePosX[4];
static const float samplePosY[4];
static const uint32_t numSamples = 4;
};
template<>
struct MultisampleTraits<SWR_MULTISAMPLE_8X>
{
INLINE static __m128i vXi(uint32_t sampleNum)
{
static const __m128i X[numSamples]
{_mm_set1_epi32(samplePosXi[0]), _mm_set1_epi32(samplePosXi[1]), _mm_set1_epi32(samplePosXi[2]), _mm_set1_epi32(samplePosXi[3]),
_mm_set1_epi32(samplePosXi[4]), _mm_set1_epi32(samplePosXi[5]), _mm_set1_epi32(samplePosXi[6]), _mm_set1_epi32(samplePosXi[7])};
SWR_ASSERT(sampleNum < numSamples);
return X[sampleNum];
}
INLINE static __m128i vYi(uint32_t sampleNum)
{
static const __m128i Y[numSamples]
{_mm_set1_epi32(samplePosYi[0]), _mm_set1_epi32(samplePosYi[1]), _mm_set1_epi32(samplePosYi[2]), _mm_set1_epi32(samplePosYi[3]),
_mm_set1_epi32(samplePosYi[4]), _mm_set1_epi32(samplePosYi[5]), _mm_set1_epi32(samplePosYi[6]), _mm_set1_epi32(samplePosYi[7])};
SWR_ASSERT(sampleNum < numSamples);
return Y[sampleNum];
}
INLINE static simdscalar vX(uint32_t sampleNum)
{
static const simdscalar X[numSamples]
{_simd_set1_ps(0.5625f), _simd_set1_ps(0.4375f), _simd_set1_ps(0.8125f), _simd_set1_ps(0.3125f),
_simd_set1_ps(0.1875f), _simd_set1_ps(0.0625f), _simd_set1_ps(0.6875f), _simd_set1_ps(0.9375f)};
assert(sampleNum < numSamples);
return X[sampleNum];
}
INLINE static simdscalar vY(uint32_t sampleNum)
{
static const simdscalar Y[numSamples]
{_simd_set1_ps(0.3125f), _simd_set1_ps(0.6875f), _simd_set1_ps(0.5625f), _simd_set1_ps(0.1875f),
_simd_set1_ps(0.8125f), _simd_set1_ps(0.4375f), _simd_set1_ps(0.9375f), _simd_set1_ps(0.0625f)};
assert(sampleNum < numSamples);
return Y[sampleNum];
}
INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; };
INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; };
INLINE static __m128i TileSampleOffsetsX()
{
static const uint32_t bboxLeftEdge = 0x10;
static const uint32_t bboxRightEdge = 0xF0;
// BR, BL, UR, UL
static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge);
return tileSampleOffsetX;
}
INLINE static __m128i TileSampleOffsetsY()
{
static const uint32_t bboxTopEdge = 0x10;
static const uint32_t bboxBottomEdge = 0xF0;
// BR, BL, UR, UL
static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge);
return tileSampleOffsetY;
}
INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
{
static const uint32_t RasterTileColorOffsets[numSamples]
{ 0,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
};
assert(sampleNum < numSamples);
return RasterTileColorOffsets[sampleNum];
}
INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
{
static const uint32_t RasterTileDepthOffsets[numSamples]
{ 0,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
};
assert(sampleNum < numSamples);
return RasterTileDepthOffsets[sampleNum];
}
INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
{
static const uint32_t RasterTileStencilOffsets[numSamples]
{ 0,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
};
assert(sampleNum < numSamples);
return RasterTileStencilOffsets[sampleNum];
}
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0xFF);
return mask;
}
static const uint32_t samplePosXi[8];
static const uint32_t samplePosYi[8];
static const float samplePosX[8];
static const float samplePosY[8];
static const uint32_t numSamples = 8;
};
template<>
struct MultisampleTraits<SWR_MULTISAMPLE_16X>
{
INLINE static __m128i vXi(uint32_t sampleNum)
{
static const __m128i X[numSamples]
{_mm_set1_epi32(samplePosXi[0]), _mm_set1_epi32(samplePosXi[1]), _mm_set1_epi32(samplePosXi[2]), _mm_set1_epi32(samplePosXi[3]),
_mm_set1_epi32(samplePosXi[4]), _mm_set1_epi32(samplePosXi[5]), _mm_set1_epi32(samplePosXi[6]), _mm_set1_epi32(samplePosXi[7]),
_mm_set1_epi32(samplePosXi[8]), _mm_set1_epi32(samplePosXi[9]), _mm_set1_epi32(samplePosXi[10]), _mm_set1_epi32(samplePosXi[11]),
_mm_set1_epi32(samplePosXi[12]), _mm_set1_epi32(samplePosXi[13]), _mm_set1_epi32(samplePosXi[14]), _mm_set1_epi32(samplePosXi[15])};
SWR_ASSERT(sampleNum < numSamples);
return X[sampleNum];
}
INLINE static __m128i vYi(uint32_t sampleNum)
{
static const __m128i Y[numSamples]
{_mm_set1_epi32(samplePosYi[0]), _mm_set1_epi32(samplePosYi[1]), _mm_set1_epi32(samplePosYi[2]), _mm_set1_epi32(samplePosYi[3]),
_mm_set1_epi32(samplePosYi[4]), _mm_set1_epi32(samplePosYi[5]), _mm_set1_epi32(samplePosYi[6]), _mm_set1_epi32(samplePosYi[7]),
_mm_set1_epi32(samplePosYi[8]), _mm_set1_epi32(samplePosYi[9]), _mm_set1_epi32(samplePosYi[10]), _mm_set1_epi32(samplePosYi[11]),
_mm_set1_epi32(samplePosYi[12]), _mm_set1_epi32(samplePosYi[13]), _mm_set1_epi32(samplePosYi[14]), _mm_set1_epi32(samplePosYi[15])};
SWR_ASSERT(sampleNum < numSamples);
return Y[sampleNum];
}
INLINE static simdscalar vX(uint32_t sampleNum)
{
static const simdscalar X[numSamples]
{_simd_set1_ps(0.5625f), _simd_set1_ps(0.4375f), _simd_set1_ps(0.3125f), _simd_set1_ps(0.7500f),
_simd_set1_ps(0.1875f), _simd_set1_ps(0.6250f), _simd_set1_ps(0.8125f), _simd_set1_ps(0.6875f),
_simd_set1_ps(0.3750f), _simd_set1_ps(0.5000f), _simd_set1_ps(0.2500f), _simd_set1_ps(0.1250f),
_simd_set1_ps(0.0000f), _simd_set1_ps(0.9375f), _simd_set1_ps(0.8750f), _simd_set1_ps(0.0625f)};
assert(sampleNum < numSamples);
return X[sampleNum];
}
INLINE static simdscalar vY(uint32_t sampleNum)
{
static const simdscalar Y[numSamples]
{_simd_set1_ps(0.5625f), _simd_set1_ps(0.3125f), _simd_set1_ps(0.6250f), _simd_set1_ps(0.4375f),
_simd_set1_ps(0.3750f), _simd_set1_ps(0.8125f), _simd_set1_ps(0.6875f), _simd_set1_ps(0.1875f),
_simd_set1_ps(0.8750f), _simd_set1_ps(0.0625f), _simd_set1_ps(0.1250f), _simd_set1_ps(0.7500f),
_simd_set1_ps(0.5000f), _simd_set1_ps(0.2500f), _simd_set1_ps(0.9375f), _simd_set1_ps(0.0000f)};
assert(sampleNum < numSamples);
return Y[sampleNum];
}
INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; };
INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; };
INLINE static __m128i TileSampleOffsetsX()
{
static const uint32_t bboxLeftEdge = 0x00;
static const uint32_t bboxRightEdge = 0xF0;
// BR, BL, UR, UL
static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge);
return tileSampleOffsetX;
}
INLINE static __m128i TileSampleOffsetsY()
{
static const uint32_t bboxTopEdge = 0x00;
static const uint32_t bboxBottomEdge = 0xF0;
// BR, BL, UR, UL
static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge);
return tileSampleOffsetY;
}
INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
{
static const uint32_t RasterTileColorOffsets[numSamples]
{ 0,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
};
assert(sampleNum < numSamples);
return RasterTileColorOffsets[sampleNum];
}
INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
{
static const uint32_t RasterTileDepthOffsets[numSamples]
{ 0,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
};
assert(sampleNum < numSamples);
return RasterTileDepthOffsets[sampleNum];
}
INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
{
static const uint32_t RasterTileStencilOffsets[numSamples]
{ 0,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
};
assert(sampleNum < numSamples);
return RasterTileStencilOffsets[sampleNum];
}
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0xFFFF);
return mask;
}
static const uint32_t samplePosXi[16];
static const uint32_t samplePosYi[16];
static const float samplePosX[16];
static const float samplePosY[16];
static const uint32_t numSamples = 16;
};
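// Illustrative usage sketch (local names only): iterate the 4X sample set and
// fetch per-sample hot-tile offsets.
//   typedef MultisampleTraits<SWR_MULTISAMPLE_4X> MT;
//   for (uint32_t s = 0; s < MT::numSamples; ++s)
//   {
//       simdscalar vx = MT::vX(s);                            // splatted X in [0,1)
//       simdscalar vy = MT::vY(s);
//       uint32_t colorOffset = MT::RasterTileColorOffset(s);  // bytes into hot tile
//   }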

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,35 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file rasterizer.h
*
* @brief Definitions for the rasterizer.
*
******************************************************************************/
#pragma once
#include "context.h"
extern PFN_WORK_FUNC gRasterizerTable[2][SWR_MULTISAMPLE_TYPE_MAX];
void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);

View File

@ -0,0 +1,91 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#include "rdtsc_core.h"
#include "common/rdtsc_buckets.h"
// must match CORE_BUCKETS enum order
BUCKET_DESC gCoreBuckets[] = {
{ "APIClearRenderTarget", "", true, 0xff0b8bea },
{ "APIDraw", "", true, 0xff000066 },
{ "APIDrawWakeAllThreads", "", false, 0xffffffff },
{ "APIDrawIndexed", "", true, 0xff000066 },
{ "APIDispatch", "", true, 0xff660000 },
{ "APIStoreTiles", "", true, 0xff00ffff },
{ "APIGetDrawContext", "", false, 0xffffffff },
{ "APISync", "", true, 0xff6666ff },
{ "APIWaitForIdle", "", true, 0xff0000ff },
{ "FEProcessDraw", "", true, 0xff009900 },
{ "FEProcessDrawIndexed", "", true, 0xff009900 },
{ "FEFetchShader", "", false, 0xffffffff },
{ "FEVertexShader", "", false, 0xffffffff },
{ "FEHullShader", "", false, 0xffffffff },
{ "FETessellation", "", false, 0xffffffff },
{ "FEDomainShader", "", false, 0xffffffff },
{ "FEGeometryShader", "", false, 0xffffffff },
{ "FEStreamout", "", false, 0xffffffff },
{ "FEPAAssemble", "", false, 0xffffffff },
{ "FEBinPoints", "", false, 0xff29b854 },
{ "FEBinLines", "", false, 0xff29b854 },
{ "FEBinTriangles", "", false, 0xff29b854 },
{ "FETriangleSetup", "", false, 0xffffffff },
{ "FEViewportCull", "", false, 0xffffffff },
{ "FEGuardbandClip", "", false, 0xffffffff },
{ "FEClipPoints", "", false, 0xffffffff },
{ "FEClipLines", "", false, 0xffffffff },
{ "FEClipTriangles", "", false, 0xffffffff },
{ "FECullZeroAreaAndBackface", "", false, 0xffffffff },
{ "FECullBetweenCenters", "", false, 0xffffffff },
{ "FEProcessStoreTiles", "", true, 0xff39c864 },
{ "FEProcessInvalidateTiles", "", true, 0xffffffff },
{ "WorkerWorkOnFifoBE", "", false, 0xff40261c },
{ "WorkerFoundWork", "", false, 0xff573326 },
{ "BELoadTiles", "", true, 0xffb0e2ff },
{ "BEDispatch", "", true, 0xff00a2ff },
{ "BEClear", "", true, 0xff00ccbb },
{ "BERasterizeLine", "", true, 0xffb26a4e },
{ "BERasterizeTriangle", "", true, 0xffb26a4e },
{ "BETriangleSetup", "", false, 0xffffffff },
{ "BEStepSetup", "", false, 0xffffffff },
{ "BECullZeroArea", "", false, 0xffffffff },
{ "BEEmptyTriangle", "", false, 0xffffffff },
{ "BETrivialAccept", "", false, 0xffffffff },
{ "BETrivialReject", "", false, 0xffffffff },
{ "BERasterizePartial", "", false, 0xffffffff },
{ "BEPixelBackend", "", false, 0xffffffff },
{ "BESetup", "", false, 0xffffffff },
{ "BEBarycentric", "", false, 0xffffffff },
{ "BEEarlyDepthTest", "", false, 0xffffffff },
{ "BEPixelShader", "", false, 0xffffffff },
{ "BELateDepthTest", "", false, 0xffffffff },
{ "BEOutputMerger", "", false, 0xffffffff },
{ "BEStoreTiles", "", true, 0xff00cccc },
{ "BEEndTile", "", false, 0xffffffff },
{ "WorkerWaitForThreadEvent", "", false, 0xffffffff },
};
/// @todo bucketmanager and mapping should probably be a part of the SWR context
std::vector<uint32_t> gBucketMap;
BucketManager gBucketMgr(KNOB_BUCKETS_ENABLE_THREADVIZ);
uint32_t gCurrentFrame = 0;

View File

@ -0,0 +1,177 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once
#include "knobs.h"
#include "common/os.h"
#include "common/rdtsc_buckets.h"
#include <vector>
enum CORE_BUCKETS
{
APIClearRenderTarget,
APIDraw,
APIDrawWakeAllThreads,
APIDrawIndexed,
APIDispatch,
APIStoreTiles,
APIGetDrawContext,
APISync,
APIWaitForIdle,
FEProcessDraw,
FEProcessDrawIndexed,
FEFetchShader,
FEVertexShader,
FEHullShader,
FETessellation,
FEDomainShader,
FEGeometryShader,
FEStreamout,
FEPAAssemble,
FEBinPoints,
FEBinLines,
FEBinTriangles,
FETriangleSetup,
FEViewportCull,
FEGuardbandClip,
FEClipPoints,
FEClipLines,
FEClipTriangles,
FECullZeroAreaAndBackface,
FECullBetweenCenters,
FEProcessStoreTiles,
FEProcessInvalidateTiles,
WorkerWorkOnFifoBE,
WorkerFoundWork,
BELoadTiles,
BEDispatch,
BEClear,
BERasterizeLine,
BERasterizeTriangle,
BETriangleSetup,
BEStepSetup,
BECullZeroArea,
BEEmptyTriangle,
BETrivialAccept,
BETrivialReject,
BERasterizePartial,
BEPixelBackend,
BESetup,
BEBarycentric,
BEEarlyDepthTest,
BEPixelShader,
BELateDepthTest,
BEOutputMerger,
BEStoreTiles,
BEEndTile,
WorkerWaitForThreadEvent,
NumBuckets
};
void rdtscReset();
void rdtscInit(int threadId);
void rdtscStart(uint32_t bucketId);
void rdtscStop(uint32_t bucketId, uint32_t count, uint64_t drawId);
void rdtscEvent(uint32_t bucketId, uint32_t count1, uint32_t count2);
void rdtscEndFrame();
#ifdef KNOB_ENABLE_RDTSC
#define RDTSC_RESET() rdtscReset()
#define RDTSC_INIT(threadId) rdtscInit(threadId)
#define RDTSC_START(bucket) rdtscStart(bucket)
#define RDTSC_STOP(bucket, count, draw) rdtscStop(bucket, count, draw)
#define RDTSC_EVENT(bucket, count1, count2) rdtscEvent(bucket, count1, count2)
#define RDTSC_ENDFRAME() rdtscEndFrame()
#else
#define RDTSC_RESET()
#define RDTSC_INIT(threadId)
#define RDTSC_START(bucket)
#define RDTSC_STOP(bucket, count, draw)
#define RDTSC_EVENT(bucket, count1, count2)
#define RDTSC_ENDFRAME()
#endif
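When KNOB_ENABLE_RDTSC is undefined, the macros above expand to nothing and the instrumentation compiles out entirely. A minimal usage sketch, with an arbitrary bucket and illustrative counts:

// Bracket a region to attribute its cycles to the BESetup bucket.
RDTSC_START(BESetup);
// ... work being measured ...
RDTSC_STOP(BESetup, 1 /* count */, 0 /* drawId */);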
extern std::vector<uint32_t> gBucketMap;
extern BucketManager gBucketMgr;
extern BUCKET_DESC gCoreBuckets[];
extern uint32_t gCurrentFrame;
INLINE void rdtscReset()
{
gCurrentFrame = 0;
gBucketMgr.ClearThreads();
gBucketMgr.ClearBuckets();
}
INLINE void rdtscInit(int threadId)
{
// register all the buckets once
if (threadId == 0)
{
gBucketMap.resize(NumBuckets);
for (uint32_t i = 0; i < NumBuckets; ++i)
{
gBucketMap[i] = gBucketMgr.RegisterBucket(gCoreBuckets[i]);
}
}
std::string name = threadId == 0 ? "API" : "WORKER";
gBucketMgr.RegisterThread(name);
}
INLINE void rdtscStart(uint32_t bucketId)
{
uint32_t id = gBucketMap[bucketId];
gBucketMgr.StartBucket(id);
}
INLINE void rdtscStop(uint32_t bucketId, uint32_t count, uint64_t drawId)
{
uint32_t id = gBucketMap[bucketId];
gBucketMgr.StopBucket(id);
}
INLINE void rdtscEvent(uint32_t bucketId, uint32_t count1, uint32_t count2)
{
uint32_t id = gBucketMap[bucketId];
gBucketMgr.AddEvent(id, count1);
}
INLINE void rdtscEndFrame()
{
gCurrentFrame++;
if (gCurrentFrame == KNOB_BUCKETS_START_FRAME)
{
gBucketMgr.StartCapture();
}
if (gCurrentFrame == KNOB_BUCKETS_END_FRAME)
{
gBucketMgr.StopCapture();
gBucketMgr.PrintReport("rdtsc.txt");
}
}

File diff suppressed because it is too large

View File

@ -0,0 +1,88 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file tessellator.h
*
* @brief Tessellator fixed function unit interface definition
*
******************************************************************************/
#pragma once
/// Allocate and initialize a new tessellation context
HANDLE SWR_API TSInitCtx(
SWR_TS_DOMAIN tsDomain, ///< [IN] Tessellation domain (isoline, quad, triangle)
SWR_TS_PARTITIONING tsPartitioning, ///< [IN] Tessellation partitioning algorithm
SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, ///< [IN] Tessellation output topology
void* pContextMem, ///< [IN] Memory to use for the context
size_t& memSize); ///< [INOUT] In: Amount of memory in pContextMem. Out: Mem required
/// Destroy & de-allocate tessellation context
void SWR_API TSDestroyCtx(
HANDLE tsCtx); ///< [IN] Tessellation context to be destroyed
struct SWR_TS_TESSELLATED_DATA
{
uint32_t NumPrimitives;
uint32_t NumDomainPoints;
uint32_t* ppIndices[3];
float* pDomainPointsU;
float* pDomainPointsV;
// For Tri: pDomainPointsW[i] = 1.0f - pDomainPointsU[i] - pDomainPointsV[i]
};
/// Perform Tessellation
void SWR_API TSTessellate(
HANDLE tsCtx, ///< [IN] Tessellation Context
const SWR_TESSELLATION_FACTORS& tsTessFactors, ///< [IN] Tessellation Factors
SWR_TS_TESSELLATED_DATA& tsTessellatedData); ///< [OUT] Tessellated Data
/// @TODO - Implement OSS tessellator
INLINE HANDLE SWR_API TSInitCtx(
SWR_TS_DOMAIN tsDomain,
SWR_TS_PARTITIONING tsPartitioning,
SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology,
void* pContextMem,
size_t& memSize)
{
SWR_ASSERT(0, "%s: Not Implemented", __FUNCTION__);
return NULL;
}
INLINE void SWR_API TSDestroyCtx(HANDLE tsCtx)
{
SWR_ASSERT(0, "%s: Not Implemented", __FUNCTION__);
}
INLINE void SWR_API TSTessellate(
HANDLE tsCtx,
const SWR_TESSELLATION_FACTORS& tsTessFactors,
SWR_TS_TESSELLATED_DATA& tsTessellatedData)
{
SWR_ASSERT(0, "%s: Not Implemented", __FUNCTION__);
}
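Taken together, the declarations above imply an init/tessellate/destroy flow. A hedged usage sketch: the query-then-allocate pattern is inferred from the INOUT memSize parameter and is an assumption, and tsDomain, tsPartitioning, tsOutputTopology, and tsTessFactors stand for whatever the caller's pipeline state supplies:

size_t memSize = 0;
TSInitCtx(tsDomain, tsPartitioning, tsOutputTopology, nullptr, memSize); // assumed query: memSize receives required bytes
void* pMem = _aligned_malloc(memSize, 64);
HANDLE tsCtx = TSInitCtx(tsDomain, tsPartitioning, tsOutputTopology, pMem, memSize);
SWR_TS_TESSELLATED_DATA tsData = {};
TSTessellate(tsCtx, tsTessFactors, tsData); // fills domain points and connectivity
// ... assemble primitives from tsData.ppIndices / pDomainPointsU / pDomainPointsV ...
TSDestroyCtx(tsCtx);
_aligned_free(pMem);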

View File

@ -0,0 +1,962 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#include <stdio.h>
#include <thread>
#include <algorithm>
#include <unordered_set>
#include <float.h>
#include <vector>
#include <utility>
#include <fstream>
#include <string>
#if defined(__linux__) || defined(__gnu_linux__)
#include <pthread.h>
#include <sched.h>
#include <unistd.h>
#endif
#include "common/os.h"
#include "context.h"
#include "frontend.h"
#include "backend.h"
#include "rasterizer.h"
#include "rdtsc_core.h"
#include "tilemgr.h"
#include "core/multisample.h"
// ThreadId
struct Core
{
uint32_t procGroup = 0;
std::vector<uint32_t> threadIds;
};
struct NumaNode
{
std::vector<Core> cores;
};
typedef std::vector<NumaNode> CPUNumaNodes;
void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup)
{
out_nodes.clear();
out_numThreadsPerProcGroup = 0;
#if defined(_WIN32)
SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS];
DWORD bufSize = sizeof(buffer);
BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufSize);
SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");
uint32_t count = bufSize / buffer->Size;
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = buffer;
for (uint32_t i = 0; i < count; ++i)
{
SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
{
auto& gmask = pBuffer->Processor.GroupMask[g];
uint32_t threadId = 0;
uint32_t procGroup = gmask.Group;
Core* pCore = nullptr;
uint32_t numThreads = (uint32_t)_mm_popcount_sizeT(gmask.Mask);
while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
{
// clear mask
gmask.Mask &= ~(KAFFINITY(1) << threadId);
// Find Numa Node
PROCESSOR_NUMBER procNum = {};
procNum.Group = WORD(procGroup);
procNum.Number = UCHAR(threadId);
uint32_t numaId = 0;
ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
SWR_ASSERT(ret);
// Store data
if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
auto& numaNode = out_nodes[numaId];
uint32_t coreId = 0;
if (nullptr == pCore)
{
numaNode.cores.push_back(Core());
pCore = &numaNode.cores.back();
pCore->procGroup = procGroup;
#if !defined(_WIN64)
coreId = (uint32_t)numaNode.cores.size();
if ((coreId * numThreads) >= 32)
{
// Windows doesn't return threadIds >= 32 for a processor group correctly
// when running a 32-bit application.
// Just save -1 as the threadId
threadId = uint32_t(-1);
}
#endif
}
pCore->threadIds.push_back(threadId);
if (procGroup == 0)
{
out_numThreadsPerProcGroup++;
}
}
}
pBuffer = PtrAdd(pBuffer, pBuffer->Size);
}
#elif defined(__linux__) || defined (__gnu_linux__)
// Parse /proc/cpuinfo to get full topology
std::ifstream input("/proc/cpuinfo");
std::string line;
char* c;
uint32_t threadId = uint32_t(-1);
uint32_t coreId = uint32_t(-1);
uint32_t numaId = uint32_t(-1);
while (std::getline(input, line))
{
if (line.find("processor") != std::string::npos)
{
if (threadId != uint32_t(-1))
{
// Save information.
if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
auto& numaNode = out_nodes[numaId];
if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1);
auto& core = numaNode.cores[coreId];
core.procGroup = coreId;
core.threadIds.push_back(threadId);
out_numThreadsPerProcGroup++;
}
auto data_start = line.find(": ") + 2;
threadId = std::strtoul(&line.c_str()[data_start], &c, 10);
continue;
}
if (line.find("core id") != std::string::npos)
{
auto data_start = line.find(": ") + 2;
coreId = std::strtoul(&line.c_str()[data_start], &c, 10);
continue;
}
if (line.find("physical id") != std::string::npos)
{
auto data_start = line.find(": ") + 2;
numaId = std::strtoul(&line.c_str()[data_start], &c, 10);
continue;
}
}
if (threadId != uint32_t(-1))
{
// Save information.
if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
auto& numaNode = out_nodes[numaId];
if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1);
auto& core = numaNode.cores[coreId];
core.procGroup = coreId;
core.threadIds.push_back(threadId);
out_numThreadsPerProcGroup++;
}
for (uint32_t node = 0; node < out_nodes.size(); node++) {
auto& numaNode = out_nodes[node];
auto it = numaNode.cores.begin();
for ( ; it != numaNode.cores.end(); ) {
if (it->threadIds.size() == 0)
it = numaNode.cores.erase(it);
else
++it;
}
}
#else
#error Unsupported platform
#endif
}
void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=false)
{
// Only bind threads when MAX_WORKER_THREADS isn't set.
if (KNOB_MAX_WORKER_THREADS && bindProcGroup == false)
{
return;
}
#if defined(_WIN32)
{
GROUP_AFFINITY affinity = {};
affinity.Group = procGroupId;
#if !defined(_WIN64)
if (threadId >= 32)
{
// In a 32-bit process on Windows it is impossible to bind
// to logical processors 32-63 within a processor group.
// In this case set the mask to 0 and let the system assign
// the processor. Hopefully it will make smart choices.
affinity.Mask = 0;
}
else
#endif
{
// If KNOB_MAX_WORKER_THREADS is set, only bind to the proc group,
// Not the individual HW thread.
if (!KNOB_MAX_WORKER_THREADS)
{
affinity.Mask = KAFFINITY(1) << threadId;
}
}
SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr);
}
#else
cpu_set_t cpuset;
pthread_t thread = pthread_self();
CPU_ZERO(&cpuset);
CPU_SET(threadId, &cpuset);
pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
#endif
}
INLINE
uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext)
{
//uint64_t result = _InterlockedCompareExchange64((volatile __int64*)&pContext->DrawEnqueued, 0, 0);
//return result;
return pContext->DrawEnqueued;
}
INLINE
DRAW_CONTEXT *GetDC(SWR_CONTEXT *pContext, uint64_t drawId)
{
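// drawIds are 1-based, so subtract one before indexing the draw context ring.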
return &pContext->dcRing[(drawId-1) % KNOB_MAX_DRAWS_IN_FLIGHT];
}
// returns true if dependency not met
INLINE
bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint64_t lastRetiredDraw)
{
return (pDC->dependency > lastRetiredDraw);
}
void ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
{
// Load clear color into SIMD register...
float *pClearData = (float*)(pHotTile->clearData);
simdscalar valR = _simd_broadcast_ss(&pClearData[0]);
simdscalar valG = _simd_broadcast_ss(&pClearData[1]);
simdscalar valB = _simd_broadcast_ss(&pClearData[2]);
simdscalar valA = _simd_broadcast_ss(&pClearData[3]);
float *pfBuf = (float*)pHotTile->pBuffer;
uint32_t numSamples = pHotTile->numSamples;
for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
{
for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
{
for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM)
{
_simd_store_ps(pfBuf, valR);
pfBuf += KNOB_SIMD_WIDTH;
_simd_store_ps(pfBuf, valG);
pfBuf += KNOB_SIMD_WIDTH;
_simd_store_ps(pfBuf, valB);
pfBuf += KNOB_SIMD_WIDTH;
_simd_store_ps(pfBuf, valA);
pfBuf += KNOB_SIMD_WIDTH;
}
}
}
}
void ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
{
// Load clear color into SIMD register...
float *pClearData = (float*)(pHotTile->clearData);
simdscalar valZ = _simd_broadcast_ss(&pClearData[0]);
float *pfBuf = (float*)pHotTile->pBuffer;
uint32_t numSamples = pHotTile->numSamples;
for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
{
for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
{
for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM)
{
_simd_store_ps(pfBuf, valZ);
pfBuf += KNOB_SIMD_WIDTH;
}
}
}
}
void ClearStencilHotTile(const HOTTILE* pHotTile)
{
// convert from F32 to U8.
uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
//broadcast 32x into __m256i...
simdscalari valS = _simd_set1_epi8(clearVal);
simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer;
uint32_t numSamples = pHotTile->numSamples;
for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
{
for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
{
// Stencil is 8bpp, so each 32-bit lane holds 4 samples; advance the sample counter 4 times as quickly.
for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4)
{
_simd_store_si(pBuf, valS);
pBuf += 1;
}
}
}
}
// For draw calls, we initialize the active hot tiles and perform a deferred
// load on them if the tile is in an invalid state. We do this in the outer thread loop
// instead of inside the draw routine itself, mainly for performance: it avoids
// unnecessary setup for every triangle.
// @todo support deferred clear
INLINE
void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, const TRIANGLE_WORK_DESC* pWork)
{
const API_STATE& state = GetApiState(pDC);
HotTileMgr *pHotTileMgr = pContext->pHotTileMgr;
uint32_t x, y;
MacroTileMgr::getTileIndices(macroID, x, y);
x *= KNOB_MACROTILE_X_DIM;
y *= KNOB_MACROTILE_Y_DIM;
uint32_t numSamples = GetNumSamples(state.rastState.sampleCount);
// check RT if enabled
unsigned long rtSlot = 0;
uint32_t colorHottileEnableMask = state.colorHottileEnable;
while(_BitScanForward(&rtSlot, colorHottileEnableMask))
{
HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples);
if (pHotTile->state == HOTTILE_INVALID)
{
RDTSC_START(BELoadTiles);
// invalid hottile before draw requires a load from surface before we can draw to it
pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
pHotTile->state = HOTTILE_DIRTY;
RDTSC_STOP(BELoadTiles, 0, 0);
}
else if (pHotTile->state == HOTTILE_CLEAR)
{
RDTSC_START(BELoadTiles);
// Clear the tile.
ClearColorHotTile(pHotTile);
pHotTile->state = HOTTILE_DIRTY;
RDTSC_STOP(BELoadTiles, 0, 0);
}
colorHottileEnableMask &= ~(1 << rtSlot);
}
// check depth if enabled
if (state.depthHottileEnable)
{
HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
if (pHotTile->state == HOTTILE_INVALID)
{
RDTSC_START(BELoadTiles);
// invalid hottile before draw requires a load from surface before we can draw to it
pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
pHotTile->state = HOTTILE_DIRTY;
RDTSC_STOP(BELoadTiles, 0, 0);
}
else if (pHotTile->state == HOTTILE_CLEAR)
{
RDTSC_START(BELoadTiles);
// Clear the tile.
ClearDepthHotTile(pHotTile);
pHotTile->state = HOTTILE_DIRTY;
RDTSC_STOP(BELoadTiles, 0, 0);
}
}
// check stencil if enabled
if (state.stencilHottileEnable)
{
HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
if (pHotTile->state == HOTTILE_INVALID)
{
RDTSC_START(BELoadTiles);
// invalid hottile before draw requires a load from surface before we can draw to it
pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
pHotTile->state = HOTTILE_DIRTY;
RDTSC_STOP(BELoadTiles, 0, 0);
}
else if (pHotTile->state == HOTTILE_CLEAR)
{
RDTSC_START(BELoadTiles);
// Clear the tile.
ClearStencilHotTile(pHotTile);
pHotTile->state = HOTTILE_DIRTY;
RDTSC_STOP(BELoadTiles, 0, 0);
}
}
}
INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
{
// increment our current draw id to the first incomplete draw
uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
while (curDrawBE < drawEnqueued)
{
DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
// If it's not compute and FE is not done, then break out of the loop.
if (!pDC->doneFE && !pDC->isCompute) break;
bool isWorkComplete = (pDC->isCompute) ?
pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete();
if (isWorkComplete)
{
curDrawBE++;
InterlockedIncrement(&pDC->threadsDoneBE);
}
else
{
break;
}
}
// If there are no more incomplete draws then return false.
return curDrawBE < drawEnqueued;
}
//////////////////////////////////////////////////////////////////////////
/// @brief If there is any BE work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
/// has its own curDrawBE counter and this ensures that each worker processes all the
/// draws in order.
/// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its
/// own set, and each time it fails to lock a macrotile because it's already locked,
/// it adds that tile to the lockedTiles set. As a worker begins to work
/// on future draws, the lockedTiles set ensures that it doesn't work on tiles that may
/// still have work pending in a previous draw. Additionally, the lockedTiles set is a
/// heuristic that can steer a worker back to the same macrotile that it had been
/// working on in a previous draw.
void WorkOnFifoBE(
SWR_CONTEXT *pContext,
uint32_t workerId,
uint64_t &curDrawBE,
std::unordered_set<uint32_t>& lockedTiles)
{
// Find the first incomplete draw that has pending work. If no such draw is found then
// return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
{
return;
}
uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
// Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
lockedTiles.clear();
// Try to work on each draw in order of the available draws in flight.
// 1. If we're on curDrawBE, we can work on any macrotile that is available.
// 2. If we're trying to work on draws after curDrawBE, we are restricted to
// working on those macrotiles that are known to be complete in the prior draw to
// maintain order. The locked tiles provide the history that ensures this.
for (uint64_t i = curDrawBE; i < GetEnqueuedDraw(pContext); ++i)
{
DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
if (pDC->isCompute) return; // We don't look at compute work.
// First wait for FE to be finished with this draw. This keeps threading model simple
// but if there are lots of bubbles between draws then serializing FE and BE may
// need to be revisited.
if (!pDC->doneFE) return;
// If this draw is dependent on a previous draw then we need to bail.
if (CheckDependency(pContext, pDC, lastRetiredDraw))
{
return;
}
// Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
std::vector<uint32_t> &macroTiles = pDC->pTileMgr->getDirtyTiles();
for (uint32_t tileID : macroTiles)
{
MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID);
// can only work on this draw if it's not in use by other threads
if (lockedTiles.find(tileID) == lockedTiles.end())
{
if (tile.getNumQueued())
{
if (tile.tryLock())
{
BE_WORK *pWork;
RDTSC_START(WorkerFoundWork);
uint32_t numWorkItems = tile.getNumQueued();
if (numWorkItems != 0)
{
pWork = tile.peek();
SWR_ASSERT(pWork);
if (pWork->type == DRAW)
{
InitializeHotTiles(pContext, pDC, tileID, (const TRIANGLE_WORK_DESC*)&pWork->desc);
}
}
while ((pWork = tile.peek()) != nullptr)
{
pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
tile.dequeue();
}
RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);
_ReadWriteBarrier();
pDC->pTileMgr->markTileComplete(tileID);
// Optimization: If the draw is complete and we're the last one to have worked on it then
// we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
{
// We can increment the current BE and safely move to next draw since we know this draw is complete.
curDrawBE++;
InterlockedIncrement(&pDC->threadsDoneBE);
lastRetiredDraw++;
lockedTiles.clear();
break;
}
}
else
{
// This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
lockedTiles.insert(tileID);
}
}
}
}
}
}
void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode)
{
// Try to grab the next DC from the ring
uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
while (curDrawFE < drawEnqueued)
{
uint32_t dcSlot = curDrawFE % KNOB_MAX_DRAWS_IN_FLIGHT;
DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
if (pDC->isCompute || pDC->doneFE || pDC->FeLock)
{
curDrawFE++;
InterlockedIncrement(&pDC->threadsDoneFE);
}
else
{
break;
}
}
uint64_t curDraw = curDrawFE;
while (curDraw < drawEnqueued)
{
uint32_t dcSlot = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT;
DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
if (!pDC->isCompute && !pDC->FeLock)
{
uint32_t initial = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0);
if (initial == 0)
{
// successfully grabbed the DC, now run the FE
pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc);
_ReadWriteBarrier();
pDC->doneFE = true;
}
}
curDraw++;
}
}
//////////////////////////////////////////////////////////////////////////
/// @brief If there is any compute work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
/// has its own curDrawBE counter and this ensures that each worker processes all the
/// draws in order.
void WorkOnCompute(
SWR_CONTEXT *pContext,
uint32_t workerId,
uint64_t& curDrawBE)
{
if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
{
return;
}
uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
if (pDC->isCompute == false) return;
// check dependencies
if (CheckDependency(pContext, pDC, lastRetiredDraw))
{
return;
}
SWR_ASSERT(pDC->pDispatch != nullptr);
DispatchQueue& queue = *pDC->pDispatch;
// Is there any work remaining?
if (queue.getNumQueued() > 0)
{
bool lastToComplete = false;
uint32_t threadGroupId = 0;
while (queue.getWork(threadGroupId))
{
ProcessComputeBE(pDC, workerId, threadGroupId);
lastToComplete = queue.finishedWork();
}
_ReadWriteBarrier();
if (lastToComplete)
{
SWR_ASSERT(queue.isWorkComplete() == true);
pDC->doneCompute = true;
}
}
}
DWORD workerThreadMain(LPVOID pData)
{
THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
SWR_CONTEXT *pContext = pThreadData->pContext;
uint32_t threadId = pThreadData->threadId;
uint32_t workerId = pThreadData->workerId;
bindThread(threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);
RDTSC_INIT(threadId);
int numaNode = (int)pThreadData->numaId;
// flush denormals to 0
_mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
// Track tiles locked by other threads. If we try to lock a macrotile and find its already
// locked then we'll add it to this list so that we don't try and lock it again.
std::unordered_set<uint32_t> lockedTiles;
// Each worker has the ability to work on any of the queued draws as long as certain
// conditions are met. The data associated with a draw is guaranteed to be active as
// long as a worker hasn't signaled that it has moved on to the next draw, which it
// does when it determines there is no more work to do. The API thread will not
// increment the head of the dc ring until all workers have moved past the current head.
// The logic to determine what to work on is:
// 1- Try to work on the FE of any draw that is queued. For now there are no dependencies
//    on the FE work, so any worker can grab any FE and process it in parallel. Eventually
//    we'll need dependency tracking to force serialization on FEs. The worker will try
//    to pick an FE by atomically incrementing a counter in the swr context. It will keep
//    trying until it reaches the tail.
// 2- BE work must be done in strict order. We accomplish this today by pulling work off
//    the oldest draw (i.e. the head) of the dcRing. The worker can determine if there is
//    any work left by comparing the total # of binned work items and the total # of completed
//    work items. If they are equal, then there is no more work to do for this draw, and
//    the worker can safely increment its oldestDraw counter and move on to the next draw.
std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);
auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->DrawEnqueued; };
uint64_t curDrawBE = 1;
uint64_t curDrawFE = 1;
while (pContext->threadPool.inThreadShutdown == false)
{
uint32_t loop = 0;
while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
{
_mm_pause();
}
if (!threadHasWork(curDrawBE))
{
lock.lock();
// check for thread idle condition again under lock
if (threadHasWork(curDrawBE))
{
lock.unlock();
continue;
}
if (pContext->threadPool.inThreadShutdown)
{
lock.unlock();
break;
}
RDTSC_START(WorkerWaitForThreadEvent);
pContext->FifosNotEmpty.wait(lock);
lock.unlock();
RDTSC_STOP(WorkerWaitForThreadEvent, 0, 0);
if (pContext->threadPool.inThreadShutdown)
{
break;
}
}
RDTSC_START(WorkerWorkOnFifoBE);
WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles);
RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
WorkOnCompute(pContext, workerId, curDrawBE);
WorkOnFifoFE(pContext, workerId, curDrawFE, numaNode);
}
return 0;
}
DWORD workerThreadInit(LPVOID pData)
{
#if defined(_WIN32)
__try
#endif // _WIN32
{
return workerThreadMain(pData);
}
#if defined(_WIN32)
__except(EXCEPTION_CONTINUE_SEARCH)
{
}
#endif // _WIN32
return 1;
}
void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
{
bindThread(0);
CPUNumaNodes nodes;
uint32_t numThreadsPerProcGroup = 0;
CalculateProcessorTopology(nodes, numThreadsPerProcGroup);
uint32_t numHWNodes = (uint32_t)nodes.size();
uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size();
uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();
uint32_t numNodes = numHWNodes;
uint32_t numCoresPerNode = numHWCoresPerNode;
uint32_t numHyperThreads = numHWHyperThreads;
if (KNOB_MAX_NUMA_NODES)
{
numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES);
}
if (KNOB_MAX_CORES_PER_NUMA_NODE)
{
numCoresPerNode = std::min(numCoresPerNode, KNOB_MAX_CORES_PER_NUMA_NODE);
}
if (KNOB_MAX_THREADS_PER_CORE)
{
numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE);
}
// Calculate numThreads
uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
if (KNOB_MAX_WORKER_THREADS)
{
uint32_t maxHWThreads = numHWNodes * numHWCoresPerNode * numHWHyperThreads;
numThreads = std::min(KNOB_MAX_WORKER_THREADS, maxHWThreads);
}
if (numThreads > KNOB_MAX_NUM_THREADS)
{
printf("WARNING: system thread count %u exceeds max %u, "
"performance will be degraded\n",
numThreads, KNOB_MAX_NUM_THREADS);
}
if (numThreads == 1)
{
// If only 1 worker thread, try to move it to an available
// HW thread. If that fails, use the API thread.
if (numCoresPerNode < numHWCoresPerNode)
{
numCoresPerNode++;
}
else if (numHyperThreads < numHWHyperThreads)
{
numHyperThreads++;
}
else if (numNodes < numHWNodes)
{
numNodes++;
}
else
{
pPool->numThreads = 0;
SET_KNOB(SINGLE_THREADED, true);
return;
}
}
else
{
// Save a HW thread for the API thread.
numThreads--;
}
pPool->numThreads = numThreads;
pContext->NumWorkerThreads = pPool->numThreads;
pPool->inThreadShutdown = false;
pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
if (KNOB_MAX_WORKER_THREADS)
{
bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup;
// When MAX_WORKER_THREADS is set we don't bother to bind to specific HW threads
// But Windows will still require binding to specific process groups
for (uint32_t workerId = 0; workerId < numThreads; ++workerId)
{
pPool->pThreadData[workerId].workerId = workerId;
pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups;
pPool->pThreadData[workerId].threadId = 0;
pPool->pThreadData[workerId].numaId = 0;
pPool->pThreadData[workerId].pContext = pContext;
pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]);
}
}
else
{
uint32_t workerId = 0;
for (uint32_t n = 0; n < numNodes; ++n)
{
auto& node = nodes[n];
uint32_t numCores = numCoresPerNode;
for (uint32_t c = 0; c < numCores; ++c)
{
auto& core = node.cores[c];
for (uint32_t t = 0; t < numHyperThreads; ++t)
{
if (c == 0 && n == 0 && t == 0)
{
// Skip core 0, thread0 on node 0 to reserve for API thread
continue;
}
pPool->pThreadData[workerId].workerId = workerId;
pPool->pThreadData[workerId].procGroupId = core.procGroup;
pPool->pThreadData[workerId].threadId = core.threadIds[t];
pPool->pThreadData[workerId].numaId = n;
pPool->pThreadData[workerId].pContext = pContext;
pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]);
++workerId;
}
}
}
}
}
void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
{
if (!KNOB_SINGLE_THREADED)
{
// Inform threads to finish up
std::unique_lock<std::mutex> lock(pContext->WaitLock);
pPool->inThreadShutdown = true;
_mm_mfence();
pContext->FifosNotEmpty.notify_all();
lock.unlock();
// Wait for threads to finish and destroy them
for (uint32_t t = 0; t < pPool->numThreads; ++t)
{
pPool->threads[t]->join();
delete(pPool->threads[t]);
}
// Clean up data used by threads
free(pPool->pThreadData);
}
}

View File

@ -0,0 +1,63 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file threads.h
*
* @brief Definitions for SWR threading model.
*
******************************************************************************/
#pragma once
#include "knobs.h"
#include <unordered_set>
#include <thread>
typedef std::thread* THREAD_PTR;
struct SWR_CONTEXT;
struct THREAD_DATA
{
uint32_t procGroupId; // Will always be 0 for non-Windows OS
uint32_t threadId; // within the procGroup for Windows
uint32_t numaId; // NUMA node id
uint32_t workerId;
SWR_CONTEXT *pContext;
bool forceBindProcGroup; // Only useful when KNOB_MAX_WORKER_THREADS is set.
};
struct THREAD_POOL
{
THREAD_PTR threads[KNOB_MAX_NUM_THREADS];
uint32_t numThreads;
volatile bool inThreadShutdown;
THREAD_DATA *pThreadData;
};
void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
// Expose FE and BE worker functions to the API thread if single threaded
void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode);
void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, std::unordered_set<uint32_t> &usedTiles);
void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);

View File

@ -0,0 +1,105 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file tilemgr.cpp
*
* @brief Implementation for Macro Tile Manager which provides the facilities
* for threads to work on a macro tile.
*
******************************************************************************/
#include <unordered_map>
#include "fifo.hpp"
#include "tilemgr.h"
#define TILE_ID(x,y) (((x) << 16) | (y))
// override new/delete for alignment
void *MacroTileMgr::operator new(size_t size)
{
return _aligned_malloc(size, 64);
}
void MacroTileMgr::operator delete(void *p)
{
_aligned_free(p);
}
void* DispatchQueue::operator new(size_t size)
{
return _aligned_malloc(size, 64);
}
void DispatchQueue::operator delete(void *p)
{
_aligned_free(p);
}
MacroTileMgr::MacroTileMgr(Arena& arena) : mArena(arena)
{
}
void MacroTileMgr::initialize()
{
mWorkItemsProduced = 0;
mWorkItemsConsumed = 0;
mDirtyTiles.clear();
}
void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork)
{
// Should not enqueue more than what we have backing for in the hot tile manager.
SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
uint32_t id = TILE_ID(x, y);
MacroTileQueue &tile = mTiles[id];
tile.mWorkItemsFE++;
if (tile.mWorkItemsFE == 1)
{
tile.clear(mArena);
mDirtyTiles.push_back(id);
}
mWorkItemsProduced++;
tile.enqueue_try_nosync(mArena, pWork);
}
void MacroTileMgr::markTileComplete(uint32_t id)
{
SWR_ASSERT(mTiles.find(id) != mTiles.end());
MacroTileQueue &tile = mTiles[id];
uint32_t numTiles = tile.mWorkItemsFE;
InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles);
_ReadWriteBarrier();
tile.mWorkItemsBE += numTiles;
SWR_ASSERT(tile.mWorkItemsFE == tile.mWorkItemsBE);
// clear out tile, but defer fifo clear until the next DC first queues to it.
// this prevents worker threads from constantly locking a completed macro tile
tile.mWorkItemsFE = 0;
tile.mWorkItemsBE = 0;
}

View File

@ -0,0 +1,390 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file tilemgr.h
*
* @brief Definitions for Macro Tile Manager which provides the facilities
* for threads to work on a macro tile.
*
******************************************************************************/
#pragma once
#include <set>
#include <unordered_map>
#include "common/formats.h"
#include "fifo.hpp"
#include "context.h"
#include "format_traits.h"
//////////////////////////////////////////////////////////////////////////
/// MacroTile - work queue for a tile.
//////////////////////////////////////////////////////////////////////////
struct MacroTileQueue
{
MacroTileQueue() { }
~MacroTileQueue() { }
//////////////////////////////////////////////////////////////////////////
/// @brief Returns number of work items queued for this tile.
uint32_t getNumQueued()
{
return mFifo.getNumQueued();
}
//////////////////////////////////////////////////////////////////////////
/// @brief Attempt to lock the work fifo. If already locked then return false.
bool tryLock()
{
return mFifo.tryLock();
}
//////////////////////////////////////////////////////////////////////////
/// @brief Clear fifo and unlock it.
void clear(Arena& arena)
{
mFifo.clear(arena);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Peek at work sitting at the front of the fifo.
BE_WORK* peek()
{
return mFifo.peek();
}
bool enqueue_try_nosync(Arena& arena, const BE_WORK* entry)
{
return mFifo.enqueue_try_nosync(arena, entry);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Move to next work item
void dequeue()
{
mFifo.dequeue_noinc();
}
//////////////////////////////////////////////////////////////////////////
/// @brief Destroy fifo
void destroy()
{
mFifo.destroy();
}
///@todo This will all be private.
uint32_t mWorkItemsFE = 0;
uint32_t mWorkItemsBE = 0;
private:
QUEUE<BE_WORK> mFifo;
};
//////////////////////////////////////////////////////////////////////////
/// MacroTileMgr - Manages macrotiles for a draw.
//////////////////////////////////////////////////////////////////////////
class MacroTileMgr
{
public:
MacroTileMgr(Arena& arena);
~MacroTileMgr()
{
for (auto &tile : mTiles)
{
tile.second.destroy();
}
}
void initialize();
INLINE std::vector<uint32_t>& getDirtyTiles() { return mDirtyTiles; }
INLINE MacroTileQueue& getMacroTileQueue(uint32_t id) { return mTiles[id]; }
void markTileComplete(uint32_t id);
INLINE bool isWorkComplete()
{
return mWorkItemsProduced == mWorkItemsConsumed;
}
void enqueue(uint32_t x, uint32_t y, BE_WORK *pWork);
static INLINE void getTileIndices(uint32_t tileID, uint32_t &x, uint32_t &y)
{
y = tileID & 0xffff;
x = (tileID >> 16) & 0xffff;
}
void *operator new(size_t size);
void operator delete (void *p);
private:
Arena& mArena;
SWR_FORMAT mFormat;
std::unordered_map<uint32_t, MacroTileQueue> mTiles;
// Any tile that has work queued to it is a dirty tile.
std::vector<uint32_t> mDirtyTiles;
OSALIGNLINE(LONG) mWorkItemsProduced;
OSALIGNLINE(volatile LONG) mWorkItemsConsumed;
};
//////////////////////////////////////////////////////////////////////////
/// DispatchQueue - work queue for dispatch
//////////////////////////////////////////////////////////////////////////
class DispatchQueue
{
public:
DispatchQueue() {}
//////////////////////////////////////////////////////////////////////////
/// @brief Setup the producer consumer counts.
void initialize(uint32_t totalTasks, void* pTaskData)
{
// The available and outstanding counts start with total tasks.
// At the start there are N tasks available and outstanding.
// When both the available and outstanding counts have reached 0 then all work has completed.
// When a worker starts on a threadgroup then it decrements the available count.
// When a worker completes a threadgroup then it decrements the outstanding count.
mTasksAvailable = totalTasks;
mTasksOutstanding = totalTasks;
mpTaskData = pTaskData;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Returns number of tasks available for this dispatch.
uint32_t getNumQueued()
{
return (mTasksAvailable > 0) ? mTasksAvailable : 0;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Atomically decrement the work available count. If the result
///        is non-negative then we can work on the associated thread group.
///        Otherwise, there is no more work to do.
bool getWork(uint32_t& groupId)
{
LONG result = InterlockedDecrement(&mTasksAvailable);
if (result >= 0)
{
groupId = result;
return true;
}
return false;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Atomically decrement the outstanding count. A worker is notifying
/// us that it just finished some work. Also, return true if we're
/// the last worker to complete this dispatch.
bool finishedWork()
{
LONG result = InterlockedDecrement(&mTasksOutstanding);
SWR_ASSERT(result >= 0, "Should never oversubscribe work");
return (result == 0) ? true : false;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Work is complete once both the available/outstanding counts have reached 0.
bool isWorkComplete()
{
return ((mTasksAvailable <= 0) &&
(mTasksOutstanding <= 0));
}
//////////////////////////////////////////////////////////////////////////
/// @brief Return pointer to task data.
const void* GetTasksData()
{
return mpTaskData;
}
void *operator new(size_t size);
void operator delete (void *p);
void* mpTaskData; // The API thread will set this up and the callback task function will interpret this.
OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 };
OSALIGNLINE(volatile LONG) mTasksOutstanding{ 0 };
};
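The available/outstanding counter pair lets workers drain a dispatch without locks: getWork claims thread groups until the available count underflows, and the final finishedWork call observes completion. A stripped-down sketch of the intended flow, mirroring WorkOnCompute earlier in this commit; DoThreadGroupWork and the surrounding names are illustrative:

DispatchQueue q;
q.initialize(numThreadGroups, pTaskData); // both counters start at numThreadGroups

// In each worker:
uint32_t groupId;
bool lastToComplete = false;
while (q.getWork(groupId)) // atomically claims one thread group
{
    DoThreadGroupWork(groupId); // hypothetical per-group callback
    lastToComplete = q.finishedWork(); // true only for the final group
}
if (lastToComplete)
{
    // All groups have retired; safe to mark the dispatch complete.
}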
enum HOTTILE_STATE
{
HOTTILE_INVALID, // tile is in an uninitialized state and should be loaded with surface contents before rendering
HOTTILE_CLEAR, // tile should be cleared
HOTTILE_DIRTY, // tile has been rendered to
HOTTILE_RESOLVED, // tile has been stored to memory
};
struct HOTTILE
{
BYTE *pBuffer;
HOTTILE_STATE state;
DWORD clearData[4]; // May need to change based on pfnClearTile implementation. Reorder for alignment?
uint32_t numSamples;
uint32_t renderTargetArrayIndex; // current render target array index loaded
};
union HotTileSet
{
struct
{
HOTTILE Color[SWR_NUM_RENDERTARGETS];
HOTTILE Depth;
HOTTILE Stencil;
};
HOTTILE Attachment[SWR_NUM_ATTACHMENTS];
};
class HotTileMgr
{
public:
HotTileMgr()
{
memset(&mHotTiles[0][0], 0, sizeof(mHotTiles));
// cache hottile size
for (uint32_t i = SWR_ATTACHMENT_COLOR0; i <= SWR_ATTACHMENT_COLOR7; ++i)
{
mHotTileSize[i] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
}
mHotTileSize[SWR_ATTACHMENT_DEPTH] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
mHotTileSize[SWR_ATTACHMENT_STENCIL] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
}
~HotTileMgr()
{
for (int x = 0; x < KNOB_NUM_HOT_TILES_X; ++x)
{
for (int y = 0; y < KNOB_NUM_HOT_TILES_Y; ++y)
{
for (int a = 0; a < SWR_NUM_ATTACHMENTS; ++a)
{
if (mHotTiles[x][y].Attachment[a].pBuffer != NULL)
{
_aligned_free(mHotTiles[x][y].Attachment[a].pBuffer);
mHotTiles[x][y].Attachment[a].pBuffer = NULL;
}
}
}
}
}
HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1,
uint32_t renderTargetArrayIndex = 0)
{
uint32_t x, y;
MacroTileMgr::getTileIndices(macroID, x, y);
assert(x < KNOB_NUM_HOT_TILES_X);
assert(y < KNOB_NUM_HOT_TILES_Y);
HotTileSet &tile = mHotTiles[x][y];
HOTTILE& hotTile = tile.Attachment[attachment];
if (hotTile.pBuffer == NULL)
{
if (create)
{
uint32_t size = numSamples * mHotTileSize[attachment];
hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
hotTile.state = HOTTILE_INVALID;
hotTile.numSamples = numSamples;
hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
}
else
{
return NULL;
}
}
else
{
// free the old tile and create a new one with enough space to hold all samples
if (numSamples > hotTile.numSamples)
{
// tile should be either uninitialized or resolved if we're deleting and switching to a
// new sample count
assert((hotTile.state == HOTTILE_INVALID) ||
(hotTile.state == HOTTILE_RESOLVED) ||
(hotTile.state == HOTTILE_CLEAR));
_aligned_free(hotTile.pBuffer);
uint32_t size = numSamples * mHotTileSize[attachment];
hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
hotTile.state = HOTTILE_INVALID;
hotTile.numSamples = numSamples;
}
// if requested render target array index isn't currently loaded, need to store out the current hottile
// and load the requested array slice
if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex)
{
SWR_FORMAT format;
switch (attachment)
{
case SWR_ATTACHMENT_COLOR0:
case SWR_ATTACHMENT_COLOR1:
case SWR_ATTACHMENT_COLOR2:
case SWR_ATTACHMENT_COLOR3:
case SWR_ATTACHMENT_COLOR4:
case SWR_ATTACHMENT_COLOR5:
case SWR_ATTACHMENT_COLOR6:
case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break;
case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break;
case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break;
default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break;
}
if (hotTile.state == HOTTILE_DIRTY)
{
pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment,
x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer);
}
pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment,
x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer);
hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
hotTile.state = HOTTILE_DIRTY;
}
}
return &tile.Attachment[attachment];
}
HotTileSet &GetHotTile(uint32_t macroID)
{
uint32_t x, y;
MacroTileMgr::getTileIndices(macroID, x, y);
assert(x < KNOB_NUM_HOT_TILES_X);
assert(y < KNOB_NUM_HOT_TILES_Y);
return mHotTiles[x][y];
}
private:
HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y];
uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS];
};

View File

@ -0,0 +1,148 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file utils.cpp
*
* @brief Utilities used by SWR core.
*
******************************************************************************/
#if defined(_WIN32)
#include <Windows.h>
#include <Gdiplus.h>
#include <Gdiplusheaders.h>
#include <cstdint>
using namespace Gdiplus;
int GetEncoderClsid(const WCHAR* format, CLSID* pClsid)
{
uint32_t num = 0; // number of image encoders
uint32_t size = 0; // size of the image encoder array in bytes
ImageCodecInfo* pImageCodecInfo = nullptr;
GetImageEncodersSize(&num, &size);
if(size == 0)
return -1; // Failure
pImageCodecInfo = (ImageCodecInfo*)(malloc(size));
if(pImageCodecInfo == nullptr)
return -1; // Failure
GetImageEncoders(num, size, pImageCodecInfo);
for(uint32_t j = 0; j < num; ++j)
{
if( wcscmp(pImageCodecInfo[j].MimeType, format) == 0 )
{
*pClsid = pImageCodecInfo[j].Clsid;
free(pImageCodecInfo);
return j; // Success
}
}
free(pImageCodecInfo);
return -1; // Failure
}
void SaveImageToPNGFile(
const WCHAR *pFilename,
void *pBuffer,
uint32_t width,
uint32_t height)
{
// dump pixels to a png
// Initialize GDI+.
GdiplusStartupInput gdiplusStartupInput;
ULONG_PTR gdiplusToken;
GdiplusStartup(&gdiplusToken, &gdiplusStartupInput, nullptr);
Bitmap *bitmap = new Bitmap(width, height);
BYTE *pBytes = (BYTE*)pBuffer;
static const uint32_t bytesPerPixel = 4;
for (uint32_t y = 0; y < height; ++y)
for (uint32_t x = 0; x < width; ++x)
{
uint32_t pixel = *(uint32_t*)pBytes;
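// Visualization aid: 0xcdcdcdcd is the MSVC debug-heap fill pattern for
// uninitialized memory (drawn magenta) and 0xdddddddd the fill pattern for
// freed memory (drawn translucent red); all other pixels are forced opaque.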
if (pixel == 0xcdcdcdcd)
{
pixel = 0xFFFF00FF;
}
else if (pixel == 0xdddddddd)
{
pixel = 0x80FF0000;
}
else
{
pixel |= 0xFF000000;
}
Color color(pixel);
bitmap->SetPixel(x, y, color);
pBytes += bytesPerPixel;
}
// Save image.
CLSID pngClsid;
GetEncoderClsid(L"image/png", &pngClsid);
bitmap->Save(pFilename, &pngClsid, nullptr);
delete bitmap;
GdiplusShutdown(gdiplusToken);
}
void OpenBitmapFromFile(
const WCHAR *pFilename,
void **pBuffer,
uint32_t *width,
uint32_t *height)
{
GdiplusStartupInput gdiplusStartupInput;
ULONG_PTR gdiplusToken;
GdiplusStartup(&gdiplusToken, &gdiplusStartupInput, nullptr);
Bitmap *bitmap = new Bitmap(pFilename);
*width = bitmap->GetWidth();
*height = bitmap->GetHeight();
*pBuffer = new BYTE[*width * *height * 4]; // width * height * |RGBA|
// Note: the 'stb_image' folder provides a PNG reader/writer that is far
// less painful to use than GDI+.
Gdiplus::Color clr;
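// Copy pixels out in BGRA byte order, flipping the image vertically
// (GDI+ bitmaps are top-down; the destination buffer is bottom-up).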
for (uint32_t y = 0, idx = 0; y < *height; ++y)
{
for (uint32_t x = 0; x < *width; ++x, idx += 4)
{
bitmap->GetPixel(x, *height - y - 1, &clr);
((BYTE*)*pBuffer)[idx + 0] = clr.GetBlue();
((BYTE*)*pBuffer)[idx + 1] = clr.GetGreen();
((BYTE*)*pBuffer)[idx + 2] = clr.GetRed();
((BYTE*)*pBuffer)[idx + 3] = clr.GetAlpha();
}
}
delete bitmap;
bitmap = 0;
}
#endif

View File

@ -0,0 +1,831 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file utils.h
*
* @brief Utilities used by SWR core.
*
******************************************************************************/
#pragma once
#include <string.h>
#include "common/os.h"
#include "common/simdintrin.h"
#include "common/swr_assert.h"
#if defined(_WIN32)
void SaveImageToPNGFile(
const WCHAR *pFilename,
void *pBuffer,
uint32_t width,
uint32_t height);
void OpenBitmapFromFile(
const WCHAR *pFilename,
void **pBuffer,
uint32_t *width,
uint32_t *height);
#endif
/// @todo Assume Linux is always 64-bit.
#if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__)
#define _MM_INSERT_EPI64 _mm_insert_epi64
#define _MM_EXTRACT_EPI64 _mm_extract_epi64
#else
INLINE INT64 _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
{
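// Fallback for 32-bit builds, where the 64-bit pextrq/pinsrq intrinsics are
// unavailable: spill the vector to memory and reassemble the selected qword
// from two dwords. ndx must be 0 or 1.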
OSALIGNLINE(uint32_t) elems[4];
_mm_store_si128((__m128i*)elems, a);
if (ndx == 0)
{
uint64_t foo = elems[0];
foo |= (uint64_t)elems[1] << 32;
return foo;
}
else
{
uint64_t foo = elems[2];
foo |= (uint64_t)elems[3] << 32;
return foo;
}
}
INLINE __m128i _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx)
{
OSALIGNLINE(int64_t) elems[2];
_mm_store_si128((__m128i*)elems, a);
if (ndx == 0)
{
elems[0] = b;
}
else
{
elems[1] = b;
}
__m128i out;
out = _mm_load_si128((const __m128i*)elems);
return out;
}
#endif
OSALIGNLINE(struct) BBOX
{
int top, bottom, left, right;
BBOX() {}
BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {}
bool operator==(const BBOX& rhs)
{
return (this->top == rhs.top &&
this->bottom == rhs.bottom &&
this->left == rhs.left &&
this->right == rhs.right);
}
bool operator!=(const BBOX& rhs)
{
return !(*this == rhs);
}
};
struct simdBBox
{
simdscalari top, bottom, left, right;
};
INLINE
void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
{
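// Standard 4x4 32-bit matrix transpose: interleave 32-bit elements of row
// pairs with unpacklo/unpackhi, then interleave the resulting 64-bit halves.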
__m128i row0i = _mm_castps_si128(row0);
__m128i row1i = _mm_castps_si128(row1);
__m128i row2i = _mm_castps_si128(row2);
__m128i row3i = _mm_castps_si128(row3);
__m128i vTemp = row2i;
row2i = _mm_unpacklo_epi32(row2i, row3i);
vTemp = _mm_unpackhi_epi32(vTemp, row3i);
row3i = row0i;
row0i = _mm_unpacklo_epi32(row0i, row1i);
row3i = _mm_unpackhi_epi32(row3i, row1i);
row1i = row0i;
row0i = _mm_unpacklo_epi64(row0i, row2i);
row1i = _mm_unpackhi_epi64(row1i, row2i);
row2i = row3i;
row2i = _mm_unpacklo_epi64(row2i, vTemp);
row3i = _mm_unpackhi_epi64(row3i, vTemp);
row0 = _mm_castsi128_ps(row0i);
row1 = _mm_castsi128_ps(row1i);
row2 = _mm_castsi128_ps(row2i);
row3 = _mm_castsi128_ps(row3i);
}
INLINE
void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
{
__m128i vTemp = row2;
row2 = _mm_unpacklo_epi32(row2, row3);
vTemp = _mm_unpackhi_epi32(vTemp, row3);
row3 = row0;
row0 = _mm_unpacklo_epi32(row0, row1);
row3 = _mm_unpackhi_epi32(row3, row1);
row1 = row0;
row0 = _mm_unpacklo_epi64(row0, row2);
row1 = _mm_unpackhi_epi64(row1, row2);
row2 = row3;
row2 = _mm_unpacklo_epi64(row2, vTemp);
row3 = _mm_unpackhi_epi64(row3, vTemp);
}
#define GCC_VERSION (__GNUC__ * 10000 \
+ __GNUC_MINOR__ * 100 \
+ __GNUC_PATCHLEVEL__)
#if defined(__GNUC__) && (GCC_VERSION < 40900)
#define _mm_undefined_ps _mm_setzero_ps
#define _mm_undefined_si128 _mm_setzero_si128
#if KNOB_SIMD_WIDTH == 8
#define _mm256_undefined_ps _mm256_setzero_ps
#endif
#endif
#if KNOB_SIMD_WIDTH == 8
INLINE
void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2)
{
__m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
__m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5
__m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
__m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps()); //y2w2y3w3 y6w6y7w7
__m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
__m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
}
INLINE
void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
{
__m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
__m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5
__m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
__m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3); //y2w2y3w3 y6w6y7w7
__m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
__m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
}
INLINE
void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
{
__m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
__m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
__m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
__m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
__m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
__m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
__m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
__m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
__m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
__m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
__m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
__m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
__m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
__m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
__m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
__m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
}
INLINE
void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
{
vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
_mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
}
#endif
//////////////////////////////////////////////////////////////////////////
/// TransposeSingleComponent
//////////////////////////////////////////////////////////////////////////
template<uint32_t bpp>
struct TransposeSingleComponent
{
//////////////////////////////////////////////////////////////////////////
/// @brief Pass-thru for single component.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
{
memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8_8
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
{
simdscalari src = _simd_load_si((const simdscalari*)pSrc);
#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH == KNOB_ARCH_AVX
__m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg
__m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa
__m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
__m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
__m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
__m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa
__m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba
__m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba
_mm_store_si128((__m128i*)pDst, c0123lo);
_mm_store_si128((__m128i*)(pDst + 16), c0123hi);
#elif KNOB_ARCH == KNOB_ARCH_AVX2
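// AVX2 byte shuffles operate within 128-bit lanes, so place the in-lane
// components with one shuffle, swap the 128-bit halves with a permute,
// shuffle the cross-lane components into the gaps, and OR the results.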
simdscalari dst01 = _mm256_shuffle_epi8(src,
_mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
dst23 = _mm256_shuffle_epi8(dst23,
_mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
simdscalari dst = _mm256_or_si256(dst01, dst23);
_simd_store_si((simdscalari*)pDst, dst);
#endif
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 8_8 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
{
simdscalari src = _simd_load_si((const simdscalari*)pSrc);
#if KNOB_SIMD_WIDTH == 8
__m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg
__m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg
rg = _mm_unpacklo_epi8(rg, g);
_mm_store_si128((__m128i*)pDst, rg);
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose32_32_32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32_32
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalar src0 = _simd_load_ps((const float*)pSrc);
simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
__m128 vDst[8];
vTranspose4x8(vDst, src0, src1, src2, src3);
_mm_store_ps((float*)pDst, vDst[0]);
_mm_store_ps((float*)pDst+4, vDst[1]);
_mm_store_ps((float*)pDst+8, vDst[2]);
_mm_store_ps((float*)pDst+12, vDst[3]);
_mm_store_ps((float*)pDst+16, vDst[4]);
_mm_store_ps((float*)pDst+20, vDst[5]);
_mm_store_ps((float*)pDst+24, vDst[6]);
_mm_store_ps((float*)pDst+28, vDst[7]);
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose32_32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalar src0 = _simd_load_ps((const float*)pSrc);
simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
__m128 vDst[8];
vTranspose3x8(vDst, src0, src1, src2);
_mm_store_ps((float*)pDst, vDst[0]);
_mm_store_ps((float*)pDst + 4, vDst[1]);
_mm_store_ps((float*)pDst + 8, vDst[2]);
_mm_store_ps((float*)pDst + 12, vDst[3]);
_mm_store_ps((float*)pDst + 16, vDst[4]);
_mm_store_ps((float*)pDst + 20, vDst[5]);
_mm_store_ps((float*)pDst + 24, vDst[6]);
_mm_store_ps((float*)pDst + 28, vDst[7]);
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 32_32 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
{
const float* pfSrc = (const float*)pSrc;
__m128 src_r0 = _mm_load_ps(pfSrc + 0);
__m128 src_r1 = _mm_load_ps(pfSrc + 4);
__m128 src_g0 = _mm_load_ps(pfSrc + 8);
__m128 src_g1 = _mm_load_ps(pfSrc + 12);
__m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
__m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
__m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
__m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);
float* pfDst = (float*)pDst;
_mm_store_ps(pfDst + 0, dst0);
_mm_store_ps(pfDst + 4, dst1);
_mm_store_ps(pfDst + 8, dst2);
_mm_store_ps(pfDst + 12, dst3);
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose16_16_16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16_16
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
__m128i src_r = _mm256_extractf128_si256(src_rg, 0);
__m128i src_g = _mm256_extractf128_si256(src_rg, 1);
__m128i src_b = _mm256_extractf128_si256(src_ba, 0);
__m128i src_a = _mm256_extractf128_si256(src_ba, 1);
__m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
__m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
__m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
__m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
__m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
__m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
__m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
__m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
_mm_store_si128(((__m128i*)pDst) + 0, dst0);
_mm_store_si128(((__m128i*)pDst) + 1, dst1);
_mm_store_si128(((__m128i*)pDst) + 2, dst2);
_mm_store_si128(((__m128i*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose16_16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
__m128i src_r = _mm256_extractf128_si256(src_rg, 0);
__m128i src_g = _mm256_extractf128_si256(src_rg, 1);
__m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
__m128i src_a = _mm_undefined_si128();
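// 16_16_16 has no fourth component; the undefined register merely pads the
// fourth 16-bit lane of the AOS output, whose contents are never consumed.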
__m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
__m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
__m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
__m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
__m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
__m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
__m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
__m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
_mm_store_si128(((__m128i*)pDst) + 0, dst0);
_mm_store_si128(((__m128i*)pDst) + 1, dst1);
_mm_store_si128(((__m128i*)pDst) + 2, dst2);
_mm_store_si128(((__m128i*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 16_16 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
{
simdscalar src = _simd_load_ps((const float*)pSrc);
#if KNOB_SIMD_WIDTH == 8
__m128 comp0 = _mm256_castps256_ps128(src);
__m128 comp1 = _mm256_extractf128_ps(src, 1);
__m128i comp0i = _mm_castps_si128(comp0);
__m128i comp1i = _mm_castps_si128(comp1);
__m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
__m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);
_mm_store_si128((__m128i*)pDst, resLo);
_mm_store_si128((__m128i*)pDst + 1, resHi);
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose24_8
//////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 24_8 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose32_8_24
//////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose4_4_4_4
//////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose5_6_5
//////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose9_9_9_5
//////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose5_5_5_1
//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose10_10_10_2
//////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose11_11_10
//////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
};
// helper to unroll loops at compile time
template<int Begin, int End, int Step = 1>
struct UnrollerL {
template<typename Lambda>
INLINE static void step(Lambda& func) {
func(Begin);
UnrollerL<Begin + Step, End, Step>::step(func);
}
};
template<int End, int Step>
struct UnrollerL<End, End, Step> {
template<typename Lambda>
static void step(Lambda& func) {
}
};
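// Illustrative use: a compile-time unrolled copy of four elements
// (copyOne is a hypothetical lambda, named here only for the example):
// auto copyOne = [&](int i) { pDst[i] = pSrc[i]; };
// UnrollerL<0, 4>::step(copyOne);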
// general CRC compute
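// Uses the SSE4.2 CRC32-C instruction, consuming qwords on 64-bit targets
// and dwords elsewhere, then finishing any remainder byte by byte.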
INLINE
uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
{
#if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__)
uint32_t sizeInQwords = size / sizeof(uint64_t);
uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
uint64_t* pDataWords = (uint64_t*)pData;
for (uint32_t i = 0; i < sizeInQwords; ++i)
{
crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
}
#else
uint32_t sizeInDwords = size / sizeof(uint32_t);
uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
uint32_t* pDataWords = (uint32_t*)pData;
for (uint32_t i = 0; i < sizeInDwords; ++i)
{
crc = _mm_crc32_u32(crc, *pDataWords++);
}
#endif
BYTE* pRemainderBytes = (BYTE*)pDataWords;
for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
{
crc = _mm_crc32_u8(crc, *pRemainderBytes++);
}
return crc;
}
//////////////////////////////////////////////////////////////////////////
/// Add byte offset to any-type pointer
//////////////////////////////////////////////////////////////////////////
template <typename T>
INLINE
static T* PtrAdd(T* p, intptr_t offset)
{
intptr_t intp = reinterpret_cast<intptr_t>(p);
return reinterpret_cast<T*>(intp + offset);
}
//////////////////////////////////////////////////////////////////////////
/// Is a power-of-2?
//////////////////////////////////////////////////////////////////////////
template <typename T>
INLINE
static bool IsPow2(T value)
{
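// In two's complement, (0 - value) isolates the lowest set bit, so the
// comparison holds only when at most one bit is set: 8 & -8 == 8, but
// 12 & -12 == 4 != 12. Note that 0 also passes this test.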
return value == (value & (0 - value));
}
//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignDownPow2(T1 value, T2 alignment)
{
SWR_ASSERT(IsPow2(alignment));
return value & ~T1(alignment - 1);
}
//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignUpPow2(T1 value, T2 alignment)
{
return AlignDownPow2(value + T1(alignment - 1), alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Align up ptr to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1* AlignUpPow2(T1* value, T2 alignment)
{
return reinterpret_cast<T1*>(
AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
}
//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignDown(T1 value, T2 alignment)
{
if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
return value - T1(value % alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1* AlignDown(T1* value, T2 alignment)
{
return (T1*)AlignDown(uintptr_t(value), alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignUp(T1 value, T2 alignment)
{
return AlignDown(value + T1(alignment - 1), alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1* AlignUp(T1* value, T2 alignment)
{
return AlignDown(PtrAdd(value, alignment - 1), alignment);
}
//////////////////////////////////////////////////////////////////////////
/// Helper structure used to access an array of elements that don't
/// correspond to a typical word size.
//////////////////////////////////////////////////////////////////////////
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1;
static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
"Element size must an integral fraction of pointer size");
size_t m_words[NUM_WORDS] = {};
public:
T operator[] (size_t elementIndex) const
{
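// Locate the word holding the element, shift its bits down to the low
// end, and mask off the BitsPerElementT-wide payload.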
size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
return T(word & ELEMENT_MASK);
}
};

View File

@ -0,0 +1,313 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file JitManager.cpp
*
* @brief Implementation of the JIT Manager.
*
* Notes:
*
******************************************************************************/
#if defined(_WIN32)
#pragma warning(disable: 4800 4146 4244 4267 4355 4996)
#endif
#include "jit_api.h"
#include "JitManager.h"
#include "fetch_jit.h"
#if defined(_WIN32)
#include "llvm/ADT/Triple.h"
#endif
#include "llvm/IR/Function.h"
#include "llvm/Support/DynamicLibrary.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Analysis/CFGPrinter.h"
#include "llvm/IRReader/IRReader.h"
#include "core/state.h"
#include "common/containers.hpp"
#include "state_llvm.h"
#include <sstream>
#if defined(_WIN32)
#include <psapi.h>
#include <cstring>
#define INTEL_OUTPUT_DIR "c:\\Intel"
#define SWR_OUTPUT_DIR INTEL_OUTPUT_DIR "\\SWR"
#define JITTER_OUTPUT_DIR SWR_OUTPUT_DIR "\\Jitter"
#endif
using namespace llvm;
//////////////////////////////////////////////////////////////////////////
/// @brief Constructor for JitManager.
/// @param simdWidth - SIMD width to be used in generated program.
JitManager::JitManager(uint32_t simdWidth, const char *arch)
: mContext(), mBuilder(mContext), mIsModuleFinalized(true), mJitNumber(0), mVWidth(simdWidth), mArch(arch)
{
InitializeNativeTarget();
InitializeNativeTargetAsmPrinter();
InitializeNativeTargetDisassembler();
TargetOptions tOpts;
tOpts.AllowFPOpFusion = FPOpFusion::Fast;
tOpts.NoInfsFPMath = false;
tOpts.NoNaNsFPMath = false;
tOpts.UnsafeFPMath = true;
#if defined(_DEBUG)
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 7
tOpts.NoFramePointerElim = true;
#endif
#endif
//tOpts.PrintMachineCode = true;
std::stringstream fnName("JitModule", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
fnName << mJitNumber++;
std::unique_ptr<Module> newModule(new Module(fnName.str(), mContext));
mpCurrentModule = newModule.get();
auto &&EB = EngineBuilder(std::move(newModule));
EB.setTargetOptions(tOpts);
EB.setOptLevel(CodeGenOpt::Aggressive);
StringRef hostCPUName;
// force JIT to use the same CPU arch as the rest of swr
if(mArch.AVX512F())
{
assert(0 && "Implement AVX512 jitter");
hostCPUName = sys::getHostCPUName();
if (mVWidth == 0)
{
mVWidth = 16;
}
}
else if(mArch.AVX2())
{
hostCPUName = StringRef("core-avx2");
if (mVWidth == 0)
{
mVWidth = 8;
}
}
else if(mArch.AVX())
{
if (mArch.F16C())
{
hostCPUName = StringRef("core-avx-i");
}
else
{
hostCPUName = StringRef("corei7-avx");
}
if (mVWidth == 0)
{
mVWidth = 8;
}
}
else
{
hostCPUName = sys::getHostCPUName();
if (mVWidth == 0)
{
mVWidth = 8; // 4?
}
}
EB.setMCPU(hostCPUName);
#if defined(_WIN32)
// Needed for MCJIT on windows
Triple hostTriple(sys::getProcessTriple());
hostTriple.setObjectFormat(Triple::ELF);
mpCurrentModule->setTargetTriple(hostTriple.getTriple());
#endif // _WIN32
mpExec = EB.create();
#if LLVM_USE_INTEL_JITEVENTS
JITEventListener *vTune = JITEventListener::createIntelJITEventListener();
mpExec->RegisterJITEventListener(vTune);
#endif
mFP32Ty = Type::getFloatTy(mContext); // float type
mInt8Ty = Type::getInt8Ty(mContext);
mInt32Ty = Type::getInt32Ty(mContext); // int type
mInt64Ty = Type::getInt64Ty(mContext); // int type
mV4FP32Ty = StructType::get(mContext, std::vector<Type*>(4, mFP32Ty), false); // vector4 float type (represented as structure)
mV4Int32Ty = StructType::get(mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type
// fetch function signature
// typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out);
std::vector<Type*> fsArgs;
fsArgs.push_back(PointerType::get(Gen_SWR_FETCH_CONTEXT(this), 0));
fsArgs.push_back(PointerType::get(Gen_simdvertex(this), 0));
mFetchShaderTy = FunctionType::get(Type::getVoidTy(mContext), fsArgs, false);
mSimtFP32Ty = VectorType::get(mFP32Ty, mVWidth);
mSimtInt32Ty = VectorType::get(mInt32Ty, mVWidth);
mSimdVectorTy = StructType::get(mContext, std::vector<Type*>(4, mSimtFP32Ty), false);
mSimdVectorInt32Ty = StructType::get(mContext, std::vector<Type*>(4, mSimtInt32Ty), false);
#if defined(_WIN32)
// explicitly register used symbols from potentially statically linked libs
sys::DynamicLibrary::AddSymbol("exp2f", &exp2f);
sys::DynamicLibrary::AddSymbol("log2f", &log2f);
sys::DynamicLibrary::AddSymbol("sinf", &sinf);
sys::DynamicLibrary::AddSymbol("cosf", &cosf);
sys::DynamicLibrary::AddSymbol("powf", &powf);
#endif
#if defined(_WIN32)
if (KNOB_DUMP_SHADER_IR)
{
CreateDirectory(INTEL_OUTPUT_DIR, NULL);
CreateDirectory(SWR_OUTPUT_DIR, NULL);
CreateDirectory(JITTER_OUTPUT_DIR, NULL);
}
///@todo Figure out a better solution for this.
// Redirect stdin, stdout, and stderr to attached console.
freopen("CONIN$", "r", stdin);
freopen("CONOUT$", "w", stdout);
freopen("CONOUT$", "w", stderr);
#endif
}
//////////////////////////////////////////////////////////////////////////
/// @brief Create new LLVM module.
void JitManager::SetupNewModule()
{
SWR_ASSERT(mIsModuleFinalized == true && "Current module is not finalized!");
std::stringstream fnName("JitModule", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
fnName << mJitNumber++;
std::unique_ptr<Module> newModule(new Module(fnName.str(), mContext));
mpCurrentModule = newModule.get();
#if defined(_WIN32)
// Needed for MCJIT on windows
Triple hostTriple(sys::getProcessTriple());
hostTriple.setObjectFormat(Triple::ELF);
newModule->setTargetTriple(hostTriple.getTriple());
#endif // _WIN32
mpExec->addModule(std::move(newModule));
mIsModuleFinalized = false;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Create new LLVM module from IR.
bool JitManager::SetupModuleFromIR(const uint8_t *pIR)
{
std::unique_ptr<MemoryBuffer> pMem = MemoryBuffer::getMemBuffer(StringRef((const char*)pIR), "");
SMDiagnostic Err;
std::unique_ptr<Module> newModule = parseIR(pMem.get()->getMemBufferRef(), Err, mContext);
if (newModule == nullptr)
{
SWR_ASSERT(0, "Parse failed! Check Err for details.");
return false;
}
mpCurrentModule = newModule.get();
#if defined(_WIN32)
// Needed for MCJIT on windows
Triple hostTriple(sys::getProcessTriple());
hostTriple.setObjectFormat(Triple::ELF);
newModule->setTargetTriple(hostTriple.getTriple());
#endif // _WIN32
mpExec->addModule(std::move(newModule));
mIsModuleFinalized = false;
return true;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Dump function to file.
void JitManager::DumpToFile(Function *f, const char *fileName)
{
if (KNOB_DUMP_SHADER_IR)
{
#if defined(_WIN32)
DWORD pid = GetCurrentProcessId();
TCHAR procname[MAX_PATH];
GetModuleFileName(NULL, procname, MAX_PATH);
const char* pBaseName = strrchr(procname, '\\');
std::stringstream outDir;
outDir << JITTER_OUTPUT_DIR << pBaseName << "_" << pid << std::ends;
CreateDirectory(outDir.str().c_str(), NULL);
#endif
std::error_code EC;
const char *funcName = f->getName().data();
char fName[256];
#if defined(_WIN32)
sprintf(fName, "%s\\%s.%s.ll", outDir.str().c_str(), funcName, fileName);
#else
sprintf(fName, "%s.%s.ll", funcName, fileName);
#endif
raw_fd_ostream fd(fName, EC, llvm::sys::fs::F_None);
Module* pModule = f->getParent();
pModule->print(fd, nullptr);
#if defined(_WIN32)
sprintf(fName, "%s\\cfg.%s.%s.dot", outDir.str().c_str(), funcName, fileName);
#else
sprintf(fName, "cfg.%s.%s.dot", funcName, fileName);
#endif
fd.flush();
raw_fd_ostream fd_cfg(fName, EC, llvm::sys::fs::F_Text);
WriteGraph(fd_cfg, (const Function*)f);
fd_cfg.flush();
}
}
extern "C"
{
//////////////////////////////////////////////////////////////////////////
/// @brief Create JIT context.
/// @param simdWidth - SIMD width to be used in generated program.
HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char* arch)
{
return new JitManager(targetSimdWidth, arch);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Destroy JIT context.
void JITCALL JitDestroyContext(HANDLE hJitContext)
{
delete reinterpret_cast<JitManager*>(hJitContext);
}
}

View File

@ -0,0 +1,186 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file JitManager.h
*
* @brief JitManager contains the LLVM data structures used for JIT generation
*
* Notes:
*
******************************************************************************/
#pragma once
#include "common/os.h"
#include "common/isa.hpp"
#if defined(_WIN32)
#pragma warning(disable : 4146 4244 4267 4800 4996)
#endif
// llvm 3.7+ reuses "DEBUG" as an enum value
#pragma push_macro("DEBUG")
#undef DEBUG
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Config/llvm-config.h"
#ifndef LLVM_VERSION_MAJOR
#include "llvm/Config/config.h"
#endif
#include "llvm/IR/Verifier.h"
#include "llvm/ExecutionEngine/MCJIT.h"
#include "llvm/Support/FileSystem.h"
#define LLVM_F_NONE sys::fs::F_None
#include "llvm/Analysis/Passes.h"
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
#include "llvm/PassManager.h"
#else
#include "llvm/IR/LegacyPassManager.h"
using namespace llvm::legacy;
#endif
#include "llvm/CodeGen/Passes.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Support/Host.h"
#pragma pop_macro("DEBUG")
using namespace llvm;
//////////////////////////////////////////////////////////////////////////
/// JitInstructionSet
/// @brief Subclass of InstructionSet that allows users to override
/// the reporting of support for certain ISA features. This allows capping
/// the jitted code to a certain feature level, e.g. jitting AVX-level code on
/// a platform that supports AVX2.
//////////////////////////////////////////////////////////////////////////
class JitInstructionSet : public InstructionSet
{
public:
JitInstructionSet(const char* requestedIsa) : isaRequest(requestedIsa)
{
std::transform(isaRequest.begin(), isaRequest.end(), isaRequest.begin(), ::tolower);
if(isaRequest == "avx")
{
bForceAVX = true;
bForceAVX2 = false;
bForceAVX512 = false;
}
else if(isaRequest == "avx2")
{
bForceAVX = false;
bForceAVX2 = true;
bForceAVX512 = false;
}
#if 0
else if(isaRequest == "avx512")
{
bForceAVX = false;
bForceAVX2 = false;
bForceAVX512 = true;
}
#endif
};
bool AVX2(void) { return bForceAVX ? 0 : InstructionSet::AVX2(); }
bool AVX512F(void) { return (bForceAVX | bForceAVX2) ? 0 : InstructionSet::AVX512F(); }
bool BMI2(void) { return bForceAVX ? 0 : InstructionSet::BMI2(); }
private:
bool bForceAVX = false;
bool bForceAVX2 = false;
bool bForceAVX512 = false;
std::string isaRequest;
};
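// Illustrative use: JitInstructionSet("avx") reports AVX2()/AVX512F() as
// false even on hardware that supports them, capping codegen at AVX.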
struct JitLLVMContext : LLVMContext
{
};
//////////////////////////////////////////////////////////////////////////
/// JitManager
//////////////////////////////////////////////////////////////////////////
struct JitManager
{
JitManager(uint32_t w, const char *arch);
~JitManager(){};
JitLLVMContext mContext; ///< LLVM compiler
IRBuilder<> mBuilder; ///< LLVM IR Builder
ExecutionEngine* mpExec;
// Need to be rebuilt after a JIT and before building new IR
Module* mpCurrentModule;
bool mIsModuleFinalized;
uint32_t mJitNumber;
uint32_t mVWidth;
// Built in types.
Type* mInt8Ty;
Type* mInt32Ty;
Type* mInt64Ty;
Type* mFP32Ty;
StructType* mV4FP32Ty;
StructType* mV4Int32Ty;
// helper scalar function types
FunctionType* mUnaryFPTy;
FunctionType* mBinaryFPTy;
FunctionType* mTrinaryFPTy;
FunctionType* mUnaryIntTy;
FunctionType* mBinaryIntTy;
FunctionType* mTrinaryIntTy;
Type* mSimtFP32Ty;
Type* mSimtInt32Ty;
Type* mSimdVectorInt32Ty;
Type* mSimdVectorTy;
// fetch shader types
FunctionType* mFetchShaderTy;
JitInstructionSet mArch;
void SetupNewModule();
bool SetupModuleFromIR(const uint8_t *pIR);
static void DumpToFile(Function *f, const char *fileName);
};

View File

@ -0,0 +1,772 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file blend_jit.cpp
*
* @brief Implementation of the blend jitter
*
* Notes:
*
******************************************************************************/
#include "jit_api.h"
#include "blend_jit.h"
#include "builder.h"
#include "state_llvm.h"
#include "common/containers.hpp"
#include "llvm/IR/DataLayout.h"
#include <sstream>
// components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized
#define QUANTIZE_THRESHOLD 2
//////////////////////////////////////////////////////////////////////////
/// Interface to Jitting a blend shader
//////////////////////////////////////////////////////////////////////////
struct BlendJit : public Builder
{
BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){};
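// Emits IR computing the four-channel blend factor for 'factor'. The
// Color/Alpha template flags select whether the RGB channels, the alpha
// channel, or both are written to 'result', so separate color and alpha
// blend factors can share one implementation.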
template<bool Color, bool Alpha>
void GenerateBlendFactor(SWR_BLEND_FACTOR factor, Value* constColor[4], Value* src[4], Value* src1[4], Value* dst[4], Value* result[4])
{
Value* out[4];
switch (factor)
{
case BLENDFACTOR_ONE:
out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f);
break;
case BLENDFACTOR_SRC_COLOR:
out[0] = src[0];
out[1] = src[1];
out[2] = src[2];
out[3] = src[3];
break;
case BLENDFACTOR_SRC_ALPHA:
out[0] = out[1] = out[2] = out[3] = src[3];
break;
case BLENDFACTOR_DST_ALPHA:
out[0] = out[1] = out[2] = out[3] = dst[3];
break;
case BLENDFACTOR_DST_COLOR:
out[0] = dst[0];
out[1] = dst[1];
out[2] = dst[2];
out[3] = dst[3];
break;
case BLENDFACTOR_SRC_ALPHA_SATURATE:
out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3]));
out[3] = VIMMED1(1.0f);
break;
case BLENDFACTOR_CONST_COLOR:
out[0] = constColor[0];
out[1] = constColor[1];
out[2] = constColor[2];
out[3] = constColor[3];
break;
case BLENDFACTOR_CONST_ALPHA:
out[0] = out[1] = out[2] = out[3] = constColor[3];
break;
case BLENDFACTOR_SRC1_COLOR:
out[0] = src1[0];
out[1] = src1[1];
out[2] = src1[2];
out[3] = src1[3];
break;
case BLENDFACTOR_SRC1_ALPHA:
out[0] = out[1] = out[2] = out[3] = src1[3];
break;
case BLENDFACTOR_ZERO:
out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
break;
case BLENDFACTOR_INV_SRC_COLOR:
out[0] = FSUB(VIMMED1(1.0f), src[0]);
out[1] = FSUB(VIMMED1(1.0f), src[1]);
out[2] = FSUB(VIMMED1(1.0f), src[2]);
out[3] = FSUB(VIMMED1(1.0f), src[3]);
break;
case BLENDFACTOR_INV_SRC_ALPHA:
out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]);
break;
case BLENDFACTOR_INV_DST_ALPHA:
out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]);
break;
case BLENDFACTOR_INV_DST_COLOR:
out[0] = FSUB(VIMMED1(1.0f), dst[0]);
out[1] = FSUB(VIMMED1(1.0f), dst[1]);
out[2] = FSUB(VIMMED1(1.0f), dst[2]);
out[3] = FSUB(VIMMED1(1.0f), dst[3]);
break;
case BLENDFACTOR_INV_CONST_COLOR:
out[0] = FSUB(VIMMED1(1.0f), constColor[0]);
out[1] = FSUB(VIMMED1(1.0f), constColor[1]);
out[2] = FSUB(VIMMED1(1.0f), constColor[2]);
out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
break;
case BLENDFACTOR_INV_CONST_ALPHA:
out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
break;
case BLENDFACTOR_INV_SRC1_COLOR:
out[0] = FSUB(VIMMED1(1.0f), src1[0]);
out[1] = FSUB(VIMMED1(1.0f), src1[1]);
out[2] = FSUB(VIMMED1(1.0f), src1[2]);
out[3] = FSUB(VIMMED1(1.0f), src1[3]);
break;
case BLENDFACTOR_INV_SRC1_ALPHA:
out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]);
break;
default:
SWR_ASSERT(false, "Unsupported blend factor: %d", factor);
out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
break;
}
if (Color)
{
result[0] = out[0];
result[1] = out[1];
result[2] = out[2];
}
if (Alpha)
{
result[3] = out[3];
}
}
void Clamp(SWR_FORMAT format, Value* src[4])
{
const SWR_FORMAT_INFO& info = GetFormatInfo(format);
SWR_TYPE type = info.type[0];
switch (type)
{
case SWR_TYPE_FLOAT:
break;
case SWR_TYPE_UNORM:
src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f));
src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f));
src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f));
src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f));
break;
case SWR_TYPE_SNORM:
src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f));
src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f));
src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f));
src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f));
break;
default: SWR_ASSERT(false, "Unsupport format type: %d", type);
}
}
void ApplyDefaults(SWR_FORMAT format, Value* src[4])
{
const SWR_FORMAT_INFO& info = GetFormatInfo(format);
bool valid[] = { false, false, false, false };
for (uint32_t c = 0; c < info.numComps; ++c)
{
valid[info.swizzle[c]] = true;
}
for (uint32_t c = 0; c < 4; ++c)
{
if (!valid[c])
{
src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty);
}
}
}
void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4])
{
const SWR_FORMAT_INFO& info = GetFormatInfo(format);
for (uint32_t c = 0; c < info.numComps; ++c)
{
if (info.type[c] == SWR_TYPE_UNUSED)
{
src[info.swizzle[c]] = BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty);
}
}
}
void Quantize(SWR_FORMAT format, Value* src[4])
{
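// Simulate low-precision storage for narrow UNORM channels: scale to the
// quantized range, round toward zero, and rescale. E.g. for a 2-bit
// channel, factor = 3 and 0.4 -> 0.4*3 + 0.5 = 1.7 -> 1.0 -> 1/3.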
const SWR_FORMAT_INFO& info = GetFormatInfo(format);
for (uint32_t c = 0; c < info.numComps; ++c)
{
if (info.bpc[c] <= QUANTIZE_THRESHOLD)
{
uint32_t swizComp = info.swizzle[c];
float factor = (float)((1 << info.bpc[c]) - 1);
switch (info.type[c])
{
case SWR_TYPE_UNORM:
src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f));
src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO));
src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f / factor));
break;
default: SWR_ASSERT(false, "Unsupported format type: %d", info.type[c]);
}
}
}
}
template<bool Color, bool Alpha>
void BlendFunc(SWR_BLEND_OP blendOp, Value* src[4], Value* srcFactor[4], Value* dst[4], Value* dstFactor[4], Value* result[4])
{
Value* out[4];
Value* srcBlend[4];
Value* dstBlend[4];
for (uint32_t i = 0; i < 4; ++i)
{
srcBlend[i] = FMUL(src[i], srcFactor[i]);
dstBlend[i] = FMUL(dst[i], dstFactor[i]);
}
switch (blendOp)
{
case BLENDOP_ADD:
out[0] = FADD(srcBlend[0], dstBlend[0]);
out[1] = FADD(srcBlend[1], dstBlend[1]);
out[2] = FADD(srcBlend[2], dstBlend[2]);
out[3] = FADD(srcBlend[3], dstBlend[3]);
break;
case BLENDOP_SUBTRACT:
out[0] = FSUB(srcBlend[0], dstBlend[0]);
out[1] = FSUB(srcBlend[1], dstBlend[1]);
out[2] = FSUB(srcBlend[2], dstBlend[2]);
out[3] = FSUB(srcBlend[3], dstBlend[3]);
break;
case BLENDOP_REVSUBTRACT:
out[0] = FSUB(dstBlend[0], srcBlend[0]);
out[1] = FSUB(dstBlend[1], srcBlend[1]);
out[2] = FSUB(dstBlend[2], srcBlend[2]);
out[3] = FSUB(dstBlend[3], srcBlend[3]);
break;
case BLENDOP_MIN:
out[0] = VMINPS(src[0], dst[0]);
out[1] = VMINPS(src[1], dst[1]);
out[2] = VMINPS(src[2], dst[2]);
out[3] = VMINPS(src[3], dst[3]);
break;
case BLENDOP_MAX:
out[0] = VMAXPS(src[0], dst[0]);
out[1] = VMAXPS(src[1], dst[1]);
out[2] = VMAXPS(src[2], dst[2]);
out[3] = VMAXPS(src[3], dst[3]);
break;
default:
SWR_ASSERT(false, "Unsupported blend operation: %d", blendOp);
out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
break;
}
if (Color)
{
result[0] = out[0];
result[1] = out[1];
result[2] = out[2];
}
if (Alpha)
{
result[3] = out[3];
}
}
void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4])
{
// Op: (s = PS output, d = RT contents)
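// NOT is expressed as XOR with all-ones throughout; per the todo notes
// below, the builder has no ANDN helper yet, hence the inverted-operand
// AND sequences.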
switch(logicOp)
{
case LOGICOP_CLEAR:
result[0] = VIMMED1(0);
result[1] = VIMMED1(0);
result[2] = VIMMED1(0);
result[3] = VIMMED1(0);
break;
case LOGICOP_NOR:
// ~(s | d)
result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
break;
case LOGICOP_AND_INVERTED:
// ~s & d
// todo: use avx andnot instr when I can find the intrinsic to call
result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
break;
case LOGICOP_COPY_INVERTED:
// ~s
result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF));
result[1] = XOR(src[1], VIMMED1(0xFFFFFFFF));
result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF));
result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF));
break;
case LOGICOP_AND_REVERSE:
// s & ~d
// todo: use avx andnot instr when I can find the intrinsic to call
result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
break;
case LOGICOP_INVERT:
// ~d
result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF));
result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF));
result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF));
result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF));
break;
case LOGICOP_XOR:
// s ^ d
result[0] = XOR(src[0], dst[0]);
result[1] = XOR(src[1], dst[1]);
result[2] = XOR(src[2], dst[2]);
result[3] = XOR(src[3], dst[3]);
break;
case LOGICOP_NAND:
// ~(s & d)
result[0] = XOR(AND(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
break;
case LOGICOP_AND:
// s & d
result[0] = AND(src[0], dst[0]);
result[1] = AND(src[1], dst[1]);
result[2] = AND(src[2], dst[2]);
result[3] = AND(src[3], dst[3]);
break;
case LOGICOP_EQUIV:
// ~(s ^ d)
result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
break;
case LOGICOP_NOOP:
result[0] = dst[0];
result[1] = dst[1];
result[2] = dst[2];
result[3] = dst[3];
break;
case LOGICOP_OR_INVERTED:
// ~s | d
result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
result[2] = OR(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
break;
case LOGICOP_COPY:
result[0] = src[0];
result[1] = src[1];
result[2] = src[2];
result[3] = src[3];
break;
case LOGICOP_OR_REVERSE:
// s | ~d
result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
break;
case LOGICOP_OR:
// s | d
result[0] = OR(src[0], dst[0]);
result[1] = OR(src[1], dst[1]);
result[2] = OR(src[2], dst[2]);
result[3] = OR(src[3], dst[3]);
break;
case LOGICOP_SET:
result[0] = VIMMED1(0xFFFFFFFF);
result[1] = VIMMED1(0xFFFFFFFF);
result[2] = VIMMED1(0xFFFFFFFF);
result[3] = VIMMED1(0xFFFFFFFF);
break;
default:
SWR_ASSERT(false, "Unsupported logic operation: %d", logicOp);
result[0] = result[1] = result[2] = result[3] = VIMMED1(0.0f);
break;
}
}
void AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* pAlpha, Value* ppMask)
{
// load uint32_t reference
Value* pRef = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_alphaTestReference }));
Value* pTest = nullptr;
if (state.alphaTestFormat == ALPHA_TEST_UNORM8)
{
// convert float alpha to unorm8
Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f));
pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty);
// compare
switch (state.alphaTestFunction)
{
case ZFUNC_ALWAYS: pTest = VIMMED1(true); break;
case ZFUNC_NEVER: pTest = VIMMED1(false); break;
case ZFUNC_LT: pTest = ICMP_ULT(pAlphaU8, pRef); break;
case ZFUNC_EQ: pTest = ICMP_EQ(pAlphaU8, pRef); break;
case ZFUNC_LE: pTest = ICMP_ULE(pAlphaU8, pRef); break;
case ZFUNC_GT: pTest = ICMP_UGT(pAlphaU8, pRef); break;
case ZFUNC_NE: pTest = ICMP_NE(pAlphaU8, pRef); break;
case ZFUNC_GE: pTest = ICMP_UGE(pAlphaU8, pRef); break;
default:
SWR_ASSERT(false, "Invalid alpha test function");
break;
}
}
else
{
// cast ref to float
pRef = BITCAST(pRef, mSimdFP32Ty);
// compare
switch (state.alphaTestFunction)
{
case ZFUNC_ALWAYS: pTest = VIMMED1(true); break;
case ZFUNC_NEVER: pTest = VIMMED1(false); break;
case ZFUNC_LT: pTest = FCMP_OLT(pAlpha, pRef); break;
case ZFUNC_EQ: pTest = FCMP_OEQ(pAlpha, pRef); break;
case ZFUNC_LE: pTest = FCMP_OLE(pAlpha, pRef); break;
case ZFUNC_GT: pTest = FCMP_OGT(pAlpha, pRef); break;
case ZFUNC_NE: pTest = FCMP_ONE(pAlpha, pRef); break;
case ZFUNC_GE: pTest = FCMP_OGE(pAlpha, pRef); break;
default:
SWR_ASSERT(false, "Invalid alpha test function");
break;
}
}
// load current mask
Value* pMask = LOAD(ppMask);
// convert to int1 mask
pMask = MASK(pMask);
// and with alpha test result
pMask = AND(pMask, pTest);
// convert back to vector mask
pMask = VMASK(pMask);
// store new mask
STORE(pMask, ppMask);
}
Function* Create(const BLEND_COMPILE_STATE& state)
{
static std::size_t jitNum = 0;
std::stringstream fnName("BlendShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
fnName << jitNum++;
// blend function signature
//typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*);
std::vector<Type*> args{
PointerType::get(Gen_SWR_BLEND_STATE(JM()), 0), // SWR_BLEND_STATE*
PointerType::get(mSimdFP32Ty, 0), // simdvector& src
PointerType::get(mSimdFP32Ty, 0), // simdvector& src1
Type::getInt32Ty(JM()->mContext), // sampleNum
PointerType::get(mSimdFP32Ty, 0), // BYTE* pDst (hot-tile data, read as simd floats)
PointerType::get(mSimdFP32Ty, 0), // simdvector& result
PointerType::get(mSimdInt32Ty, 0), // simdscalari* oMask
PointerType::get(mSimdInt32Ty, 0), // simdscalari* pMask
};
FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
Function* blendFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc);
IRB()->SetInsertPoint(entry);
// arguments
auto argitr = blendFunc->getArgumentList().begin();
Value* pBlendState = &*argitr++;
pBlendState->setName("pBlendState");
Value* pSrc = &*argitr++;
pSrc->setName("src");
Value* pSrc1 = &*argitr++;
pSrc1->setName("src1");
Value* sampleNum = &*argitr++;
sampleNum->setName("sampleNum");
Value* pDst = &*argitr++;
pDst->setName("pDst");
Value* pResult = &*argitr++;
pResult->setName("result");
Value* ppoMask = &*argitr++;
ppoMask->setName("ppoMask");
Value* ppMask = &*argitr++;
ppMask->setName("pMask");
static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
Value* dst[4];
Value* constantColor[4];
Value* src[4];
Value* src1[4];
Value* result[4];
for (uint32_t i = 0; i < 4; ++i)
{
// load hot tile
dst[i] = LOAD(pDst, { i });
// load constant color
constantColor[i] = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_constantColor, i }));
// load src
src[i] = LOAD(pSrc, { i });
// load src1
src1[i] = LOAD(pSrc1, { i });
}
Value* currentMask = VIMMED1(-1);
if(state.desc.alphaToCoverageEnable)
{
currentMask = FP_TO_SI(FMUL(src[3], VBROADCAST(C((float)state.desc.numSamples))), mSimdInt32Ty);
}
// alpha test
if (state.desc.alphaTestEnable)
{
AlphaTest(state, pBlendState, src[3], ppMask);
}
// color blend
if (state.blendState.blendEnable)
{
// clamp sources
Clamp(state.format, src);
Clamp(state.format, src1);
Clamp(state.format, dst);
Clamp(state.format, constantColor);
// apply defaults to hot-tile contents to account for missing components
ApplyDefaults(state.format, dst);
// Force defaults for unused 'X' components
ApplyUnusedDefaults(state.format, dst);
// Quantize low precision components
Quantize(state.format, dst);
// special case clamping for R11G11B10_float which has no sign bit
if (state.format == R11G11B10_FLOAT)
{
dst[0] = VMAXPS(dst[0], VIMMED1(0.0f));
dst[1] = VMAXPS(dst[1], VIMMED1(0.0f));
dst[2] = VMAXPS(dst[2], VIMMED1(0.0f));
dst[3] = VMAXPS(dst[3], VIMMED1(0.0f));
}
Value* srcFactor[4];
Value* dstFactor[4];
if (state.desc.independentAlphaBlendEnable)
{
GenerateBlendFactor<true, false>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor, constantColor, src, src1, dst, srcFactor);
GenerateBlendFactor<true, false>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor, constantColor, src, src1, dst, dstFactor);
BlendFunc<true, false>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
BlendFunc<false, true>(state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
}
else
{
GenerateBlendFactor<true, true>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
GenerateBlendFactor<true, true>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
BlendFunc<true, true>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
}
// store results out
for (uint32_t i = 0; i < 4; ++i)
{
STORE(result[i], pResult, { i });
}
}
if(state.blendState.logicOpEnable)
{
const SWR_FORMAT_INFO& info = GetFormatInfo(state.format);
SWR_ASSERT(info.type[0] == SWR_TYPE_UINT);
Value* vMask[4];
for(uint32_t i = 0; i < 4; i++)
{
switch(info.bpc[i])
{
case 0: vMask[i] = VIMMED1(0x00000000); break;
case 2: vMask[i] = VIMMED1(0x00000003); break;
case 5: vMask[i] = VIMMED1(0x0000001F); break;
case 6: vMask[i] = VIMMED1(0x0000003F); break;
case 8: vMask[i] = VIMMED1(0x000000FF); break;
case 10: vMask[i] = VIMMED1(0x000003FF); break;
case 11: vMask[i] = VIMMED1(0x000007FF); break;
case 16: vMask[i] = VIMMED1(0x0000FFFF); break;
case 24: vMask[i] = VIMMED1(0x00FFFFFF); break;
case 32: vMask[i] = VIMMED1(0xFFFFFFFF); break;
default:
vMask[i] = VIMMED1(0x0);
SWR_ASSERT(0, "Unsupported bpc for logic op\n");
break;
}
src[i] = BITCAST(src[i], mSimdInt32Ty);
dst[i] = BITCAST(dst[i], mSimdInt32Ty);
}
LogicOpFunc(state.blendState.logicOpFunc, src, dst, result);
// store results out
for(uint32_t i = 0; i < 4; ++i)
{
// clear upper bits from PS output not in RT format after doing logic op
result[i] = AND(result[i], vMask[i]);
STORE(BITCAST(result[i], mSimdFP32Ty), pResult, {i});
}
}
if(state.desc.oMaskEnable)
{
assert(!(state.desc.alphaToCoverageEnable));
// load current mask
Value* oMask = LOAD(ppoMask);
Value* sampleMasked = VBROADCAST(SHL(C(1), sampleNum));
oMask = AND(oMask, sampleMasked);
currentMask = AND(oMask, currentMask);
}
if(state.desc.sampleMaskEnable)
{
Value* sampleMask = LOAD(pBlendState, { 0, SWR_BLEND_STATE_sampleMask});
Value* sampleMasked = SHL(C(1), sampleNum);
sampleMask = AND(sampleMask, sampleMasked);
sampleMask = VBROADCAST(ICMP_SGT(sampleMask, C(0)));
sampleMask = S_EXT(sampleMask, mSimdInt32Ty);
currentMask = AND(sampleMask, currentMask);
}
if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
state.desc.oMaskEnable)
{
// load current mask
Value* pMask = LOAD(ppMask);
currentMask = S_EXT(ICMP_SGT(currentMask, VBROADCAST(C(0))), mSimdInt32Ty);
Value* outputMask = AND(pMask, currentMask);
// store new mask
STORE(outputMask, GEP(ppMask, C(0)));
}
RET_VOID();
JitManager::DumpToFile(blendFunc, "");
FunctionPassManager passes(JM()->mpCurrentModule);
passes.add(createBreakCriticalEdgesPass());
passes.add(createCFGSimplificationPass());
passes.add(createEarlyCSEPass());
passes.add(createPromoteMemoryToRegisterPass());
passes.add(createCFGSimplificationPass());
passes.add(createEarlyCSEPass());
passes.add(createInstructionCombiningPass());
passes.add(createInstructionSimplifierPass());
passes.add(createConstantPropagationPass());
passes.add(createSCCPPass());
passes.add(createAggressiveDCEPass());
passes.run(*blendFunc);
JitManager::DumpToFile(blendFunc, "optimized");
return blendFunc;
}
};
//////////////////////////////////////////////////////////////////////////
/// @brief JITs the blend shader from its LLVM IR
/// @param hJitMgr - JitManager handle
/// @param hFunc - LLVM function IR handle
/// @return PFN_BLEND_JIT_FUNC - pointer to blend function
PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc)
{
const llvm::Function *func = (const llvm::Function*)hFunc;
JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
PFN_BLEND_JIT_FUNC pfnBlend;
pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
// MCJIT finalizes modules the first time code is jitted from them; once finalized, no new IR can be added to the module
pJitMgr->mIsModuleFinalized = true;
return pfnBlend;
}
//////////////////////////////////////////////////////////////////////////
/// @brief JIT compiles blend shader
/// @param hJitMgr - JitManager handle
/// @param state - blend state to build function from
extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, const BLEND_COMPILE_STATE& state)
{
JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
pJitMgr->SetupNewModule();
BlendJit theJit(pJitMgr);
HANDLE hFunc = theJit.Create(state);
return JitBlendFunc(hJitMgr, hFunc);
}

View File

@ -0,0 +1,93 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file blend_jit.h
*
* @brief Definition of the blend jitter
*
* Notes:
*
******************************************************************************/
#pragma once
#include "common/formats.h"
#include "core/context.h"
#include "core/state.h"
struct RENDER_TARGET_BLEND_COMPILE_STATE
{
bool blendEnable;
bool logicOpEnable;
SWR_BLEND_FACTOR sourceAlphaBlendFactor;
SWR_BLEND_FACTOR destAlphaBlendFactor;
SWR_BLEND_FACTOR sourceBlendFactor;
SWR_BLEND_FACTOR destBlendFactor;
SWR_BLEND_OP colorBlendFunc;
SWR_BLEND_OP alphaBlendFunc;
SWR_LOGIC_OP logicOpFunc;
};
enum ALPHA_TEST_FORMAT
{
ALPHA_TEST_UNORM8,
ALPHA_TEST_FLOAT32
};
//////////////////////////////////////////////////////////////////////////
/// BLEND_DESC
//////////////////////////////////////////////////////////////////////////
struct BLEND_DESC
{
union
{
struct
{
uint32_t alphaTestEnable: 1;
uint32_t independentAlphaBlendEnable: 1;
uint32_t alphaToCoverageEnable: 1;
uint32_t oMaskEnable:1;
uint32_t inputCoverageEnable:1;
uint32_t sampleMaskEnable:1;
uint32_t numSamples:5;
uint32_t _reserved : 21;
};
uint32_t bits;
};
};
#define BLEND_ENABLE_MASK 0x3D // alphaTestEnable | a2c | oMaskEnable | inputCoverageEnable | sampleMaskEnable
//////////////////////////////////////////////////////////////////////////
/// State required for blend jit
//////////////////////////////////////////////////////////////////////////
struct BLEND_COMPILE_STATE
{
SWR_FORMAT format; // format of render target being blended
RENDER_TARGET_BLEND_COMPILE_STATE blendState;
BLEND_DESC desc;
SWR_ZFUNCTION alphaTestFunction;
ALPHA_TEST_FORMAT alphaTestFormat;
bool operator==(const BLEND_COMPILE_STATE& other) const
{
return memcmp(this, &other, sizeof(BLEND_COMPILE_STATE)) == 0;
}
};
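A note on the operator== above: because it compares raw bytes with memcmp, struct padding participates in the comparison, so a caller using BLEND_COMPILE_STATE as a JIT cache key should zero-initialize it before filling fields. A minimal sketch (not part of this commit; the field values are illustrative):

    BLEND_COMPILE_STATE key = {};                  // zero-init so memcmp sees deterministic padding
    key.format = R32G32B32A32_FLOAT;               // matches the hot tile format used by the blend jit
    key.blendState.blendEnable = true;
    key.blendState.colorBlendFunc = BLENDOP_ADD;
    key.desc.numSamples = 1;
    key.alphaTestFormat = ALPHA_TEST_FLOAT32;
    key.alphaTestFunction = ZFUNC_ALWAYS;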

View File

@ -0,0 +1,71 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder.cpp
*
* @brief Implementation of the builder
*
* Notes:
*
******************************************************************************/
#include "builder.h"
using namespace llvm;
//////////////////////////////////////////////////////////////////////////
/// @brief Constructor for Builder.
/// @param pJitMgr - JitManager which contains modules, function passes, etc.
Builder::Builder(JitManager *pJitMgr)
: mpJitMgr(pJitMgr)
{
mpIRBuilder = &pJitMgr->mBuilder;
mVoidTy = Type::getVoidTy(pJitMgr->mContext);
mFP16Ty = Type::getHalfTy(pJitMgr->mContext);
mFP32Ty = Type::getFloatTy(pJitMgr->mContext);
mDoubleTy = Type::getDoubleTy(pJitMgr->mContext);
mInt1Ty = Type::getInt1Ty(pJitMgr->mContext);
mInt8Ty = Type::getInt8Ty(pJitMgr->mContext);
mInt16Ty = Type::getInt16Ty(pJitMgr->mContext);
mInt32Ty = Type::getInt32Ty(pJitMgr->mContext);
mInt64Ty = Type::getInt64Ty(pJitMgr->mContext);
mV4FP32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mFP32Ty), false); // vector4 float type (represented as structure)
mV4Int32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type
mSimdInt16Ty = VectorType::get(mInt16Ty, mpJitMgr->mVWidth);
mSimdInt32Ty = VectorType::get(mInt32Ty, mpJitMgr->mVWidth);
mSimdInt64Ty = VectorType::get(mInt64Ty, mpJitMgr->mVWidth);
mSimdFP16Ty = VectorType::get(mFP16Ty, mpJitMgr->mVWidth);
mSimdFP32Ty = VectorType::get(mFP32Ty, mpJitMgr->mVWidth);
if (sizeof(uint32_t*) == 4)
{
mIntPtrTy = mInt32Ty;
mSimdIntPtrTy = mSimdInt32Ty;
}
else
{
SWR_ASSERT(sizeof(uint32_t*) == 8);
mIntPtrTy = mInt64Ty;
mSimdIntPtrTy = mSimdInt64Ty;
}
}

View File

@ -0,0 +1,71 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder.h
*
* @brief Includes all the builder related functionality
*
* Notes:
*
******************************************************************************/
#pragma once
#include "JitManager.h"
#include "common/formats.h"
using namespace llvm;
struct Builder
{
Builder(JitManager *pJitMgr);
IRBuilder<>* IRB() { return mpIRBuilder; };
JitManager* JM() { return mpJitMgr; }
JitManager* mpJitMgr;
IRBuilder<>* mpIRBuilder;
// Built in types.
Type* mVoidTy;
Type* mInt1Ty;
Type* mInt8Ty;
Type* mInt16Ty;
Type* mInt32Ty;
Type* mInt64Ty;
Type* mIntPtrTy;
Type* mFP16Ty;
Type* mFP32Ty;
Type* mDoubleTy;
Type* mSimdFP16Ty;
Type* mSimdFP32Ty;
Type* mSimdInt16Ty;
Type* mSimdInt32Ty;
Type* mSimdInt64Ty;
Type* mSimdIntPtrTy;
StructType* mV4FP32Ty;
StructType* mV4Int32Ty;
#include "builder_gen.h"
#include "builder_x86.h"
#include "builder_misc.h"
#include "builder_math.h"
};

View File

@ -0,0 +1,34 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder_math.h
*
* @brief math/alu builder functions
*
* Notes:
*
******************************************************************************/
#pragma once
Value* VLOG2PS(Value* src);
Value* VPOW24PS(Value* src);
Value* VEXP2PS(Value* src);

File diff suppressed because it is too large

View File

@ -0,0 +1,149 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder_misc.h
*
* @brief miscellaneous builder functions
*
* Notes:
*
******************************************************************************/
#pragma once
Constant *C(bool i);
Constant *C(char i);
Constant *C(uint8_t i);
Constant *C(int i);
Constant *C(int64_t i);
Constant *C(uint16_t i);
Constant *C(uint32_t i);
Constant *C(float i);
template<typename Ty>
Constant *C(const std::initializer_list<Ty> &constList)
{
std::vector<Constant*> vConsts;
for(auto i : constList) {
vConsts.push_back(C((Ty)i));
}
return ConstantVector::get(vConsts);
}
Constant *PRED(bool pred);
Value *VIMMED1(int i);
Value *VIMMED1(uint32_t i);
Value *VIMMED1(float i);
Value *VIMMED1(bool i);
Value *VUNDEF(Type* t);
Value *VUNDEF_F();
Value *VUNDEF_I();
Value *VUNDEF(Type* ty, uint32_t size);
Value *VUNDEF_IPTR();
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
Value *VINSERT(Value *vec, Value *val, uint64_t index);
#endif
Value *VBROADCAST(Value *src);
Value *VRCP(Value *va);
Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY);
uint32_t IMMED(Value* i);
Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
CallInst *CALL(Value *Callee, const std::initializer_list<Value*> &args);
LoadInst *LOAD(Value *BasePtr, const std::initializer_list<uint32_t> &offset, const llvm::Twine& name = "");
LoadInst *LOADV(Value *BasePtr, const std::initializer_list<Value*> &offset, const llvm::Twine& name = "");
StoreInst *STORE(Value *Val, Value *BasePtr, const std::initializer_list<uint32_t> &offset);
StoreInst *STOREV(Value *Val, Value *BasePtr, const std::initializer_list<Value*> &offset);
Value *VCMPPS_EQ(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_EQ_OQ)); }
Value *VCMPPS_LT(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_LT_OQ)); }
Value *VCMPPS_LE(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_LE_OQ)); }
Value *VCMPPS_ISNAN(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_UNORD_Q)); }
Value *VCMPPS_NEQ(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_NEQ_OQ)); }
Value *VCMPPS_GE(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_GE_OQ)); }
Value *VCMPPS_GT(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_GT_OQ)); }
Value *VCMPPS_NOTNAN(Value* a, Value* b){ return VCMPPS(a, b, C((uint8_t)_CMP_ORD_Q)); }
Value *MASK(Value* vmask);
Value *VMASK(Value* mask);
//////////////////////////////////////////////////////////////////////////
/// @brief functions that build IR to call x86 intrinsics directly, or
/// emulate them with other instructions if not available on the host
//////////////////////////////////////////////////////////////////////////
Value *MASKLOADD(Value* src, Value* mask);
void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
Value* mask, Value* vGatherComponents[], bool bPackedOutput);
Value *GATHERPS(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale);
void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
Value* mask, Value* vGatherComponents[], bool bPackedOutput);
Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale);
void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
Value* mask, Value* vGatherComponents[], bool bPackedOutput);
void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask);
void Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput);
void Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[], Value* vGatherOutput[], bool bPackedOutput);
Value *PSHUFB(Value* a, Value* b);
Value *PMOVSXBD(Value* a);
Value *PMOVSXWD(Value* a);
Value *PERMD(Value* a, Value* idx);
Value *CVTPH2PS(Value* a);
Value *CVTPS2PH(Value* a, Value* rounding);
Value *PMAXSD(Value* a, Value* b);
Value *PMINSD(Value* a, Value* b);
Value *VABSPS(Value* a);
Value *FMADDPS(Value* a, Value* b, Value* c);
// LLVM removed VPCMPGTD x86 intrinsic. This emulates that behavior
Value *VPCMPGTD(Value* a, Value* b)
{
Value* vIndexMask = ICMP_UGT(a,b);
// need to set the high bit for x86 intrinsic masks
return S_EXT(vIndexMask,VectorType::get(mInt32Ty,JM()->mVWidth));
}
Value *ICLAMP(Value* src, Value* low, Value* high);
Value *FCLAMP(Value* src, Value* low, Value* high);
Value *FCLAMP(Value* src, float low, float high);
CallInst *PRINT(const std::string &printStr);
CallInst *PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs);
Value* STACKSAVE();
void STACKRESTORE(Value* pSaved);
Value* POPCNT(Value* a);
Value* INT3() { return INTERRUPT(C((uint8_t)3)); }
Value *VEXTRACTI128(Value* a, Constant* imm8);
Value *VINSERTI128(Value* a, Value* b, Constant* imm8);

File diff suppressed because it is too large

View File

@ -0,0 +1,128 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file fetch_jit.h
*
* @brief Definition of the fetch jitter
*
* Notes:
*
******************************************************************************/
#pragma once
#include "common/formats.h"
#include "core/state.h"
//////////////////////////////////////////////////////////////////////////
/// INPUT_ELEMENT_DESC
//////////////////////////////////////////////////////////////////////////
struct INPUT_ELEMENT_DESC
{
union
{
struct
{
uint32_t AlignedByteOffset : 12;
uint32_t Format : 10;
uint32_t StreamIndex : 6;
uint32_t InstanceEnable : 1;
uint32_t ComponentControl0 : 3;
uint32_t ComponentControl1 : 3;
uint32_t ComponentControl2 : 3;
uint32_t ComponentControl3 : 3;
uint32_t ComponentPacking : 4;
uint32_t _reserved : 19;
};
uint64_t bits;
};
uint32_t InstanceDataStepRate;
};
// used to set ComponentPacking
enum ComponentEnable
{
NONE = 0x0,
X = 0x1,
Y = 0x2,
XY = 0x3,
Z = 0x4,
XZ = 0x5,
YZ = 0x6,
XYZ = 0x7,
W = 0x8,
XW = 0x9,
YW = 0xA,
XYW = 0xB,
ZW = 0xC,
XZW = 0xD,
YZW = 0xE,
XYZW = 0xF,
};
enum ComponentControl
{
NoStore = 0,
StoreSrc = 1,
Store0 = 2,
Store1Fp = 3,
Store1Int = 4,
};
//////////////////////////////////////////////////////////////////////////
/// State required for fetch shader jit compile.
//////////////////////////////////////////////////////////////////////////
struct FETCH_COMPILE_STATE
{
uint32_t numAttribs;
INPUT_ELEMENT_DESC layout[KNOB_NUM_ATTRIBUTES];
SWR_FORMAT indexType;
uint32_t cutIndex{ 0xffffffff };
// Options that affect the JIT'd code
bool bDisableVGATHER; // if enabled, FetchJit will generate loads/shuffles instead of VGATHERs
bool bDisableIndexOOBCheck; // if enabled, FetchJit will exclude index OOB check
bool bEnableCutIndex{ false }; // compares indices with the cut index and returns a cut mask
FETCH_COMPILE_STATE(bool disableVGATHER = false, bool disableIndexOOBCheck = false) :
bDisableVGATHER(disableVGATHER), bDisableIndexOOBCheck(disableIndexOOBCheck){};
bool operator==(const FETCH_COMPILE_STATE &other) const
{
if (numAttribs != other.numAttribs) return false;
if (indexType != other.indexType) return false;
if (bDisableVGATHER != other.bDisableVGATHER) return false;
if (bDisableIndexOOBCheck != other.bDisableIndexOOBCheck) return false;
if (bEnableCutIndex != other.bEnableCutIndex) return false;
if (cutIndex != other.cutIndex) return false;
for(uint32_t i = 0; i < numAttribs; ++i)
{
if((layout[i].bits != other.layout[i].bits) ||
((layout[i].InstanceEnable == 1) &&
(layout[i].InstanceDataStepRate != other.layout[i].InstanceDataStepRate))){
return false;
}
}
return true;
}
};
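To illustrate how the pieces above combine, here is a sketch (not part of this commit) of populating a single attribute slot; the index and attribute formats are illustrative placeholders:

    FETCH_COMPILE_STATE fetchState;
    fetchState.numAttribs = 1;
    fetchState.indexType = R32_UINT;             // illustrative SWR_FORMAT for 32-bit indices

    INPUT_ELEMENT_DESC& ied = fetchState.layout[0];
    ied.bits = 0;                                // clear all packed fields first
    ied.AlignedByteOffset = 0;
    ied.Format = R32G32B32_FLOAT;                // illustrative vertex attribute format
    ied.StreamIndex = 0;
    ied.ComponentControl0 = StoreSrc;            // x, y, z come from the vertex buffer...
    ied.ComponentControl1 = StoreSrc;
    ied.ComponentControl2 = StoreSrc;
    ied.ComponentControl3 = Store1Fp;            // ...w is synthesized as 1.0f
    ied.ComponentPacking = XYZ;                  // only three components are fetched
    ied.InstanceDataStepRate = 0;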

View File

@ -0,0 +1,108 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file jit_api.h
*
* @brief Platform independent JIT interface
*
* Notes:
*
******************************************************************************/
#pragma once
#include "common/os.h"
#include "fetch_jit.h"
#include "streamout_jit.h"
#include "blend_jit.h"
#if defined(_WIN32)
#define EXCEPTION_PRINT_STACK(ret) ret
#endif // _WIN32
#if defined(_WIN32)
#define JITCALL __stdcall
#else
#define JITCALL
#endif
extern "C"
{
struct ShaderInfo;
//////////////////////////////////////////////////////////////////////////
/// Jit Compile Info Input
//////////////////////////////////////////////////////////////////////////
struct JIT_COMPILE_INPUT
{
SWR_SHADER_TYPE type;
const void* pIR; ///< Pointer to LLVM IR text.
bool enableJitSampler;
};
//////////////////////////////////////////////////////////////////////////
/// @brief Create JIT context.
HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char* arch);
//////////////////////////////////////////////////////////////////////////
/// @brief Destroy JIT context.
void JITCALL JitDestroyContext(HANDLE hJitContext);
//////////////////////////////////////////////////////////////////////////
/// @brief JIT compile shader.
/// @param hJitContext - Jit Context
/// @param input - Input containing LLVM IR and other information
/// @param output - Output containing information about JIT shader
ShaderInfo* JITCALL JitCompileShader(
HANDLE hJitContext,
const JIT_COMPILE_INPUT& input);
//////////////////////////////////////////////////////////////////////////
/// @brief JIT destroy shader.
/// @param hJitContext - Jit Context
/// @param pShaderInfo - pointer to shader object.
void JITCALL JitDestroyShader(
HANDLE hJitContext,
ShaderInfo*& pShaderInfo);
//////////////////////////////////////////////////////////////////////////
/// @brief JIT compiles fetch shader
/// @param hJitContext - Jit Context
/// @param state - Fetch state to build function from
PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitContext, const FETCH_COMPILE_STATE& state);
//////////////////////////////////////////////////////////////////////////
/// @brief JIT compiles streamout shader
/// @param hJitContext - Jit Context
/// @param state - SO state to build function from
PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitContext, const STREAMOUT_COMPILE_STATE& state);
//////////////////////////////////////////////////////////////////////////
/// @brief JIT compiles blend shader
/// @param hJitContext - Jit Context
/// @param state - blend state to build function from
PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitContext, const BLEND_COMPILE_STATE& state);
}; // extern "C"
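For orientation, the intended call sequence through this interface looks roughly like the sketch below (not part of this commit; the SIMD width and architecture string are illustrative):

    HANDLE hJitContext = JitCreateContext(8, "core-avx2");    // illustrative width/arch values

    BLEND_COMPILE_STATE blendState = {};
    // ... fill blendState from the pipeline's current blend/alpha-test state ...
    PFN_BLEND_JIT_FUNC pfnBlend = JitCompileBlend(hJitContext, blendState);
    // ... the backend invokes pfnBlend per simd-tile of pixels ...

    JitDestroyContext(hJitContext);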

View File

@ -0,0 +1,401 @@
# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#!deps/python32/python.exe
import os, sys, re
import argparse
import json as JSON
import operator
header = r"""/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file %s
*
* @brief auto-generated file
*
* DO NOT EDIT
*
******************************************************************************/
"""
"""
"""
def gen_file_header(filename):
global header
headerStr = header % filename
return headerStr.splitlines()
inst_aliases = {
'SHUFFLE_VECTOR': 'VSHUFFLE',
'INSERT_ELEMENT': 'VINSERT',
'EXTRACT_ELEMENT': 'VEXTRACT',
'MEM_SET': 'MEMSET',
'MEM_CPY': 'MEMCPY',
'MEM_MOVE': 'MEMMOVE',
'L_SHR': 'LSHR',
'A_SHR': 'ASHR',
'BIT_CAST': 'BITCAST',
'U_DIV': 'UDIV',
'S_DIV': 'SDIV',
'U_REM': 'UREM',
'S_REM': 'SREM',
'BIN_OP': 'BINOP',
}
intrinsics = [
["VGATHERPS", "x86_avx2_gather_d_ps_256", ["src", "pBase", "indices", "mask", "scale"]],
["VGATHERDD", "x86_avx2_gather_d_d_256", ["src", "pBase", "indices", "mask", "scale"]],
["VSQRTPS", "x86_avx_sqrt_ps_256", ["a"]],
["VRSQRTPS", "x86_avx_rsqrt_ps_256", ["a"]],
["VRCPPS", "x86_avx_rcp_ps_256", ["a"]],
["VMINPS", "x86_avx_min_ps_256", ["a", "b"]],
["VMAXPS", "x86_avx_max_ps_256", ["a", "b"]],
["VPMINSD", "x86_avx2_pmins_d", ["a", "b"]],
["VPMAXSD", "x86_avx2_pmaxs_d", ["a", "b"]],
["VROUND", "x86_avx_round_ps_256", ["a", "rounding"]],
["VCMPPS", "x86_avx_cmp_ps_256", ["a", "b", "cmpop"]],
["VBLENDVPS", "x86_avx_blendv_ps_256", ["a", "b", "mask"]],
["BEXTR_32", "x86_bmi_bextr_32", ["src", "control"]],
["VMASKLOADD", "x86_avx2_maskload_d_256", ["src", "mask"]],
["VMASKMOVPS", "x86_avx_maskload_ps_256", ["src", "mask"]],
["VPSHUFB", "x86_avx2_pshuf_b", ["a", "b"]],
["VPMOVSXBD", "x86_avx2_pmovsxbd", ["a"]], # sign extend packed 8bit components
["VPMOVSXWD", "x86_avx2_pmovsxwd", ["a"]], # sign extend packed 16bit components
["VPERMD", "x86_avx2_permd", ["idx", "a"]],
["VCVTPH2PS", "x86_vcvtph2ps_256", ["a"]],
["VCVTPS2PH", "x86_vcvtps2ph_256", ["a", "round"]],
["VHSUBPS", "x86_avx_hsub_ps_256", ["a", "b"]],
["VPTESTC", "x86_avx_ptestc_256", ["a", "b"]],
["VPTESTZ", "x86_avx_ptestz_256", ["a", "b"]],
["VFMADDPS", "x86_fma_vfmadd_ps_256", ["a", "b", "c"]],
["VCVTTPS2DQ", "x86_avx_cvtt_ps2dq_256", ["a"]],
["VMOVMSKPS", "x86_avx_movmsk_ps_256", ["a"]],
["INTERRUPT", "x86_int", ["a"]],
]
def convert_uppercamel(name):
s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).upper()
"""
Given an input file (e.g. IRBuilder.h) generates function dictionary.
"""
def parse_ir_builder(input_file):
functions = []
lines = input_file.readlines()
idx = 0
while idx < len(lines) - 1:
line = lines[idx].rstrip()
idx += 1
#match = re.search(r"\*Create", line)
match = re.search(r"[\*\s]Create(\w*)\(", line)
if match is not None:
#print("Line: %s" % match.group(1))
if re.search(r"^\s*Create", line) is not None:
func_sig = lines[idx-2].rstrip() + line
else:
func_sig = line
end_of_args = False
while not end_of_args:
end_paren = re.search(r"\)", line)
if end_paren is not None:
end_of_args = True
else:
line = lines[idx].rstrip()
func_sig += line
idx += 1
delfunc = re.search(r"LLVM_DELETED_FUNCTION|= delete;", func_sig)
if not delfunc:
func = re.search(r"(.*?)\*[\n\s]*(Create\w*)\((.*?)\)", func_sig)
if func is not None:
return_type = func.group(1).lstrip() + '*'
func_name = func.group(2)
arguments = func.group(3)
func_args = ''
func_args_nodefs = ''
arg_names = []
num_args = 0
args = arguments.split(',')
for arg in args:
arg = arg.lstrip()
if arg:
if num_args > 0:
func_args += ', '
func_args_nodefs += ', '
func_args += arg
func_args_nodefs += arg.split(' =')[0]
split_args = arg.split('=')
arg_name = split_args[0].rsplit(None, 1)[-1]
#print("Before ArgName = %s" % arg_name)
reg_arg = re.search(r"[\&\*]*(\w*)", arg_name)
if reg_arg:
#print("Arg Name = %s" % reg_arg.group(1))
arg_names += [reg_arg.group(1)]
num_args += 1
ignore = False
# The following functions need to be ignored.
if func_name == 'CreateInsertNUWNSWBinOp':
ignore = True
if func_name == 'CreateMaskedIntrinsic':
ignore = True
# Convert CamelCase to CAMEL_CASE
func_mod = re.search(r"Create(\w*)", func_name)
if func_mod:
func_mod = func_mod.group(1)
func_mod = convert_uppercamel(func_mod)
if func_mod[0:2] == 'F_' or func_mod[0:2] == 'I_':
func_mod = func_mod[0] + func_mod[2:]
# Substitute alias based on CAMEL_CASE name.
func_alias = inst_aliases.get(func_mod)
if not func_alias:
func_alias = func_mod
if func_name == 'CreateCall' or func_name == 'CreateGEP':
arglist = re.search(r'ArrayRef', func_args)
if arglist:
func_alias = func_alias + 'A'
if not ignore:
functions.append({
"name": func_name,
"alias": func_alias,
"return": return_type,
"args": func_args,
"args_nodefs": func_args_nodefs,
"arg_names": arg_names
})
return functions
"""
Auto-generates builder_gen.h: declarations wrapping IRBuilder Create* methods
"""
def generate_gen_h(functions, output_file):
output_lines = gen_file_header(os.path.basename(output_file.name))
output_lines += [
'#pragma once',
'',
'//////////////////////////////////////////////////////////////////////////',
'/// Auto-generated Builder IR declarations',
'//////////////////////////////////////////////////////////////////////////',
]
for func in functions:
name = func['name']
if func['alias']:
name = func['alias']
output_lines += [
'%s%s(%s);' % (func['return'], name, func['args'])
]
output_file.write('\n'.join(output_lines) + '\n')
"""
Auto-generates builder_gen.cpp: definitions forwarding to IRBuilder Create* methods
"""
def generate_gen_cpp(functions, output_file):
output_lines = gen_file_header(os.path.basename(output_file.name))
output_lines += [
'#include \"builder.h\"',
''
]
for func in functions:
name = func['name']
if func['alias']:
name = func['alias']
args = func['arg_names']
func_args = ''
first_arg = True
for arg in args:
if not first_arg:
func_args += ', '
func_args += arg
first_arg = False
output_lines += [
'//////////////////////////////////////////////////////////////////////////',
'%sBuilder::%s(%s)' % (func['return'], name, func['args_nodefs']),
'{',
' return IRB()->%s(%s);' % (func['name'], func_args),
'}',
'',
]
output_file.write('\n'.join(output_lines) + '\n')
"""
Auto-generates builder_x86.h: declarations for the x86 intrinsic wrappers
"""
def generate_x86_h(output_file):
output_lines = gen_file_header(os.path.basename(output_file.name))
output_lines += [
'#pragma once',
'',
'//////////////////////////////////////////////////////////////////////////',
'/// Auto-generated x86 intrinsics',
'//////////////////////////////////////////////////////////////////////////',
]
for inst in intrinsics:
#print("Inst: %s, x86: %s numArgs: %d" % (inst[0], inst[1], len(inst[2])))
args = ''
first = True
for arg in inst[2]:
if not first:
args += ', '
args += ("Value* %s" % arg)
first = False
output_lines += [
'Value *%s(%s);' % (inst[0], args)
]
output_file.write('\n'.join(output_lines) + '\n')
"""
Auto-generates builder_x86.cpp: definitions for the x86 intrinsic wrappers
"""
def generate_x86_cpp(output_file):
output_lines = gen_file_header(os.path.basename(output_file.name))
output_lines += [
'#include \"builder.h\"',
''
]
for inst in intrinsics:
#print("Inst: %s, x86: %s numArgs: %d" % (inst[0], inst[1], len(inst[2])))
args = ''
pass_args = ''
first = True
for arg in inst[2]:
if not first:
args += ', '
pass_args += ', '
args += ("Value* %s" % arg)
pass_args += arg
first = False
output_lines += [
'//////////////////////////////////////////////////////////////////////////',
'Value *Builder::%s(%s)' % (inst[0], args),
'{',
' Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::%s);' % inst[1],
' return CALL(func, std::initializer_list<Value*>{%s});' % pass_args,
'}',
'',
]
output_file.write('\n'.join(output_lines) + '\n')
"""
Entry point when this script is run from the command line. Parses the
arguments that control which files are generated.
"""
def main():
# Parse args...
parser = argparse.ArgumentParser()
parser.add_argument("--input", "-i", type=argparse.FileType('r'), help="Path to IRBuilder.h", required=False)
parser.add_argument("--output", "-o", type=argparse.FileType('w'), help="Path to output file", required=True)
parser.add_argument("--gen_h", "-gen_h", help="Generate builder_gen.h", action="store_true", default=False)
parser.add_argument("--gen_cpp", "-gen_cpp", help="Generate builder_gen.cpp", action="store_true", default=False)
parser.add_argument("--gen_x86_h", "-gen_x86_h", help="Generate x86 intrinsics. No input is needed.", action="store_true", default=False)
parser.add_argument("--gen_x86_cpp", "-gen_x86_cpp", help="Generate x86 intrinsics. No input is needed.", action="store_true", default=False)
args = parser.parse_args()
if args.input:
functions = parse_ir_builder(args.input)
if args.gen_h:
generate_gen_h(functions, args.output)
if args.gen_cpp:
generate_gen_cpp(functions, args.output)
else:
if args.gen_x86_h:
generate_x86_h(args.output)
if args.gen_x86_cpp:
generate_x86_cpp(args.output)
if args.gen_h:
print("Need to specify --input for --gen_h!")
if args.gen_cpp:
print("Need to specify --input for --gen_cpp!")
if __name__ == '__main__':
main()
# END OF FILE
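To make the generator's output concrete: for the VMINPS row of the intrinsics table, generate_x86_cpp assembles a wrapper of the following shape (reconstructed here from the template strings above, not copied from a generated file):

    //////////////////////////////////////////////////////////////////////////
    Value *Builder::VMINPS(Value* a, Value* b)
    {
        Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_min_ps_256);
        return CALL(func, std::initializer_list<Value*>{a, b});
    }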

View File

@ -0,0 +1,341 @@
# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#!deps/python32/python.exe
import os, sys, re
import argparse
import json as JSON
import operator
header = r"""
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file %s
*
* @brief auto-generated file
*
* DO NOT EDIT
*
******************************************************************************/
#pragma once
"""
"""
"""
def gen_file_header(filename):
global header
headerStr = header % filename
return headerStr.splitlines()
"""
"""
def gen_llvm_type(type, name, postfix_name, is_pointer, is_pointer_pointer, is_array, is_array_array, array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file):
llvm_type = ''
if is_llvm_struct:
if is_pointer or is_pointer_pointer:
llvm_type = 'Type::getInt32Ty(ctx)'
else:
llvm_type = 'ArrayType::get(Type::getInt8Ty(ctx), sizeof(%s))' % type
elif is_llvm_enum:
llvm_type = 'Type::getInt32Ty(ctx)'
elif is_llvm_pfn:
llvm_type = 'PointerType::get(Type::getInt8Ty(ctx), 0)'
else:
if type == "BYTE" or type == "char" or type == "uint8_t" or type == "int8_t" or type == 'bool':
llvm_type = 'Type::getInt8Ty(ctx)'
elif type == 'UINT64' or type == 'INT64' or type == 'uint64_t' or type == 'int64_t':
llvm_type = 'Type::getInt64Ty(ctx)'
elif type == 'UINT16' or type == 'int16_t' or type == 'uint16_t':
llvm_type = 'Type::getInt16Ty(ctx)'
elif type == 'UINT' or type == 'INT' or type == 'int' or type == 'BOOL' or type == 'uint32_t' or type == 'int32_t':
llvm_type = 'Type::getInt32Ty(ctx)'
elif type == 'float' or type == 'FLOAT':
llvm_type = 'Type::getFloatTy(ctx)'
elif type == 'double' or type == 'DOUBLE':
llvm_type = 'Type::getDoubleTy(ctx)'
elif type == 'void' or type == 'VOID':
llvm_type = 'Type::getInt32Ty(ctx)'
elif type == 'HANDLE':
llvm_type = 'PointerType::get(Type::getInt32Ty(ctx), 0)'
elif type == 'simdscalar':
llvm_type = 'VectorType::get(Type::getFloatTy(ctx), pJitMgr->mVWidth)'
elif type == 'simdscalari':
llvm_type = 'VectorType::get(Type::getInt32Ty(ctx), pJitMgr->mVWidth)'
elif type == 'simdvector':
llvm_type = 'ArrayType::get(VectorType::get(Type::getFloatTy(ctx), pJitMgr->mVWidth), 4)'
else:
llvm_type = 'Gen_%s%s(pJitMgr)' % (type, postfix_name)
if is_pointer:
llvm_type = 'PointerType::get(%s, 0)' % llvm_type
if is_pointer_pointer:
llvm_type = 'PointerType::get(%s, 0)' % llvm_type
if is_array_array:
llvm_type = 'ArrayType::get(ArrayType::get(%s, %s), %s)' % (llvm_type, array_count1, array_count)
elif is_array:
llvm_type = 'ArrayType::get(%s, %s)' % (llvm_type, array_count)
return [' members.push_back( %s ); // %s' % (llvm_type, name)]
"""
"""
def gen_llvm_types(input_file, output_file):
output_lines = gen_file_header(os.path.basename(output_file.name))
lines = input_file.readlines()
postfix_name = ""
for idx in range(len(lines)):
line = lines[idx].rstrip()
match = re.match(r"(\s*)struct(\s*)(\w+)", line)
if match:
llvm_args = []
# Detect start of structure
is_fwd_decl = re.search(r";", line)
if not is_fwd_decl:
# Extract the structure name
struct_name = match.group(3).strip()
output_lines += [
'//////////////////////////////////////////////////////////////////////////',
'/// Generate LLVM type information for %s' % struct_name,
'INLINE static StructType *Gen_%s%s(JitManager* pJitMgr)' % (struct_name, postfix_name),
'{',
' LLVMContext& ctx = pJitMgr->mContext;',
' std::vector<Type*> members;',
'',
]
end_of_struct = False
while not end_of_struct and idx < len(lines)-1:
idx += 1
line = lines[idx].rstrip()
is_llvm_typedef = re.search(r"@llvm_typedef", line)
if is_llvm_typedef is not None:
is_llvm_typedef = True
else:
is_llvm_typedef = False
###########################################
# Is the field an llvm struct? Tells the script to treat the type as an array of bytes the size of the structure.
is_llvm_struct = re.search(r"@llvm_struct", line)
if is_llvm_struct is not None:
is_llvm_struct = True
else:
is_llvm_struct = False
###########################################
# Is the field an llvm enum? Tells the script to treat the type as an enum, replaced with a uint32.
is_llvm_enum = re.search(r"@llvm_enum", line)
if is_llvm_enum is not None:
is_llvm_enum = True
else:
is_llvm_enum = False
###########################################
# Is the field an llvm function pointer? Tells the script to replace the type with an i8 pointer.
is_llvm_pfn = re.search(r"@llvm_pfn", line)
if is_llvm_pfn is not None:
is_llvm_pfn = True
else:
is_llvm_pfn = False
###########################################
# Is field const?
is_const = re.search(r"\s+const\s+", line)
if is_const is not None:
is_const = True
else:
is_const = False
###########################################
# Is the field a pointer to a pointer?
is_pointer_pointer = re.search(r"\*\*", line)
if is_pointer_pointer is not None:
is_pointer_pointer = True
else:
is_pointer_pointer = False
###########################################
# Is field a pointer?
is_pointer = re.search(r"\*", line)
if is_pointer is not None:
is_pointer = True
else:
is_pointer = False
###########################################
# Is field an array of arrays?
# TODO: Can add this to a list.
is_array_array = re.search(r"\[(\w*)\]\[(\w*)\]", line)
array_count = '0'
array_count1 = '0'
if is_array_array is not None:
array_count = is_array_array.group(1)
array_count1 = is_array_array.group(2)
is_array_array = True
else:
is_array_array = False
###########################################
# Is field an array?
is_array = re.search(r"\[(\w*)\]", line)
if is_array is not None:
array_count = is_array.group(1)
is_array = True
else:
is_array = False
is_scoped = re.search("::", line)
if is_scoped is not None:
is_scoped = True
else:
is_scoped = False
type = None
name = None
if is_const and is_pointer:
if is_scoped:
field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+::)(\w+)(\s*\**\s*)(\w+)", line)
type = "%s%s" % (field_match.group(4), field_match.group(5))
name = field_match.group(7)
else:
field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*\**\s*)(\w+)", line)
type = field_match.group(4)
name = field_match.group(6)
elif is_pointer:
field_match = re.match(r"(\s*)(\s+)(\w+\<*\w*\>*)(\s*\**\s*)(\w+)", line)
if field_match:
type = field_match.group(3)
name = field_match.group(5)
elif is_const:
field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*)(\w+)", line)
if field_match:
type = field_match.group(4)
name = field_match.group(6)
else:
if is_scoped:
field_match = re.match(r"\s*(\w+\<*\w*\>*)\s*::\s*(\w+\<*\w*\>*)\s+(\w+)", line)
if field_match:
type = field_match.group(1) + '::' + field_match.group(2)
name = field_match.group(3)
else:
field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+)", line)
if field_match:
type = field_match.group(2)
name = field_match.group(4)
if is_llvm_typedef is False:
if type is not None:
output_lines += gen_llvm_type(type, name, postfix_name, is_pointer, is_pointer_pointer, is_array, is_array_array, array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file)
llvm_args.append(name)
# Detect end of structure
end_of_struct = re.match(r"(\s*)};", line)
if (end_of_struct):
output_lines += [
'',
' return StructType::get(ctx, members, false);',
'}',
'',
]
for i in range(len(llvm_args)):
output_lines.append('static const uint32_t %s%s_%s = %s;' % (struct_name, postfix_name, llvm_args[i], i))
output_lines.append('')
output_file.write('\n'.join(output_lines) + '\n')
"""
Entry point when this script is run from the command line. Parses the
arguments that control which files are generated.
"""
def main():
# Parse args...
parser = argparse.ArgumentParser()
parser.add_argument("--input", "-i", type=argparse.FileType('r'),
help="Path to input file containing structs", required=True)
parser.add_argument("--output", "-o", type=argparse.FileType('w'),
help="Path to output file", required=True)
parser.add_argument("--scalar", "-scalar", help="Generates scalar files with all enums", action="store_true", default=False)
args = parser.parse_args()
gen_llvm_types(args.input, args.output)
if __name__ == '__main__':
main()
# END OF FILE
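To show what this script emits: for a hypothetical two-field struct such as struct EXAMPLE { uint32_t count; float* pData; };, the templates above would produce roughly the following (the struct and its fields are invented purely for illustration):

    //////////////////////////////////////////////////////////////////////////
    /// Generate LLVM type information for EXAMPLE
    INLINE static StructType *Gen_EXAMPLE(JitManager* pJitMgr)
    {
        LLVMContext& ctx = pJitMgr->mContext;
        std::vector<Type*> members;

        members.push_back( Type::getInt32Ty(ctx) ); // count
        members.push_back( PointerType::get(Type::getFloatTy(ctx), 0) ); // pData

        return StructType::get(ctx, members, false);
    }
    static const uint32_t EXAMPLE_count = 0;
    static const uint32_t EXAMPLE_pData = 1;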

View File

@ -0,0 +1,357 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file streamout_jit.cpp
*
* @brief Implementation of the streamout jitter
*
* Notes:
*
******************************************************************************/
#include "jit_api.h"
#include "streamout_jit.h"
#include "builder.h"
#include "state_llvm.h"
#include "common/containers.hpp"
#include "llvm/IR/DataLayout.h"
#include <sstream>
#include <unordered_set>
//////////////////////////////////////////////////////////////////////////
/// Interface to jitting a streamout shader
//////////////////////////////////////////////////////////////////////////
struct StreamOutJit : public Builder
{
StreamOutJit(JitManager* pJitMgr) : Builder(pJitMgr) {}
// returns pointer to SWR_STREAMOUT_BUFFER
Value* getSOBuffer(Value* pSoCtx, uint32_t buffer)
{
return LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer });
}
//////////////////////////////////////////////////////////////////////////
// @brief checks if streamout buffer is oob
// @return <i1> true/false
Value* oob(const STREAMOUT_COMPILE_STATE& state, Value* pSoCtx, uint32_t buffer)
{
Value* returnMask = C(false);
Value* pBuf = getSOBuffer(pSoCtx, buffer);
// load enable
// @todo bool data types should generate <i1> llvm type
Value* enabled = TRUNC(LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_enable }), IRB()->getInt1Ty());
// load buffer size
Value* bufferSize = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_bufferSize });
// load current streamOffset
Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
// load buffer pitch
Value* pitch = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch });
// buffer is considered oob if in use in a decl but not enabled
returnMask = OR(returnMask, NOT(enabled));
// buffer is oob if cannot fit a prims worth of verts
Value* newOffset = ADD(streamOffset, MUL(pitch, C(state.numVertsPerPrim)));
returnMask = OR(returnMask, ICMP_SGT(newOffset, bufferSize));
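// (Illustrative: a triangle stream with numVertsPerPrim == 3, pitch p and
// current offset o stays in bounds only if o + 3*p <= bufferSize.)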
return returnMask;
}
//////////////////////////////////////////////////////////////////////////
// @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector,
// packing the active mask bits
// ex. bitmask 0011 -> (0, 1, 0, 0)
// bitmask 1000 -> (3, 0, 0, 0)
// bitmask 1100 -> (2, 3, 0, 0)
Value* PackMask(uint32_t bitmask)
{
std::vector<Constant*> indices(4, C(0));
DWORD index;
uint32_t elem = 0;
while (_BitScanForward(&index, bitmask))
{
indices[elem++] = C((int)index);
bitmask &= ~(1 << index);
}
return ConstantVector::get(indices);
}
//////////////////////////////////////////////////////////////////////////
// @brief convert scalar bitmask to <4xfloat> bitmask
Value* ToMask(uint32_t bitmask)
{
std::vector<Constant*> indices;
for (uint32_t i = 0; i < 4; ++i)
{
if (bitmask & (1 << i))
{
indices.push_back(C(-1.0f));
}
else
{
indices.push_back(C(0.0f));
}
}
return ConstantVector::get(indices);
}
//////////////////////////////////////////////////////////////////////////
// @brief processes a single decl from the streamout stream. Reads 4 components from the input
// stream and writes N components to the output buffer given the componentMask or if
// a hole, just increments the buffer pointer
// @param pStream - pointer to current attribute
// @param pOutBuffers - pointers to the current location of each output buffer
// @param decl - input decl
void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl)
{
// @todo add this to x86 macros
Function* maskStore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskstore_ps);
uint32_t numComponents = _mm_popcnt_u32(decl.componentMask);
uint32_t packedMask = (1 << numComponents) - 1;
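// Illustrative: componentMask 0b1011 gives numComponents == 3 and
// packedMask == 0b111, so three packed lanes are stored and one is masked off.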
if (!decl.hole)
{
// increment stream pointer to correct slot
Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot));
// load 4 components from stream
Type* simd4Ty = VectorType::get(IRB()->getFloatTy(), 4);
Type* simd4PtrTy = PointerType::get(simd4Ty, 0);
pAttrib = BITCAST(pAttrib, simd4PtrTy);
Value *vattrib = LOAD(pAttrib);
// shuffle/pack enabled components
Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask));
// store to output buffer
// cast SO buffer to i8*, needed by maskstore
Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(mInt8Ty, 0));
// cast input to <4xfloat>
Value* src = BITCAST(vpackedAttrib, simd4Ty);
CALL(maskStore, {pOut, ToMask(packedMask), src});
}
// increment SO buffer
pOutBuffers[decl.bufferIndex] = GEP(pOutBuffers[decl.bufferIndex], C(numComponents));
}
//////////////////////////////////////////////////////////////////////////
// @brief builds a single vertex worth of data for the given stream
// @param streamState - state for this stream
// @param pCurVertex - pointer to src stream vertex data
// @param pOutBuffer - pointers to up to 4 SO buffers
void buildVertex(const STREAMOUT_STREAM& streamState, Value* pCurVertex, Value* pOutBuffer[4])
{
for (uint32_t d = 0; d < streamState.numDecls; ++d)
{
const STREAMOUT_DECL& decl = streamState.decl[d];
buildDecl(pCurVertex, pOutBuffer, decl);
}
}
void buildStream(const STREAMOUT_COMPILE_STATE& state, const STREAMOUT_STREAM& streamState, Value* pSoCtx, BasicBlock* returnBB, Function* soFunc)
{
// get list of active SO buffers
std::unordered_set<uint32_t> activeSOBuffers;
for (uint32_t d = 0; d < streamState.numDecls; ++d)
{
const STREAMOUT_DECL& decl = streamState.decl[d];
activeSOBuffers.insert(decl.bufferIndex);
}
// always increment numPrimStorageNeeded
Value *numPrimStorageNeeded = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded });
numPrimStorageNeeded = ADD(numPrimStorageNeeded, C(1));
STORE(numPrimStorageNeeded, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded });
// check OOB on active SO buffers. If any buffer is out of bound, don't write
// the primitive to any buffer
Value* oobMask = C(false);
for (uint32_t buffer : activeSOBuffers)
{
oobMask = OR(oobMask, oob(state, pSoCtx, buffer));
}
BasicBlock* validBB = BasicBlock::Create(JM()->mContext, "valid", soFunc);
// early out if OOB
COND_BR(oobMask, returnBB, validBB);
IRB()->SetInsertPoint(validBB);
Value* numPrimsWritten = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten });
numPrimsWritten = ADD(numPrimsWritten, C(1));
STORE(numPrimsWritten, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten });
// compute start pointer for each output buffer
Value* pOutBuffer[4];
Value* pOutBufferStartVertex[4];
Value* outBufferPitch[4];
for (uint32_t b: activeSOBuffers)
{
Value* pBuf = getSOBuffer(pSoCtx, b);
Value* pData = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pBuffer });
Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
pOutBuffer[b] = GEP(pData, streamOffset);
pOutBufferStartVertex[b] = pOutBuffer[b];
outBufferPitch[b] = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch });
}
// loop over the vertices of the prim
Value* pStreamData = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pPrimData });
for (uint32_t v = 0; v < state.numVertsPerPrim; ++v)
{
buildVertex(streamState, pStreamData, pOutBuffer);
// increment stream and output buffer pointers
// stream verts are always 32*4 dwords apart
pStreamData = GEP(pStreamData, C(KNOB_NUM_ATTRIBUTES * 4));
// output buffers offset using pitch in buffer state
for (uint32_t b : activeSOBuffers)
{
pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]);
pOutBuffer[b] = pOutBufferStartVertex[b];
}
}
// update each active buffer's streamOffset
for (uint32_t b : activeSOBuffers)
{
Value* pBuf = getSOBuffer(pSoCtx, b);
Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b]));
STORE(streamOffset, pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
}
}
Function* Create(const STREAMOUT_COMPILE_STATE& state)
{
static std::size_t soNum = 0;
std::stringstream fnName("SOShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
fnName << soNum++;
// SO function signature
// typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT*)
std::vector<Type*> args{
PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
};
FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
Function* soFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
// create return basic block
BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", soFunc);
BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc);
IRB()->SetInsertPoint(entry);
// arguments
auto argitr = soFunc->getArgumentList().begin();
Value* pSoCtx = &*argitr++;
pSoCtx->setName("pSoCtx");
const STREAMOUT_STREAM& streamState = state.stream;
buildStream(state, streamState, pSoCtx, returnBB, soFunc);
BR(returnBB);
IRB()->SetInsertPoint(returnBB);
RET_VOID();
JitManager::DumpToFile(soFunc, "SoFunc");
FunctionPassManager passes(JM()->mpCurrentModule);
passes.add(createBreakCriticalEdgesPass());
passes.add(createCFGSimplificationPass());
passes.add(createEarlyCSEPass());
passes.add(createPromoteMemoryToRegisterPass());
passes.add(createCFGSimplificationPass());
passes.add(createEarlyCSEPass());
passes.add(createInstructionCombiningPass());
passes.add(createInstructionSimplifierPass());
passes.add(createConstantPropagationPass());
passes.add(createSCCPPass());
passes.add(createAggressiveDCEPass());
passes.run(*soFunc);
JitManager::DumpToFile(soFunc, "SoFunc_optimized");
return soFunc;
}
};
//////////////////////////////////////////////////////////////////////////
/// @brief JITs from streamout shader IR
/// @param hJitMgr - JitManager handle
/// @param func - LLVM function IR
/// @return PFN_SO_FUNC - pointer to compiled streamout function
PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc)
{
const llvm::Function *func = (const llvm::Function*)hFunc;
JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
PFN_SO_FUNC pfnStreamOut;
pfnStreamOut = (PFN_SO_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
// MCJIT finalizes modules the first time code is JITted from them; once finalized, no new IR can be added to the module
pJitMgr->mIsModuleFinalized = true;
return pfnStreamOut;
}
//////////////////////////////////////////////////////////////////////////
/// @brief JIT compiles streamout shader
/// @param hJitMgr - JitManager handle
/// @param state - SO state to build function from
extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitMgr, const STREAMOUT_COMPILE_STATE& state)
{
JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
STREAMOUT_COMPILE_STATE soState = state;
if (soState.offsetAttribs)
{
for (uint32_t i = 0; i < soState.stream.numDecls; ++i)
{
soState.stream.decl[i].attribSlot -= soState.offsetAttribs;
}
}
pJitMgr->SetupNewModule();
StreamOutJit theJit(pJitMgr);
HANDLE hFunc = theJit.Create(soState);
return JitStreamoutFunc(hJitMgr, hFunc);
}


@ -0,0 +1,94 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file streamout_jit.h
*
* @brief Definition of the streamout jitter
*
* Notes:
*
******************************************************************************/
#pragma once
#include "common/formats.h"
#include "core/state.h"
//////////////////////////////////////////////////////////////////////////
/// STREAMOUT_DECL - Stream decl
//////////////////////////////////////////////////////////////////////////
struct STREAMOUT_DECL
{
// Buffer that stream maps to.
DWORD bufferIndex;
// attribute to stream
uint32_t attribSlot;
// attribute component mask
uint32_t componentMask;
// indicates this decl is a hole
bool hole;
};
//////////////////////////////////////////////////////////////////////////
/// STREAMOUT_STREAM - Stream decls
//////////////////////////////////////////////////////////////////////////
struct STREAMOUT_STREAM
{
// number of decls for this stream
uint32_t numDecls;
// array of numDecls decls
STREAMOUT_DECL decl[128];
};
//////////////////////////////////////////////////////////////////////////
/// State required for streamout jit
//////////////////////////////////////////////////////////////////////////
struct STREAMOUT_COMPILE_STATE
{
// number of verts per primitive
uint32_t numVertsPerPrim;
uint32_t offsetAttribs; ///< attrib offset to subtract from all STREAMOUT_DECL::attribSlot values.
uint64_t streamMask;
// stream decls
STREAMOUT_STREAM stream;
bool operator==(const STREAMOUT_COMPILE_STATE &other) const
{
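// Note: offsetAttribs and streamMask do not participate in this comparison.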
if (numVertsPerPrim != other.numVertsPerPrim) return false;
if (stream.numDecls != other.stream.numDecls) return false;
for (uint32_t i = 0; i < stream.numDecls; ++i)
{
if (stream.decl[i].bufferIndex != other.stream.decl[i].bufferIndex) return false;
if (stream.decl[i].attribSlot != other.stream.decl[i].attribSlot) return false;
if (stream.decl[i].componentMask != other.stream.decl[i].componentMask) return false;
if (stream.decl[i].hole != other.stream.decl[i].hole) return false;
}
return true;
}
};


@ -0,0 +1,287 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file ClearTile.cpp
*
* @brief Functionality for ClearTile. StoreHotTileClear clears a single macro
* tile in the destination.
*
******************************************************************************/
#include "common/os.h"
#include "core/context.h"
#include "common/formats.h"
#include "memory/TilingFunctions.h"
#include "memory/tilingtraits.h"
#include "memory/Convert.h"
typedef void(*PFN_STORE_TILES_CLEAR)(const FLOAT*, SWR_SURFACE_STATE*, UINT, UINT);
//////////////////////////////////////////////////////////////////////////
/// Clear Raster Tile Function Tables.
//////////////////////////////////////////////////////////////////////////
static PFN_STORE_TILES_CLEAR sStoreTilesClearColorTable[NUM_SWR_FORMATS];
static PFN_STORE_TILES_CLEAR sStoreTilesClearDepthTable[NUM_SWR_FORMATS];
//////////////////////////////////////////////////////////////////////////
/// StoreRasterTileClear
//////////////////////////////////////////////////////////////////////////
template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
struct StoreRasterTileClear
{
//////////////////////////////////////////////////////////////////////////
/// @brief Stores an 8x8 raster tile to the destination surface.
/// @param dstFormattedColor - Clear color, pre-converted to the destination format.
/// @param dstBytesPerPixel - Size of one destination pixel in bytes.
/// @param pDstSurface - Destination surface state
/// @param x, y - Coordinates to raster tile.
INLINE static void StoreClear(
const BYTE* dstFormattedColor,
UINT dstBytesPerPixel,
SWR_SURFACE_STATE* pDstSurface,
UINT x, UINT y) // (x, y) pixel coordinate to start of raster tile.
{
// Compute destination address for raster tile.
BYTE* pDstTile = (BYTE*)pDstSurface->pBaseAddress +
(y * pDstSurface->pitch) + (x * dstBytesPerPixel);
// start of first row
BYTE* pDst = pDstTile;
UINT dstBytesPerRow = 0;
// For each raster tile pixel in row 0 (rx, 0)
for (UINT rx = 0; (rx < KNOB_TILE_X_DIM) && ((x + rx) < pDstSurface->width); ++rx)
{
memcpy(pDst, dstFormattedColor, dstBytesPerPixel);
// Increment pointer to next pixel in row.
pDst += dstBytesPerPixel;
dstBytesPerRow += dstBytesPerPixel;
}
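// dstBytesPerRow now holds the clipped byte width of row 0; the remaining
// rows below are replicated from row 0 with one memcpy each.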
// start of second row
pDst = pDstTile + pDstSurface->pitch;
// For each remaining row in the rest of the raster tile
for (UINT ry = 1; (ry < KNOB_TILE_Y_DIM) && ((y + ry) < pDstSurface->height); ++ry)
{
// copy row
memcpy(pDst, pDstTile, dstBytesPerRow);
// Increment pointer to first pixel in next row.
pDst += pDstSurface->pitch;
}
}
};
//////////////////////////////////////////////////////////////////////////
/// StoreMacroTileClear - Stores a macro tile clear to its raster tiles.
//////////////////////////////////////////////////////////////////////////
template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
struct StoreMacroTileClear
{
//////////////////////////////////////////////////////////////////////////
/// @brief Stores a macrotile to the destination surface.
/// @param pColor - Pointer to color to write to pixels.
/// @param pDstSurface - Destination surface state
/// @param x, y - Coordinates to macro tile
static void StoreClear(
const FLOAT *pColor,
SWR_SURFACE_STATE* pDstSurface,
UINT x, UINT y)
{
UINT dstBytesPerPixel = (FormatTraits<DstFormat>::bpp / 8);
BYTE dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel
FLOAT srcColor[4];
for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
{
srcColor[comp] = pColor[FormatTraits<DstFormat>::swizzle(comp)];
}
// convert the clear color into a single formatted destination pixel
ConvertPixelFromFloat<DstFormat>(dstFormattedColor, srcColor);
// Store each raster tile from the hot tile to the destination surface.
// TODO: Put in check for partial coverage on x/y -- SWR_ASSERT if it happens.
// Intent is for this function to only handle full tiles.
for (UINT row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
{
for (UINT col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
{
StoreRasterTileClear<SrcFormat, DstFormat>::StoreClear(dstFormattedColor, dstBytesPerPixel, pDstSurface, (x + col), (y + row));
}
}
}
};
//////////////////////////////////////////////////////////////////////////
/// @brief Writes clear color to every pixel of a render surface
/// @param pDstSurface - Destination surface state
/// @param renderTargetIndex - Index to destination render target
/// @param x, y - Coordinates of the macro tile
/// @param pClearColor - Pointer to clear color
void StoreHotTileClear(
SWR_SURFACE_STATE *pDstSurface,
SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
UINT x,
UINT y,
const float* pClearColor)
{
PFN_STORE_TILES_CLEAR pfnStoreTilesClear = NULL;
SWR_ASSERT(renderTargetIndex != SWR_ATTACHMENT_STENCIL); ///@todo Not supported yet.
if (renderTargetIndex != SWR_ATTACHMENT_DEPTH)
{
pfnStoreTilesClear = sStoreTilesClearColorTable[pDstSurface->format];
}
else
{
pfnStoreTilesClear = sStoreTilesClearDepthTable[pDstSurface->format];
}
SWR_ASSERT(pfnStoreTilesClear != NULL);
// Store a macro tile.
/// @todo Once all formats are supported this if check can go away; it is here near-term to keep making progress.
if (pfnStoreTilesClear != NULL)
{
pfnStoreTilesClear(pClearColor, pDstSurface, x, y);
}
}
//////////////////////////////////////////////////////////////////////////
/// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
#define INIT_STORE_TILES_CLEAR_COLOR_TABLE() \
memset(sStoreTilesClearColorTable, 0, sizeof(sStoreTilesClearColorTable)); \
\
sStoreTilesClearColorTable[R32G32B32A32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::StoreClear; \
sStoreTilesClearColorTable[R32G32B32A32_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32A32_SINT>::StoreClear; \
sStoreTilesClearColorTable[R32G32B32A32_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32A32_UINT>::StoreClear; \
sStoreTilesClearColorTable[R32G32B32X32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::StoreClear; \
sStoreTilesClearColorTable[R32G32B32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32_FLOAT>::StoreClear; \
sStoreTilesClearColorTable[R32G32B32_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32_SINT>::StoreClear; \
sStoreTilesClearColorTable[R32G32B32_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32_UINT>::StoreClear; \
sStoreTilesClearColorTable[R16G16B16A16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::StoreClear; \
sStoreTilesClearColorTable[R16G16B16A16_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::StoreClear; \
sStoreTilesClearColorTable[R16G16B16A16_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_SINT>::StoreClear; \
sStoreTilesClearColorTable[R16G16B16A16_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_UINT>::StoreClear; \
sStoreTilesClearColorTable[R16G16B16A16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::StoreClear; \
sStoreTilesClearColorTable[R32G32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32_FLOAT>::StoreClear; \
sStoreTilesClearColorTable[R32G32_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32_SINT>::StoreClear; \
sStoreTilesClearColorTable[R32G32_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32_UINT>::StoreClear; \
sStoreTilesClearColorTable[R16G16B16X16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::StoreClear; \
sStoreTilesClearColorTable[R16G16B16X16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::StoreClear; \
sStoreTilesClearColorTable[B8G8R8A8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::StoreClear; \
sStoreTilesClearColorTable[B8G8R8A8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::StoreClear; \
sStoreTilesClearColorTable[R10G10B10A2_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreClear; \
sStoreTilesClearColorTable[R10G10B10A2_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreClear; \
sStoreTilesClearColorTable[R10G10B10A2_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreClear; \
sStoreTilesClearColorTable[R8G8B8A8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::StoreClear; \
sStoreTilesClearColorTable[R8G8B8A8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::StoreClear; \
sStoreTilesClearColorTable[R8G8B8A8_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::StoreClear; \
sStoreTilesClearColorTable[R8G8B8A8_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_SINT>::StoreClear; \
sStoreTilesClearColorTable[R8G8B8A8_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_UINT>::StoreClear; \
sStoreTilesClearColorTable[R16G16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_UNORM>::StoreClear; \
sStoreTilesClearColorTable[R16G16_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_SNORM>::StoreClear; \
sStoreTilesClearColorTable[R16G16_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_SINT>::StoreClear; \
sStoreTilesClearColorTable[R16G16_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_UINT>::StoreClear; \
sStoreTilesClearColorTable[R16G16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_FLOAT>::StoreClear; \
sStoreTilesClearColorTable[B10G10R10A2_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreClear; \
sStoreTilesClearColorTable[B10G10R10A2_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreClear; \
sStoreTilesClearColorTable[R11G11B10_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreClear; \
sStoreTilesClearColorTable[R32_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32_SINT>::StoreClear; \
sStoreTilesClearColorTable[R32_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32_UINT>::StoreClear; \
sStoreTilesClearColorTable[R32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32_FLOAT>::StoreClear; \
sStoreTilesClearColorTable[A32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, A32_FLOAT>::StoreClear; \
sStoreTilesClearColorTable[B8G8R8X8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::StoreClear; \
sStoreTilesClearColorTable[B8G8R8X8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::StoreClear; \
sStoreTilesClearColorTable[R8G8B8X8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::StoreClear; \
sStoreTilesClearColorTable[R8G8B8X8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::StoreClear; \
sStoreTilesClearColorTable[B10G10R10X2_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreClear; \
sStoreTilesClearColorTable[B5G6R5_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G6R5_UNORM>::StoreClear; \
sStoreTilesClearColorTable[B5G6R5_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreClear; \
sStoreTilesClearColorTable[B5G5R5A1_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreClear; \
sStoreTilesClearColorTable[B5G5R5A1_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreClear; \
sStoreTilesClearColorTable[B4G4R4A4_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreClear; \
sStoreTilesClearColorTable[B4G4R4A4_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreClear; \
sStoreTilesClearColorTable[R8G8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_UNORM>::StoreClear; \
sStoreTilesClearColorTable[R8G8_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_SNORM>::StoreClear; \
sStoreTilesClearColorTable[R8G8_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_SINT>::StoreClear; \
sStoreTilesClearColorTable[R8G8_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_UINT>::StoreClear; \
sStoreTilesClearColorTable[R16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_UNORM>::StoreClear; \
sStoreTilesClearColorTable[R16_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_SNORM>::StoreClear; \
sStoreTilesClearColorTable[R16_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_SINT>::StoreClear; \
sStoreTilesClearColorTable[R16_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_UINT>::StoreClear; \
sStoreTilesClearColorTable[R16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_FLOAT>::StoreClear; \
sStoreTilesClearColorTable[A16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, A16_UNORM>::StoreClear; \
sStoreTilesClearColorTable[A16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, A16_FLOAT>::StoreClear; \
sStoreTilesClearColorTable[B5G5R5X1_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreClear; \
sStoreTilesClearColorTable[B5G5R5X1_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreClear; \
sStoreTilesClearColorTable[R8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_UNORM>::StoreClear; \
sStoreTilesClearColorTable[R8_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_SNORM>::StoreClear; \
sStoreTilesClearColorTable[R8_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_SINT>::StoreClear; \
sStoreTilesClearColorTable[R8_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_UINT>::StoreClear; \
sStoreTilesClearColorTable[A8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, A8_UNORM>::StoreClear; \
sStoreTilesClearColorTable[BC1_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC1_UNORM>::StoreClear; \
sStoreTilesClearColorTable[BC2_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC2_UNORM>::StoreClear; \
sStoreTilesClearColorTable[BC3_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC3_UNORM>::StoreClear; \
sStoreTilesClearColorTable[BC4_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC4_UNORM>::StoreClear; \
sStoreTilesClearColorTable[BC5_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC5_UNORM>::StoreClear; \
sStoreTilesClearColorTable[BC1_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC1_UNORM_SRGB>::StoreClear; \
sStoreTilesClearColorTable[BC2_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC2_UNORM_SRGB>::StoreClear; \
sStoreTilesClearColorTable[BC3_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC3_UNORM_SRGB>::StoreClear; \
sStoreTilesClearColorTable[R8G8B8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_UNORM>::StoreClear; \
sStoreTilesClearColorTable[R8G8B8_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_SNORM>::StoreClear; \
sStoreTilesClearColorTable[BC4_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC4_SNORM>::StoreClear; \
sStoreTilesClearColorTable[BC5_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC5_SNORM>::StoreClear; \
sStoreTilesClearColorTable[R16G16B16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_FLOAT>::StoreClear; \
sStoreTilesClearColorTable[R16G16B16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_UNORM>::StoreClear; \
sStoreTilesClearColorTable[R16G16B16_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_SNORM>::StoreClear; \
sStoreTilesClearColorTable[R8G8B8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::StoreClear; \
sStoreTilesClearColorTable[R16G16B16_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_UINT>::StoreClear; \
sStoreTilesClearColorTable[R16G16B16_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_SINT>::StoreClear; \
sStoreTilesClearColorTable[R10G10B10A2_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreClear; \
sStoreTilesClearColorTable[R10G10B10A2_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreClear; \
sStoreTilesClearColorTable[B10G10R10A2_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreClear; \
sStoreTilesClearColorTable[B10G10R10A2_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreClear; \
sStoreTilesClearColorTable[B10G10R10A2_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreClear; \
sStoreTilesClearColorTable[R8G8B8_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_UINT>::StoreClear; \
sStoreTilesClearColorTable[R8G8B8_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_SINT>::StoreClear; \
//////////////////////////////////////////////////////////////////////////
/// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
#define INIT_STORE_TILES_CLEAR_DEPTH_TABLE() \
memset(sStoreTilesClearDepthTable, 0, sizeof(sStoreTilesClearDepthTable)); \
\
sStoreTilesClearDepthTable[R32_FLOAT] = StoreMacroTileClear<R32_FLOAT, R32_FLOAT>::StoreClear; \
sStoreTilesClearDepthTable[R24_UNORM_X8_TYPELESS] = StoreMacroTileClear<R32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreClear; \
//////////////////////////////////////////////////////////////////////////
/// @brief Sets up tables for ClearTile
void InitSimClearTilesTable()
{
INIT_STORE_TILES_CLEAR_COLOR_TABLE();
INIT_STORE_TILES_CLEAR_DEPTH_TABLE();
}


@ -0,0 +1,698 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file Convert.h
*
* @brief Conversion utility functions
*
******************************************************************************/
#pragma once
#if defined(_WIN32)
// disable "potential divide by 0"
#pragma warning(disable: 4723)
#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Convert an IEEE 754 16-bit float (or a 10/11-bit small float
/// pre-shifted into the 16-bit layout) to a 32-bit single-precision float
/// @param val - 16-bit float
/// @todo Maybe move this outside of this file into a header?
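/// Worked example (illustrative): 0x3C00 has sign 0, exponent 15, mantissa 0;
/// rebiasing gives ((15 - 15 + 127) << 23) = 0x3F800000, i.e. 1.0f.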
static float ConvertSmallFloatTo32(UINT val)
{
UINT result;
if ((val & 0x7fff) == 0)
{
result = ((uint32_t)(val & 0x8000)) << 16;
}
else if ((val & 0x7c00) == 0x7c00)
{
result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
result |= ((uint32_t)val & 0x8000) << 16;
}
else
{
uint32_t sign = (val & 0x8000) << 16;
uint32_t mant = (val & 0x3ff) << 13;
uint32_t exp = (val >> 10) & 0x1f;
if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
{
mant <<= 1;
while (mant < (0x400 << 13))
{
exp--;
mant <<= 1;
}
mant &= (0x3ff << 13);
}
exp = ((exp - 15 + 127) & 0xff) << 23;
result = sign | exp | mant;
}
return *(float*)&result;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Convert an IEEE 754 32-bit single precision float to an
/// unsigned small float with 5 exponent bits and a variable
/// number of mantissa bits.
/// @param val - 32-bit float
/// @todo Maybe move this outside of this file into a header?
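/// Note on the magic constants below: 0x70 == 112 == (127 - 15), the
/// difference between the float32 exponent bias and the 5-bit exponent bias,
/// so source exponents <= 0x70 fall into the denormal/zero range and
/// 0x70 + 0x1E is the largest exponent that remains representable.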
template<UINT numMantissaBits>
static UINT Convert32ToSmallFloat(float val)
{
uint32_t sign, exp, mant;
uint32_t roundBits;
// Extract the sign, exponent, and mantissa
UINT uf = *(UINT*)&val;
sign = (uf & 0x80000000) >> 31;
exp = (uf & 0x7F800000) >> 23;
mant = uf & 0x007FFFFF;
// 10/11 bit floats are unsigned. Negative values are clamped to 0.
if (sign != 0)
{
exp = mant = 0;
}
// Check for out of range
else if ((exp == 0xFF) && (mant != 0)) // NaN
{
exp = 0x1F;
mant = 1 << numMantissaBits;
}
else if ((exp == 0xFF) && (mant == 0)) // INF
{
exp = 0x1F;
mant = 0;
}
else if (exp > (0x70 + 0x1E)) // Too big to represent
{
exp = 0x1Eu;
mant = (1 << numMantissaBits) - 1; // 0x3F for 6 bit mantissa.
}
else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
{
mant |= 0x00800000;
for (; exp <= 0x70; mant >>= 1, exp++)
;
exp = 0;
mant = mant >> (23 - numMantissaBits);
}
else if (exp < 0x66) // Too small to represent -> Zero
{
exp = 0;
mant = 0;
}
else
{
// Saves bits that will be shifted off for rounding
roundBits = mant & 0x1FFFu;
// convert exponent and mantissa to 16 bit format
exp = exp - 0x70u;
mant = mant >> (23 - numMantissaBits);
// Essentially RTZ, but round up if off by only 1 lsb
if (roundBits == 0x1FFFu)
{
mant++;
// check for overflow
if ((mant & (0x3 << numMantissaBits)) != 0) // 0x3 << numMantissaBits, e.g. 0x60 for a 5-bit mantissa
exp++;
// make sure only the needed bits are used
mant &= (1 << numMantissaBits) - 1;
}
}
UINT tmpVal = (exp << numMantissaBits) | mant;
return tmpVal;
}
#if KNOB_ARCH == KNOB_ARCH_AVX
//////////////////////////////////////////////////////////////////////////
/// @brief Convert an IEEE 754 32-bit single precision float to an
/// 16 bit float with 5 exponent bits and a variable
/// number of mantissa bits.
/// @param val - 32-bit float
/// @todo Maybe move this outside of this file into a header?
static uint16_t Convert32To16Float(float val)
{
uint32_t sign, exp, mant;
uint32_t roundBits;
// Extract the sign, exponent, and mantissa
uint32_t uf = *(uint32_t*)&val;
sign = (uf & 0x80000000) >> 31;
exp = (uf & 0x7F800000) >> 23;
mant = uf & 0x007FFFFF;
// Check for out of range
if (std::isnan(val))
{
exp = 0x1F;
mant = 0x200;
sign = 1; // set the sign bit for NANs
}
else if (std::isinf(val))
{
exp = 0x1f;
mant = 0x0;
}
else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
{
exp = 0x1E;
mant = 0x3FF;
}
else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
{
mant |= 0x00800000;
for (; exp <= 0x70; mant >>= 1, exp++)
;
exp = 0;
mant = mant >> 13;
}
else if (exp < 0x66) // Too small to represent -> Zero
{
exp = 0;
mant = 0;
}
else
{
// Saves bits that will be shifted off for rounding
roundBits = mant & 0x1FFFu;
// convert exponent and mantissa to 16 bit format
exp = exp - 0x70;
mant = mant >> 13;
// Essentially RTZ, but round up if off by only 1 lsb
if (roundBits == 0x1FFFu)
{
mant++;
// check for overflow
if ((mant & 0xC00u) != 0)
exp++;
// make sure only the needed bits are used
mant &= 0x3FF;
}
}
uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
return (uint16_t)tmpVal;
}
#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Convert a float hot-tile pixel to the destination format and store it.
/// @param pDstPixel - Pointer to destination pixel.
/// @param srcPixel - Pointer to source pixel (pre-swizzled according to dest).
template<SWR_FORMAT DstFormat>
static void ConvertPixelFromFloat(
BYTE* pDstPixel,
const float srcPixel[4])
{
UINT outColor[4]; // typeless bits
// Store component
for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
{
SWR_TYPE type = FormatTraits<DstFormat>::GetType(comp);
float src = srcPixel[comp];
switch (type)
{
case SWR_TYPE_UNORM:
{
// Force NaN to 0. Per the IEEE standard, comparisons involving NaN always evaluate to false.
src = (src != src) ? 0.0f : src;
// Clamp [0, 1]
src = std::max(src, 0.0f);
src = std::min(src, 1.0f);
// SRGB
if (FormatTraits<DstFormat>::isSRGB && comp != 3)
{
src = (src <= 0.0031308f) ? (12.92f * src) : (1.055f * powf(src, (1.0f / 2.4f)) - 0.055f);
}
// Float scale to integer scale.
UINT scale = (1 << FormatTraits<DstFormat>::GetBPC(comp)) - 1;
src = (float)scale * src;
src = roundf(src);
outColor[comp] = (UINT)src; // Drop fractional part.
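// e.g. an 8-bit UNORM component gives scale == 255, so src == 0.5f
// stores roundf(127.5f) == 128 (illustrative).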
break;
}
case SWR_TYPE_SNORM:
{
SWR_ASSERT(!FormatTraits<DstFormat>::isSRGB);
// Force NaN to 0. Per the IEEE standard, comparisons involving NaN always evaluate to false.
src = (src != src) ? 0.0f : src;
// Clamp [-1, 1]
src = std::max(src, -1.0f);
src = std::min(src, 1.0f);
// Float scale to integer scale.
UINT scale = (1 << (FormatTraits<DstFormat>::GetBPC(comp) - 1)) - 1;
src = (float)scale * src;
// Round
src += (src >= 0) ? 0.5f : -0.5f;
INT out = (INT)src;
outColor[comp] = *(UINT*)&out;
break;
}
case SWR_TYPE_UINT:
{
///@note The *(UINT*)& is currently necessary as the hot tile appears to always be float.
// However, the number in the hot tile should be an unsigned integer, so the cast
// preserves the bits instead of doing a float -> integer conversion.
if (FormatTraits<DstFormat>::GetBPC(comp) == 32)
{
outColor[comp] = *(UINT*)&src;
}
else
{
outColor[comp] = *(UINT*)&src;
UINT max = (1 << FormatTraits<DstFormat>::GetBPC(comp)) - 1; // 2^numBits - 1
outColor[comp] = std::min(max, outColor[comp]);
}
break;
}
case SWR_TYPE_SINT:
{
if (FormatTraits<DstFormat>::GetBPC(comp) == 32)
{
outColor[comp] = *(UINT*)&src;
}
else
{
INT out = *(INT*)&src; // Hot tile format is SINT?
INT max = (1 << (FormatTraits<DstFormat>::GetBPC(comp) - 1)) - 1;
INT min = -1 - max;
///@note The output is an unsigned integer (bag of bits), so clamping is performed
// here based on the range of the output component, and the sign bit is added
// manually in the appropriate spot. Maybe a better way?
out = std::max(out, min);
out = std::min(out, max);
outColor[comp] = *(UINT*)&out;
}
break;
}
case SWR_TYPE_FLOAT:
{
if (FormatTraits<DstFormat>::GetBPC(comp) == 16)
{
// Convert from 32-bit float to 16-bit float using _mm_cvtps_ph
// @todo 16bit float instruction support is orthogonal to avx support. need to
// add check for F16C support instead.
#if KNOB_ARCH == KNOB_ARCH_AVX2
__m128 src128 = _mm_set1_ps(src);
__m128i srci128 = _mm_cvtps_ph(src128, _MM_FROUND_TRUNC);
UINT value = _mm_extract_epi16(srci128, 0);
#else
UINT value = Convert32To16Float(src);
#endif
outColor[comp] = value;
}
else if (FormatTraits<DstFormat>::GetBPC(comp) == 11)
{
outColor[comp] = Convert32ToSmallFloat<6>(src);
}
else if (FormatTraits<DstFormat>::GetBPC(comp) == 10)
{
outColor[comp] = Convert32ToSmallFloat<5>(src);
}
else
{
outColor[comp] = *(UINT*)&src;
}
break;
}
default:
SWR_ASSERT(0);
break;
}
}
typename FormatTraits<DstFormat>::FormatT* pPixel = (typename FormatTraits<DstFormat>::FormatT*)pDstPixel;
switch (FormatTraits<DstFormat>::numComps)
{
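// Deliberate case fall-through: a 4-component format writes a, b, g and r;
// formats with fewer components skip the leading cases.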
case 4:
pPixel->a = outColor[3];
case 3:
pPixel->b = outColor[2];
case 2:
pPixel->g = outColor[1];
case 1:
pPixel->r = outColor[0];
break;
default:
SWR_ASSERT(0);
}
}
//////////////////////////////////////////////////////////////////////////
/// @brief Convert pixel in any format to float32
/// @param dstPixel - Destination pixel as four floats.
/// @param pSrc - Pointer to source pixel
template<SWR_FORMAT SrcFormat>
INLINE static void ConvertPixelToFloat(
float dstPixel[4],
const BYTE* pSrc)
{
UINT srcColor[4]; // typeless bits
// unpack src pixel
typename FormatTraits<SrcFormat>::FormatT* pPixel = (typename FormatTraits<SrcFormat>::FormatT*)pSrc;
// apply format defaults
for (uint32_t comp = 0; comp < 4; ++comp)
{
uint32_t def = FormatTraits<SrcFormat>::GetDefault(comp);
dstPixel[comp] = *(float*)&def;
}
// load format data
switch (FormatTraits<SrcFormat>::numComps)
{
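// Case fall-through mirrors the store path: only the components the
// format actually carries are read.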
case 4:
srcColor[3] = pPixel->a;
case 3:
srcColor[2] = pPixel->b;
case 2:
srcColor[1] = pPixel->g;
case 1:
srcColor[0] = pPixel->r;
break;
default:
SWR_ASSERT(0);
}
// Convert components
for (UINT comp = 0; comp < FormatTraits<SrcFormat>::numComps; ++comp)
{
SWR_TYPE type = FormatTraits<SrcFormat>::GetType(comp);
UINT src = srcColor[comp];
switch (type)
{
case SWR_TYPE_UNORM:
{
float dst;
if (FormatTraits<SrcFormat>::isSRGB && comp != 3)
{
dst = *(float*)&srgb8Table[src];
}
else
{
// component sizes > 16 must use fp divide to maintain ulp requirements
if (FormatTraits<SrcFormat>::GetBPC(comp) > 16)
{
dst = (float)src / (float)((1 << FormatTraits<SrcFormat>::GetBPC(comp)) - 1);
}
else
{
const float scale = (1.0f / (float)((1 << FormatTraits<SrcFormat>::GetBPC(comp)) - 1));
dst = (float)src * scale;
}
}
dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = dst;
break;
}
case SWR_TYPE_SNORM:
{
SWR_ASSERT(!FormatTraits<SrcFormat>::isSRGB);
float dst;
if (src == 0x10)
{
dst = -1.0f;
}
else
{
switch (FormatTraits<SrcFormat>::GetBPC(comp))
{
case 8:
dst = (float)((int8_t)src);
break;
case 16:
dst = (float)((int16_t)src);
break;
case 32:
dst = (float)((int32_t)src);
break;
default:
assert(0 && "attempted to load from SNORM with unsupported bpc");
dst = 0.0f;
break;
}
dst = dst * (1.0f / ((1 << (FormatTraits<SrcFormat>::GetBPC(comp) - 1)) - 1));
}
dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = dst;
break;
}
case SWR_TYPE_UINT:
{
UINT dst = (UINT)src;
dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst;
break;
}
case SWR_TYPE_SINT:
{
int dst;
switch (FormatTraits<SrcFormat>::GetBPC(comp))
{
case 8:
dst = (int8_t)src;
break;
case 16:
dst = (int16_t)src;
break;
case 32:
dst = (int32_t)src;
break;
default:
assert(0 && "attempted to load from SINT with unsupported bpc");
dst = 0;
break;
}
dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst;
break;
}
case SWR_TYPE_FLOAT:
{
float dst;
if (FormatTraits<SrcFormat>::GetBPC(comp) == 16)
{
#if KNOB_ARCH == KNOB_ARCH_AVX2
// Convert from 16-bit float to 32-bit float using _mm_cvtph_ps
// @todo 16bit float instruction support is orthogonal to avx support. need to
// add check for F16C support instead.
__m128i src128 = _mm_set1_epi32(src);
__m128 res = _mm_cvtph_ps(src128);
_mm_store_ss(&dst, res);
#else
dst = ConvertSmallFloatTo32(src);
#endif
}
else if (FormatTraits<SrcFormat>::GetBPC(comp) == 11)
{
dst = ConvertSmallFloatTo32(src << 4);
}
else if (FormatTraits<SrcFormat>::GetBPC(comp) == 10)
{
dst = ConvertSmallFloatTo32(src << 5);
}
else
{
dst = *(float*)&src;
}
dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst;
break;
}
default:
SWR_ASSERT(0);
break;
}
}
}
// non-templated version of conversion functions
INLINE static void ConvertPixelFromFloat(
SWR_FORMAT format,
uint8_t* pDst,
const float srcPixel[4])
{
switch (format)
{
case R32G32B32A32_FLOAT: ConvertPixelFromFloat<R32G32B32A32_FLOAT>(pDst, srcPixel); break;
case R32G32B32A32_SINT: ConvertPixelFromFloat<R32G32B32A32_SINT>(pDst, srcPixel); break;
case R32G32B32A32_UINT: ConvertPixelFromFloat<R32G32B32A32_UINT>(pDst, srcPixel); break;
case R32G32B32X32_FLOAT: ConvertPixelFromFloat<R32G32B32X32_FLOAT>(pDst, srcPixel); break;
case R32G32B32A32_SSCALED: ConvertPixelFromFloat<R32G32B32A32_SSCALED>(pDst, srcPixel); break;
case R32G32B32A32_USCALED: ConvertPixelFromFloat<R32G32B32A32_USCALED>(pDst, srcPixel); break;
case R32G32B32_FLOAT: ConvertPixelFromFloat<R32G32B32_FLOAT>(pDst, srcPixel); break;
case R32G32B32_SINT: ConvertPixelFromFloat<R32G32B32_SINT>(pDst, srcPixel); break;
case R32G32B32_UINT: ConvertPixelFromFloat<R32G32B32_UINT>(pDst, srcPixel); break;
case R32G32B32_SSCALED: ConvertPixelFromFloat<R32G32B32_SSCALED>(pDst, srcPixel); break;
case R32G32B32_USCALED: ConvertPixelFromFloat<R32G32B32_USCALED>(pDst, srcPixel); break;
case R16G16B16A16_UNORM: ConvertPixelFromFloat<R16G16B16A16_UNORM>(pDst, srcPixel); break;
case R16G16B16A16_SNORM: ConvertPixelFromFloat<R16G16B16A16_SNORM>(pDst, srcPixel); break;
case R16G16B16A16_SINT: ConvertPixelFromFloat<R16G16B16A16_SINT>(pDst, srcPixel); break;
case R16G16B16A16_UINT: ConvertPixelFromFloat<R16G16B16A16_UINT>(pDst, srcPixel); break;
case R16G16B16A16_FLOAT: ConvertPixelFromFloat<R16G16B16A16_FLOAT>(pDst, srcPixel); break;
case R32G32_FLOAT: ConvertPixelFromFloat<R32G32_FLOAT>(pDst, srcPixel); break;
case R32G32_SINT: ConvertPixelFromFloat<R32G32_SINT>(pDst, srcPixel); break;
case R32G32_UINT: ConvertPixelFromFloat<R32G32_UINT>(pDst, srcPixel); break;
case R32_FLOAT_X8X24_TYPELESS: ConvertPixelFromFloat<R32_FLOAT_X8X24_TYPELESS>(pDst, srcPixel); break;
case R16G16B16X16_UNORM: ConvertPixelFromFloat<R16G16B16X16_UNORM>(pDst, srcPixel); break;
case R16G16B16X16_FLOAT: ConvertPixelFromFloat<R16G16B16X16_FLOAT>(pDst, srcPixel); break;
case R16G16B16A16_SSCALED: ConvertPixelFromFloat<R16G16B16A16_SSCALED>(pDst, srcPixel); break;
case R16G16B16A16_USCALED: ConvertPixelFromFloat<R16G16B16A16_USCALED>(pDst, srcPixel); break;
case R32G32_SSCALED: ConvertPixelFromFloat<R32G32_SSCALED>(pDst, srcPixel); break;
case R32G32_USCALED: ConvertPixelFromFloat<R32G32_USCALED>(pDst, srcPixel); break;
case R32_FLOAT_X8X24_TYPELESS_LD: ConvertPixelFromFloat<R32_FLOAT_X8X24_TYPELESS_LD>(pDst, srcPixel); break;
case B8G8R8A8_UNORM: ConvertPixelFromFloat<B8G8R8A8_UNORM>(pDst, srcPixel); break;
case B8G8R8A8_UNORM_SRGB: ConvertPixelFromFloat<B8G8R8A8_UNORM_SRGB>(pDst, srcPixel); break;
case R10G10B10A2_UNORM: ConvertPixelFromFloat<R10G10B10A2_UNORM>(pDst, srcPixel); break;
case R10G10B10A2_UNORM_SRGB: ConvertPixelFromFloat<R10G10B10A2_UNORM_SRGB>(pDst, srcPixel); break;
case R10G10B10A2_UINT: ConvertPixelFromFloat<R10G10B10A2_UINT>(pDst, srcPixel); break;
case R8G8B8A8_UNORM: ConvertPixelFromFloat<R8G8B8A8_UNORM>(pDst, srcPixel); break;
case R8G8B8A8_UNORM_SRGB: ConvertPixelFromFloat<R8G8B8A8_UNORM_SRGB>(pDst, srcPixel); break;
case R8G8B8A8_SNORM: ConvertPixelFromFloat<R8G8B8A8_SNORM>(pDst, srcPixel); break;
case R8G8B8A8_SINT: ConvertPixelFromFloat<R8G8B8A8_SINT>(pDst, srcPixel); break;
case R8G8B8A8_UINT: ConvertPixelFromFloat<R8G8B8A8_UINT>(pDst, srcPixel); break;
case R16G16_UNORM: ConvertPixelFromFloat<R16G16_UNORM>(pDst, srcPixel); break;
case R16G16_SNORM: ConvertPixelFromFloat<R16G16_SNORM>(pDst, srcPixel); break;
case R16G16_SINT: ConvertPixelFromFloat<R16G16_SINT>(pDst, srcPixel); break;
case R16G16_UINT: ConvertPixelFromFloat<R16G16_UINT>(pDst, srcPixel); break;
case R16G16_FLOAT: ConvertPixelFromFloat<R16G16_FLOAT>(pDst, srcPixel); break;
case B10G10R10A2_UNORM: ConvertPixelFromFloat<B10G10R10A2_UNORM>(pDst, srcPixel); break;
case B10G10R10A2_UNORM_SRGB: ConvertPixelFromFloat<B10G10R10A2_UNORM_SRGB>(pDst, srcPixel); break;
case R11G11B10_FLOAT: ConvertPixelFromFloat<R11G11B10_FLOAT>(pDst, srcPixel); break;
case R32_SINT: ConvertPixelFromFloat<R32_SINT>(pDst, srcPixel); break;
case R32_UINT: ConvertPixelFromFloat<R32_UINT>(pDst, srcPixel); break;
case R32_FLOAT: ConvertPixelFromFloat<R32_FLOAT>(pDst, srcPixel); break;
case R24_UNORM_X8_TYPELESS: ConvertPixelFromFloat<R24_UNORM_X8_TYPELESS>(pDst, srcPixel); break;
case R24_UNORM_X8_TYPELESS_LD: ConvertPixelFromFloat<R24_UNORM_X8_TYPELESS_LD>(pDst, srcPixel); break;
case A32_FLOAT: ConvertPixelFromFloat<A32_FLOAT>(pDst, srcPixel); break;
case B8G8R8X8_UNORM: ConvertPixelFromFloat<B8G8R8X8_UNORM>(pDst, srcPixel); break;
case B8G8R8X8_UNORM_SRGB: ConvertPixelFromFloat<B8G8R8X8_UNORM_SRGB>(pDst, srcPixel); break;
case R8G8B8X8_UNORM: ConvertPixelFromFloat<R8G8B8X8_UNORM>(pDst, srcPixel); break;
case R8G8B8X8_UNORM_SRGB: ConvertPixelFromFloat<R8G8B8X8_UNORM_SRGB>(pDst, srcPixel); break;
case R9G9B9E5_SHAREDEXP: ConvertPixelFromFloat<R9G9B9E5_SHAREDEXP>(pDst, srcPixel); break;
case B10G10R10X2_UNORM: ConvertPixelFromFloat<B10G10R10X2_UNORM>(pDst, srcPixel); break;
case R10G10B10X2_USCALED: ConvertPixelFromFloat<R10G10B10X2_USCALED>(pDst, srcPixel); break;
case R8G8B8A8_SSCALED: ConvertPixelFromFloat<R8G8B8A8_SSCALED>(pDst, srcPixel); break;
case R8G8B8A8_USCALED: ConvertPixelFromFloat<R8G8B8A8_USCALED>(pDst, srcPixel); break;
case R16G16_SSCALED: ConvertPixelFromFloat<R16G16_SSCALED>(pDst, srcPixel); break;
case R16G16_USCALED: ConvertPixelFromFloat<R16G16_USCALED>(pDst, srcPixel); break;
case R32_SSCALED: ConvertPixelFromFloat<R32_SSCALED>(pDst, srcPixel); break;
case R32_USCALED: ConvertPixelFromFloat<R32_USCALED>(pDst, srcPixel); break;
case B5G6R5_UNORM: ConvertPixelFromFloat<B5G6R5_UNORM>(pDst, srcPixel); break;
case B5G6R5_UNORM_SRGB: ConvertPixelFromFloat<B5G6R5_UNORM_SRGB>(pDst, srcPixel); break;
case B5G5R5A1_UNORM: ConvertPixelFromFloat<B5G5R5A1_UNORM>(pDst, srcPixel); break;
case B5G5R5A1_UNORM_SRGB: ConvertPixelFromFloat<B5G5R5A1_UNORM_SRGB>(pDst, srcPixel); break;
case B4G4R4A4_UNORM: ConvertPixelFromFloat<B4G4R4A4_UNORM>(pDst, srcPixel); break;
case B4G4R4A4_UNORM_SRGB: ConvertPixelFromFloat<B4G4R4A4_UNORM_SRGB>(pDst, srcPixel); break;
case R8G8_UNORM: ConvertPixelFromFloat<R8G8_UNORM>(pDst, srcPixel); break;
case R8G8_SNORM: ConvertPixelFromFloat<R8G8_SNORM>(pDst, srcPixel); break;
case R8G8_SINT: ConvertPixelFromFloat<R8G8_SINT>(pDst, srcPixel); break;
case R8G8_UINT: ConvertPixelFromFloat<R8G8_UINT>(pDst, srcPixel); break;
case R16_UNORM: ConvertPixelFromFloat<R16_UNORM>(pDst, srcPixel); break;
case R16_SNORM: ConvertPixelFromFloat<R16_SNORM>(pDst, srcPixel); break;
case R16_SINT: ConvertPixelFromFloat<R16_SINT>(pDst, srcPixel); break;
case R16_UINT: ConvertPixelFromFloat<R16_UINT>(pDst, srcPixel); break;
case R16_FLOAT: ConvertPixelFromFloat<R16_FLOAT>(pDst, srcPixel); break;
case A16_UNORM: ConvertPixelFromFloat<A16_UNORM>(pDst, srcPixel); break;
case A16_FLOAT: ConvertPixelFromFloat<A16_FLOAT>(pDst, srcPixel); break;
case B5G5R5X1_UNORM: ConvertPixelFromFloat<B5G5R5X1_UNORM>(pDst, srcPixel); break;
case B5G5R5X1_UNORM_SRGB: ConvertPixelFromFloat<B5G5R5X1_UNORM_SRGB>(pDst, srcPixel); break;
case R8G8_SSCALED: ConvertPixelFromFloat<R8G8_SSCALED>(pDst, srcPixel); break;
case R8G8_USCALED: ConvertPixelFromFloat<R8G8_USCALED>(pDst, srcPixel); break;
case R16_SSCALED: ConvertPixelFromFloat<R16_SSCALED>(pDst, srcPixel); break;
case R16_USCALED: ConvertPixelFromFloat<R16_USCALED>(pDst, srcPixel); break;
case R8_UNORM: ConvertPixelFromFloat<R8_UNORM>(pDst, srcPixel); break;
case R8_SNORM: ConvertPixelFromFloat<R8_SNORM>(pDst, srcPixel); break;
case R8_SINT: ConvertPixelFromFloat<R8_SINT>(pDst, srcPixel); break;
case R8_UINT: ConvertPixelFromFloat<R8_UINT>(pDst, srcPixel); break;
case A8_UNORM: ConvertPixelFromFloat<A8_UNORM>(pDst, srcPixel); break;
case R8_SSCALED: ConvertPixelFromFloat<R8_SSCALED>(pDst, srcPixel); break;
case R8_USCALED: ConvertPixelFromFloat<R8_USCALED>(pDst, srcPixel); break;
case YCRCB_SWAPUVY: ConvertPixelFromFloat<YCRCB_SWAPUVY>(pDst, srcPixel); break;
case BC1_UNORM: ConvertPixelFromFloat<BC1_UNORM>(pDst, srcPixel); break;
case BC2_UNORM: ConvertPixelFromFloat<BC2_UNORM>(pDst, srcPixel); break;
case BC3_UNORM: ConvertPixelFromFloat<BC3_UNORM>(pDst, srcPixel); break;
case BC4_UNORM: ConvertPixelFromFloat<BC4_UNORM>(pDst, srcPixel); break;
case BC5_UNORM: ConvertPixelFromFloat<BC5_UNORM>(pDst, srcPixel); break;
case BC1_UNORM_SRGB: ConvertPixelFromFloat<BC1_UNORM_SRGB>(pDst, srcPixel); break;
case BC2_UNORM_SRGB: ConvertPixelFromFloat<BC2_UNORM_SRGB>(pDst, srcPixel); break;
case BC3_UNORM_SRGB: ConvertPixelFromFloat<BC3_UNORM_SRGB>(pDst, srcPixel); break;
case YCRCB_SWAPUV: ConvertPixelFromFloat<YCRCB_SWAPUV>(pDst, srcPixel); break;
case R8G8B8_UNORM: ConvertPixelFromFloat<R8G8B8_UNORM>(pDst, srcPixel); break;
case R8G8B8_SNORM: ConvertPixelFromFloat<R8G8B8_SNORM>(pDst, srcPixel); break;
case R8G8B8_SSCALED: ConvertPixelFromFloat<R8G8B8_SSCALED>(pDst, srcPixel); break;
case R8G8B8_USCALED: ConvertPixelFromFloat<R8G8B8_USCALED>(pDst, srcPixel); break;
case BC4_SNORM: ConvertPixelFromFloat<BC4_SNORM>(pDst, srcPixel); break;
case BC5_SNORM: ConvertPixelFromFloat<BC5_SNORM>(pDst, srcPixel); break;
case R16G16B16_FLOAT: ConvertPixelFromFloat<R16G16B16_FLOAT>(pDst, srcPixel); break;
case R16G16B16_UNORM: ConvertPixelFromFloat<R16G16B16_UNORM>(pDst, srcPixel); break;
case R16G16B16_SNORM: ConvertPixelFromFloat<R16G16B16_SNORM>(pDst, srcPixel); break;
case R16G16B16_SSCALED: ConvertPixelFromFloat<R16G16B16_SSCALED>(pDst, srcPixel); break;
case R16G16B16_USCALED: ConvertPixelFromFloat<R16G16B16_USCALED>(pDst, srcPixel); break;
case BC7_UNORM: ConvertPixelFromFloat<BC7_UNORM>(pDst, srcPixel); break;
case BC7_UNORM_SRGB: ConvertPixelFromFloat<BC7_UNORM_SRGB>(pDst, srcPixel); break;
case R8G8B8_UNORM_SRGB: ConvertPixelFromFloat<R8G8B8_UNORM_SRGB>(pDst, srcPixel); break;
case R16G16B16_UINT: ConvertPixelFromFloat<R16G16B16_UINT>(pDst, srcPixel); break;
case R16G16B16_SINT: ConvertPixelFromFloat<R16G16B16_SINT>(pDst, srcPixel); break;
case R10G10B10A2_SNORM: ConvertPixelFromFloat<R10G10B10A2_SNORM>(pDst, srcPixel); break;
case R10G10B10A2_USCALED: ConvertPixelFromFloat<R10G10B10A2_USCALED>(pDst, srcPixel); break;
case R10G10B10A2_SSCALED: ConvertPixelFromFloat<R10G10B10A2_SSCALED>(pDst, srcPixel); break;
case R10G10B10A2_SINT: ConvertPixelFromFloat<R10G10B10A2_SINT>(pDst, srcPixel); break;
case B10G10R10A2_SNORM: ConvertPixelFromFloat<B10G10R10A2_SNORM>(pDst, srcPixel); break;
case B10G10R10A2_USCALED: ConvertPixelFromFloat<B10G10R10A2_USCALED>(pDst, srcPixel); break;
case B10G10R10A2_SSCALED: ConvertPixelFromFloat<B10G10R10A2_SSCALED>(pDst, srcPixel); break;
case B10G10R10A2_UINT: ConvertPixelFromFloat<B10G10R10A2_UINT>(pDst, srcPixel); break;
case B10G10R10A2_SINT: ConvertPixelFromFloat<B10G10R10A2_SINT>(pDst, srcPixel); break;
case R8G8B8_UINT: ConvertPixelFromFloat<R8G8B8_UINT>(pDst, srcPixel); break;
case R8G8B8_SINT: ConvertPixelFromFloat<R8G8B8_SINT>(pDst, srcPixel); break;
default:
break;
}
}
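// Editor's usage sketch (not part of this commit): the switch above gives a
// runtime-format entry point over the compile-time ConvertPixelFromFloat<F>
// templates. The wrapper's (format, pDst, srcPixel) parameter order is an
// assumption inferred from the case bodies; the format and values are arbitrary.
static inline void ExampleConvertPixel()
{
    float srcPixel[4] = { 1.0f, 0.5f, 0.25f, 1.0f }; // RGBA as held in the hot tile
    uint8_t dst[16];                                 // large enough for any case above
    ConvertPixelFromFloat(R8G8B8A8_UNORM, dst, srcPixel); // assumed wrapper signature
}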

View File

@ -0,0 +1,396 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file LoadTile.cpp
*
* @brief Functionality for loading render surface tiles into hot tiles.
*
******************************************************************************/
#include "common/os.h"
#include "common/formats.h"
#include "core/context.h"
#include "core/rdtsc_core.h"
#include "memory/TilingFunctions.h"
#include "memory/tilingtraits.h"
#include "memory/Convert.h"
typedef void(*PFN_LOAD_TILES)(SWR_SURFACE_STATE*, uint8_t*, uint32_t, uint32_t, uint32_t);
//////////////////////////////////////////////////////////////////////////
/// Load Raster Tile Function Tables.
//////////////////////////////////////////////////////////////////////////
static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_NONE[NUM_SWR_FORMATS];
static PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_NONE[NUM_SWR_FORMATS];
static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS];
static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[NUM_SWR_FORMATS];
static PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS];
//////////////////////////////////////////////////////////////////////////
/// LoadRasterTile
//////////////////////////////////////////////////////////////////////////
template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
struct LoadRasterTile
{
//////////////////////////////////////////////////////////////////////////
/// @brief Stores a float color into the swizzled simd layout of the hot tile.
/// @param srcColor - Color to store, as read from the source surface.
/// @param x, y - Coordinates within the raster tile.
/// @param pDst - Pointer to destination hot tile.
INLINE static void SetSwizzledDstColor(
const float srcColor[4],
uint32_t x, uint32_t y,
uint8_t* pDst)
{
typedef SimdTile<DstFormat, SrcFormat> SimdT;
SimdT* pDstSimdTiles = (SimdT*)pDst;
// Compute which simd tile we're accessing within 8x8 tile.
// i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
SimdT* pSimdTile = &pDstSimdTiles[simdIndex];
uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM);
pSimdTile->SetSwizzledColor(simdOffset, srcColor);
}
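// Worked example of the index math above (editor note; assumes the AVX 4x2
// simd tile described in the SimdTile header comment, i.e. SIMD_TILE_X_DIM = 4,
// SIMD_TILE_Y_DIM = 2, within the 8x8 raster tile): pixel (x=5, y=3) gives
//   simdIndex  = (3 / 2) * (8 / 4) + (5 / 4) = 1 * 2 + 1 = 3
//   simdOffset = (3 % 2) * 4 + (5 % 4)       = 1 * 4 + 1 = 5
// i.e. the fourth simd tile within the raster tile, linear element 5.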
//////////////////////////////////////////////////////////////////////////
/// @brief Loads an 8x8 raster tile from the src surface.
/// @param pSrcSurface - Src surface state
/// @param pDst - Destination hot tile pointer
/// @param x, y - Coordinates to raster tile.
/// @param sampleNum - Sample slice to load.
/// @param renderTargetArrayIndex - Array slice to load from.
INLINE static void Load(
SWR_SURFACE_STATE* pSrcSurface,
uint8_t* pDst,
uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
{
uint32_t lodWidth = (pSrcSurface->width == 1) ? 1 : pSrcSurface->width >> pSrcSurface->lod;
uint32_t lodHeight = (pSrcSurface->height == 1) ? 1 : pSrcSurface->height >> pSrcSurface->lod;
// For each raster tile pixel (rx, ry)
for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
{
for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
{
if (((x + rx) < lodWidth) &&
((y + ry) < lodHeight))
{
uint8_t* pSrc = (uint8_t*)ComputeSurfaceAddress<false>(x + rx, y + ry, pSrcSurface->arrayIndex + renderTargetArrayIndex,
pSrcSurface->arrayIndex + renderTargetArrayIndex, sampleNum,
pSrcSurface->lod, pSrcSurface);
float srcColor[4];
ConvertPixelToFloat<SrcFormat>(srcColor, pSrc);
// store pixel to hottile
SetSwizzledDstColor(srcColor, rx, ry, pDst);
}
}
}
}
};
//////////////////////////////////////////////////////////////////////////
/// LoadMacroTile - Loads a macro tile which consists of raster tiles.
//////////////////////////////////////////////////////////////////////////
template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
struct LoadMacroTile
{
//////////////////////////////////////////////////////////////////////////
/// @brief Loads a macrotile from the source surface into the hot tile.
/// @param pSrcSurface - Source surface state
/// @param pDstHotTile - Pointer to destination hot tile
/// @param x, y - Coordinates to macro tile
/// @param renderTargetArrayIndex - Array slice to load from
static void Load(
SWR_SURFACE_STATE* pSrcSurface,
uint8_t *pDstHotTile,
uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
{
// Load each raster tile from the source surface into the hot tile.
for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
{
for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
{
for (uint32_t sampleNum = 0; sampleNum < pSrcSurface->numSamples; sampleNum++)
{
LoadRasterTile<TTraits, SrcFormat, DstFormat>::Load(pSrcSurface, pDstHotTile,
(x + col), (y + row), sampleNum, renderTargetArrayIndex);
pDstHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<DstFormat>::bpp / 8);
}
}
}
}
};
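// Editor note on the pointer advance above: with 8x8 raster tiles and a
// 128bpp hot-tile format such as R32G32B32A32_FLOAT, each (tile, sample)
// step moves pDstHotTile by 8 * 8 * (128 / 8) = 1024 bytes, walking the
// macrotile in x-major order with all samples of a tile stored contiguously.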
static void BUCKETS_START(UINT id)
{
#ifdef KNOB_ENABLE_RDTSC
gBucketMgr.StartBucket(id);
#endif
}
static void BUCKETS_STOP(UINT id)
{
#ifdef KNOB_ENABLE_RDTSC
gBucketMgr.StopBucket(id);
#endif
}
// on demand buckets for load tiles
static std::vector<int> sBuckets(NUM_SWR_FORMATS, -1);
static std::mutex sBucketMutex;
//////////////////////////////////////////////////////////////////////////
/// @brief Loads a full hottile from a render surface
/// @param pSrcSurface - Source surface state
/// @param dstFormat - Format for hot tile.
/// @param renderTargetIndex - Index to src render target
/// @param x, y - Coordinates to macro tile
/// @param renderTargetArrayIndex - Array slice to load from
/// @param pDstHotTile - Pointer to Hot Tile
void LoadHotTile(
SWR_SURFACE_STATE *pSrcSurface,
SWR_FORMAT dstFormat,
SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex,
uint8_t *pDstHotTile)
{
PFN_LOAD_TILES pfnLoadTiles = NULL;
// don't need to load null surfaces
if (pSrcSurface->type == SURFACE_NULL)
{
return;
}
// force 0 if requested renderTargetArrayIndex is OOB
if (renderTargetArrayIndex >= pSrcSurface->depth)
{
renderTargetArrayIndex = 0;
}
if (renderTargetIndex < SWR_ATTACHMENT_DEPTH)
{
switch (pSrcSurface->tileMode)
{
case SWR_TILE_NONE:
pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_NONE[pSrcSurface->format];
break;
case SWR_TILE_MODE_YMAJOR:
pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[pSrcSurface->format];
break;
case SWR_TILE_MODE_XMAJOR:
pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[pSrcSurface->format];
break;
case SWR_TILE_MODE_WMAJOR:
SWR_ASSERT(pSrcSurface->format == R8_UINT);
pfnLoadTiles = LoadMacroTile<TilingTraits<SWR_TILE_MODE_WMAJOR, 8>, R8_UINT, R8_UINT>::Load;
break;
default:
SWR_ASSERT(0, "Unsupported tiling mode");
break;
}
}
else if (renderTargetIndex == SWR_ATTACHMENT_DEPTH)
{
// Currently depth can map to linear and tile-y.
switch (pSrcSurface->tileMode)
{
case SWR_TILE_NONE:
pfnLoadTiles = sLoadTilesDepthTable_SWR_TILE_NONE[pSrcSurface->format];
break;
case SWR_TILE_MODE_YMAJOR:
pfnLoadTiles = sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[pSrcSurface->format];
break;
default:
SWR_ASSERT(0, "Unsupported tiling mode");
break;
}
}
else
{
SWR_ASSERT(renderTargetIndex == SWR_ATTACHMENT_STENCIL);
SWR_ASSERT(pSrcSurface->format == R8_UINT);
switch (pSrcSurface->tileMode)
{
case SWR_TILE_NONE:
pfnLoadTiles = LoadMacroTile<TilingTraits<SWR_TILE_NONE, 8>, R8_UINT, R8_UINT>::Load;
break;
case SWR_TILE_MODE_WMAJOR:
pfnLoadTiles = LoadMacroTile<TilingTraits<SWR_TILE_MODE_WMAJOR, 8>, R8_UINT, R8_UINT>::Load;
break;
default:
SWR_ASSERT(0, "Unsupported tiling mode");
break;
}
}
if (pfnLoadTiles == nullptr)
{
SWR_ASSERT(false, "Unsupported format for load tile");
return;
}
// Load a macro tile.
#ifdef KNOB_ENABLE_RDTSC
if (sBuckets[pSrcSurface->format] == -1)
{
// guard sBuckets update since LoadHotTile is called by multiple threads
sBucketMutex.lock();
if (sBuckets[pSrcSurface->format] == -1)
{
const SWR_FORMAT_INFO& info = GetFormatInfo(pSrcSurface->format);
BUCKET_DESC desc{ info.name, "", false, 0xffffffff };
sBuckets[pSrcSurface->format] = gBucketMgr.RegisterBucket(desc);
}
sBucketMutex.unlock();
}
#endif
BUCKETS_START(sBuckets[pSrcSurface->format]);
pfnLoadTiles(pSrcSurface, pDstHotTile, x, y, renderTargetArrayIndex);
BUCKETS_STOP(sBuckets[pSrcSurface->format]);
}
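// Editor's usage sketch (not part of this commit): a minimal color-attachment
// load, assuming InitSimLoadTilesTable() has already populated the tables and
// that SWR_ATTACHMENT_COLOR0 is the first color attachment enum value.
static inline void ExampleLoadColorHotTile(SWR_SURFACE_STATE* pSurf, uint8_t* pHotTile)
{
    // Loads the macro tile whose origin is pixel (0, 0) of slice 0 into the
    // R32G32B32A32_FLOAT hot tile that the color tables convert to.
    LoadHotTile(pSurf, R32G32B32A32_FLOAT, SWR_ATTACHMENT_COLOR0, 0, 0, 0, pHotTile);
}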
//////////////////////////////////////////////////////////////////////////
/// INIT_LOAD_TILES_COLOR_TABLE - Helper macro for setting up the color load tables.
#define INIT_LOAD_TILES_COLOR_TABLE(tilemode) \
memset(sLoadTilesColorTable_##tilemode, 0, sizeof(sLoadTilesColorTable_##tilemode)); \
\
sLoadTilesColorTable_##tilemode[R32G32B32A32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R32G32B32A32_SINT] = LoadMacroTile<TilingTraits<tilemode, 128>, R32G32B32A32_SINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R32G32B32A32_UINT] = LoadMacroTile<TilingTraits<tilemode, 128>, R32G32B32A32_UINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R32G32B32X32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 128>, R32G32B32X32_FLOAT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R32G32B32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 96>, R32G32B32_FLOAT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R32G32B32_SINT] = LoadMacroTile<TilingTraits<tilemode, 96>, R32G32B32_SINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R32G32B32_UINT] = LoadMacroTile<TilingTraits<tilemode, 96>, R32G32B32_UINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16G16B16A16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16G16B16A16_SNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_SNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16G16B16A16_SINT] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_SINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16G16B16A16_UINT] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_UINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16G16B16A16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_FLOAT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R32G32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 64>, R32G32_FLOAT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R32G32_SINT] = LoadMacroTile<TilingTraits<tilemode, 64>, R32G32_SINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R32G32_UINT] = LoadMacroTile<TilingTraits<tilemode, 64>, R32G32_UINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16G16B16X16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16X16_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16G16B16X16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16X16_FLOAT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[B8G8R8A8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B8G8R8A8_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[B8G8R8A8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, B8G8R8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R10G10B10A2_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R10G10B10A2_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R10G10B10A2_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_UINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R8G8B8A8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R8G8B8A8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R8G8B8A8_SNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_SNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R8G8B8A8_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_SINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R8G8B8A8_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_UINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16G16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16G16_SNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_SNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16G16_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_SINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16G16_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_UINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16G16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_FLOAT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[B10G10R10A2_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[B10G10R10A2_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R11G11B10_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, R11G11B10_FLOAT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R32_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R32_SINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R32_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R32_UINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, R32_FLOAT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[A32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, A32_FLOAT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[B8G8R8X8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B8G8R8X8_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[B8G8R8X8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, B8G8R8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R8G8B8X8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8X8_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R8G8B8X8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[B10G10R10X2_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10X2_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[B5G6R5_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G6R5_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[B5G6R5_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G6R5_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[B5G5R5A1_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G5R5A1_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[B5G5R5A1_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G5R5A1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[B4G4R4A4_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, B4G4R4A4_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[B4G4R4A4_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 16>, B4G4R4A4_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R8G8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R8G8_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R8G8_SNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R8G8_SNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R8G8_SINT] = LoadMacroTile<TilingTraits<tilemode, 16>, R8G8_SINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R8G8_UINT] = LoadMacroTile<TilingTraits<tilemode, 16>, R8G8_UINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16_SNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_SNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16_SINT] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_SINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16_UINT] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_UINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_FLOAT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[A16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, A16_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[A16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 16>, A16_FLOAT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[B5G5R5X1_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G5R5X1_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[B5G5R5X1_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G5R5X1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 8>, R8_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R8_SNORM] = LoadMacroTile<TilingTraits<tilemode, 8>, R8_SNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R8_SINT] = LoadMacroTile<TilingTraits<tilemode, 8>, R8_SINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R8_UINT] = LoadMacroTile<TilingTraits<tilemode, 8>, R8_UINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[A8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 8>, A8_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[BC1_UNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, BC1_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[BC2_UNORM] = LoadMacroTile<TilingTraits<tilemode, 128>, BC2_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[BC3_UNORM] = LoadMacroTile<TilingTraits<tilemode, 128>, BC3_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[BC4_UNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, BC4_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[BC5_UNORM] = LoadMacroTile<TilingTraits<tilemode, 128>, BC5_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[BC1_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 64>, BC1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[BC2_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 128>, BC2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[BC3_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 128>, BC3_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R8G8B8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R8G8B8_SNORM] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_SNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[BC4_SNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, BC4_SNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[BC5_SNORM] = LoadMacroTile<TilingTraits<tilemode, 128>, BC5_SNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16G16B16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_FLOAT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16G16B16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_UNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16G16B16_SNORM] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_SNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R8G8B8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16G16B16_UINT] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_UINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R16G16B16_SINT] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_SINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R10G10B10A2_SNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_SNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R10G10B10A2_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_SINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[B10G10R10A2_SNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_SNORM, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[B10G10R10A2_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_UINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[B10G10R10A2_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_SINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R8G8B8_UINT] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_UINT, R32G32B32A32_FLOAT>::Load; \
sLoadTilesColorTable_##tilemode[R8G8B8_SINT] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_SINT, R32G32B32A32_FLOAT>::Load; \
//////////////////////////////////////////////////////////////////////////
/// INIT_LOAD_TILES_DEPTH_TABLE - Helper macro for setting up the depth load tables.
#define INIT_LOAD_TILES_DEPTH_TABLE(tilemode) \
memset(sLoadTilesDepthTable_##tilemode, 0, sizeof(sLoadTilesDepthTable_##tilemode)); \
\
sLoadTilesDepthTable_##tilemode[R16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_UNORM, R32_FLOAT>::Load; \
sLoadTilesDepthTable_##tilemode[R32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, R32_FLOAT, R32_FLOAT>::Load; \
sLoadTilesDepthTable_##tilemode[R24_UNORM_X8_TYPELESS] = LoadMacroTile<TilingTraits<tilemode, 32>, R24_UNORM_X8_TYPELESS, R32_FLOAT>::Load; \
//////////////////////////////////////////////////////////////////////////
/// @brief Sets up tables for LoadTile
void InitSimLoadTilesTable()
{
INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_NONE);
INIT_LOAD_TILES_DEPTH_TABLE(SWR_TILE_NONE);
INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_MODE_YMAJOR);
INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_MODE_XMAJOR);
INIT_LOAD_TILES_DEPTH_TABLE(SWR_TILE_MODE_YMAJOR);
}
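// Editor note: the dispatch tables above are zero-initialized statics, so
// InitSimLoadTilesTable() must run once at startup before the first
// LoadHotTile call; any format left NULL for a tile mode falls into the
// "Unsupported format for load tile" assert-and-return path in LoadHotTile.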

File diff suppressed because it is too large

View File

@ -0,0 +1,581 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file TilingFunctions.h
*
* @brief Tiling functions.
*
******************************************************************************/
#pragma once
#include "core/state.h"
#include "core/format_traits.h"
#include "memory/tilingtraits.h"
#include <algorithm>
#define MAX_NUM_LOD 15
#define GFX_ALIGN(x, a) (((x) + ((a) - 1)) - (((x) + ((a) - 1)) & ((a) - 1))) // Alt implementation with bitwise not (~) has issue with uint32 align used with 64-bit value, since ~'ed value will remain 32-bit.
//////////////////////////////////////////////////////////////////////////
/// SimdTile SSE(2x2), AVX(4x2), or AVX-512(4x4?)
//////////////////////////////////////////////////////////////////////////
template<SWR_FORMAT HotTileFormat, SWR_FORMAT SrcOrDstFormat>
struct SimdTile
{
// SimdTile is SOA (e.g. rrrrrrrr gggggggg bbbbbbbb aaaaaaaa )
float color[FormatTraits<HotTileFormat>::numComps][KNOB_SIMD_WIDTH];
//////////////////////////////////////////////////////////////////////////
/// @brief Retrieve color from simd.
/// @param index - linear index to color within simd.
/// @param outputColor - output color
INLINE void GetSwizzledColor(
uint32_t index,
float outputColor[4])
{
// SOA pattern for 2x2 is a subset of 4x2.
// 0 1 4 5
// 2 3 6 7
// The offset converts pattern to linear
#if (SIMD_TILE_X_DIM == 4)
static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
#elif (SIMD_TILE_X_DIM == 2)
static const uint32_t offset[] = { 0, 1, 2, 3 };
#endif
for (uint32_t i = 0; i < FormatTraits<SrcOrDstFormat>::numComps; ++i)
{
outputColor[i] = this->color[FormatTraits<SrcOrDstFormat>::swizzle(i)][offset[index]];
}
}
//////////////////////////////////////////////////////////////////////////
/// @brief Store color into simd.
/// @param index - linear index to color within simd.
/// @param src - input color
INLINE void SetSwizzledColor(
uint32_t index,
const float src[4])
{
// SOA pattern for 2x2 is a subset of 4x2.
// 0 1 4 5
// 2 3 6 7
// The offset converts pattern to linear
#if (SIMD_TILE_X_DIM == 4)
static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
#elif (SIMD_TILE_X_DIM == 2)
static const uint32_t offset[] = { 0, 1, 2, 3 };
#endif
// Only loop over the components needed for destination.
for (uint32_t i = 0; i < FormatTraits<SrcOrDstFormat>::numComps; ++i)
{
this->color[i][offset[index]] = src[i];
}
}
};
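// Worked example of the offset[] remap above (editor note, AVX 4x2 case):
// row-major linear index 2 is pixel (x=2, y=0), but the SOA pattern packs
// the left 2x2 quad first, so it lands in element offset[2] = 4. Full
// (x, y) -> element map: (0,0)->0 (1,0)->1 (2,0)->4 (3,0)->5
//                        (0,1)->2 (1,1)->3 (2,1)->6 (3,1)->7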
template<>
struct SimdTile <R8_UINT,R8_UINT>
{
// SimdTile is SOA (e.g. rrrrrrrr gggggggg bbbbbbbb aaaaaaaa )
uint8_t color[FormatTraits<R8_UINT>::numComps][KNOB_SIMD_WIDTH];
//////////////////////////////////////////////////////////////////////////
/// @brief Retrieve color from simd.
/// @param index - linear index to color within simd.
/// @param outputColor - output color
INLINE void GetSwizzledColor(
uint32_t index,
float outputColor[4])
{
// SOA pattern for 2x2 is a subset of 4x2.
// 0 1 4 5
// 2 3 6 7
// The offset converts pattern to linear
#if (SIMD_TILE_X_DIM == 4)
static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
#elif (SIMD_TILE_X_DIM == 2)
static const uint32_t offset[] = { 0, 1, 2, 3 };
#endif
for (uint32_t i = 0; i < FormatTraits<R8_UINT>::numComps; ++i)
{
uint32_t src = this->color[FormatTraits<R8_UINT>::swizzle(i)][offset[index]];
outputColor[i] = *(float*)&src;
}
}
//////////////////////////////////////////////////////////////////////////
/// @brief Store color into simd.
/// @param index - linear index to color within simd.
/// @param src - input color
INLINE void SetSwizzledColor(
uint32_t index,
const float src[4])
{
// SOA pattern for 2x2 is a subset of 4x2.
// 0 1 4 5
// 2 3 6 7
// The offset converts pattern to linear
#if (SIMD_TILE_X_DIM == 4)
static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
#elif (SIMD_TILE_X_DIM == 2)
static const uint32_t offset[] = { 0, 1, 2, 3 };
#endif
// Only loop over the components needed for destination.
for (uint32_t i = 0; i < FormatTraits<R8_UINT>::numComps; ++i)
{
this->color[i][offset[index]] = *(uint8_t*)&src[i];
}
}
};
//////////////////////////////////////////////////////////////////////////
/// @brief Computes lod offset for 1D surface at specified lod.
/// @param baseWidth - width of basemip (mip 0).
/// @param hAlign - horizontal alignment per mip, in texels
/// @param lod - lod index
/// @param offset - output offset.
INLINE void ComputeLODOffset1D(
const SWR_FORMAT_INFO& info,
uint32_t baseWidth,
uint32_t hAlign,
uint32_t lod,
uint32_t &offset)
{
if (lod == 0)
{
offset = 0;
}
else
{
uint32_t curWidth = baseWidth;
// translate mip width from pixels to blocks for block compressed formats
// @note hAlign is already in blocks for compressed formats so no need to convert
if (info.isBC) curWidth /= info.bcWidth;
offset = GFX_ALIGN(curWidth, hAlign);
for (uint32_t l = 1; l < lod; ++l)
{
curWidth = GFX_ALIGN(std::max<uint32_t>(curWidth >> 1, 1U), hAlign);
offset += curWidth;
}
if (info.isSubsampled)
{
offset /= info.bcWidth;
}
}
}
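// Worked example (editor note): with baseWidth = 100, hAlign = 4, lod = 2 and
// no block compression, offset = GFX_ALIGN(100, 4) = 100 for mip 1, then the
// l = 1 pass adds GFX_ALIGN(max(100 >> 1, 1), 4) = 52, so mip 2 starts 152
// texels from the slice origin.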
//////////////////////////////////////////////////////////////////////////
/// @brief Computes x lod offset for 2D surface at specified lod.
/// @param baseWidth - width of basemip (mip 0).
/// @param hAlign - horizontal alignment per mip, in texels
/// @param lod - lod index
/// @param offset - output offset.
INLINE void ComputeLODOffsetX(
const SWR_FORMAT_INFO& info,
uint32_t baseWidth,
uint32_t hAlign,
uint32_t lod,
uint32_t &offset)
{
if (lod < 2)
{
offset = 0;
}
else
{
uint32_t curWidth = baseWidth;
// convert mip width from pixels to blocks for block compressed formats
// @note hAlign is already in blocks for compressed formats so no need to convert
if (info.isBC) curWidth /= info.bcWidth;
curWidth = std::max<uint32_t>(curWidth >> 1, 1U);
curWidth = GFX_ALIGN(curWidth, hAlign);
if (info.isSubsampled)
{
curWidth /= info.bcWidth;
}
offset = curWidth;
}
}
//////////////////////////////////////////////////////////////////////////
/// @brief Computes y lod offset for 2D surface at specified lod.
/// @param baseHeight - height of basemip (mip 0).
/// @param vAlign - vertical alignment per mip, in rows
/// @param lod - lod index
/// @param offset - output offset.
INLINE void ComputeLODOffsetY(
const SWR_FORMAT_INFO& info,
uint32_t baseHeight,
uint32_t vAlign,
uint32_t lod,
uint32_t &offset)
{
if (lod == 0)
{
offset = 0;
}
else
{
offset = 0;
uint32_t mipHeight = baseHeight;
// translate mip height from pixels to blocks for block compressed formats
// @note VAlign is already in blocks for compressed formats so no need to convert
if (info.isBC) mipHeight /= info.bcHeight;
for (uint32_t l = 1; l <= lod; ++l)
{
uint32_t alignedMipHeight = GFX_ALIGN(mipHeight, vAlign);
offset += ((l != 2) ? alignedMipHeight : 0);
mipHeight = std::max<uint32_t>(mipHeight >> 1, 1U);
}
}
}
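// Editor note on the (l != 2) test above: in the mip layout these tables
// assume, mip 1 sits directly below mip 0 while mip 2 is placed to the right
// of mip 1 (ComputeLODOffsetX is nonzero only for lod >= 2), so mip 2
// contributes an x offset instead and its aligned height is skipped here.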
//////////////////////////////////////////////////////////////////////////
/// @brief Computes 1D surface offset
/// @param x - offset from start of array slice at given lod.
/// @param array - array slice index
/// @param lod - lod index
/// @param pState - surface state
/// @param xOffsetBytes - output offset in bytes.
template<bool UseCachedOffsets>
INLINE void ComputeSurfaceOffset1D(
uint32_t x,
uint32_t array,
uint32_t lod,
const SWR_SURFACE_STATE *pState,
uint32_t &xOffsetBytes)
{
const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format);
uint32_t lodOffset;
if (UseCachedOffsets)
{
lodOffset = pState->lodOffsets[0][lod];
}
else
{
ComputeLODOffset1D(info, pState->width, pState->halign, lod, lodOffset);
}
xOffsetBytes = (array * pState->qpitch + lodOffset + x) * info.Bpp;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Adjusts pixel coordinates and array slice for legacy TileY MSAA
/// @param pState - surface state
/// @param x, y - pixel coordinates, remapped in place
/// @param arrayIndex - array slice index, remapped in place
/// @param sampleNum - requested sample
INLINE void AdjustCoordsForMSAA(const SWR_SURFACE_STATE *pState, uint32_t& x, uint32_t& y, uint32_t& arrayIndex, uint32_t sampleNum)
{
/// @todo: might want to templatize adjusting for sample slices when we support tileYS/tileYF.
if((pState->tileMode == SWR_TILE_MODE_YMAJOR ||
pState->tileMode == SWR_TILE_MODE_WMAJOR) &&
pState->bInterleavedSamples)
{
uint32_t newX, newY, newSampleX, newSampleY;
switch(pState->numSamples)
{
case 1:
newX = x;
newY = y;
newSampleX = newSampleY = 0;
break;
case 2:
{
assert(pState->type == SURFACE_2D);
static const uint32_t xMask = 0xFFFFFFFD;
static const uint32_t sampleMaskX = 0x1;
newX = pdep_u32(x, xMask);
newY = y;
newSampleX = pext_u32(sampleNum, sampleMaskX);
newSampleY = 0;
}
break;
case 4:
{
assert(pState->type == SURFACE_2D);
static const uint32_t mask = 0xFFFFFFFD;
static const uint32_t sampleMaskX = 0x1;
static const uint32_t sampleMaskY = 0x2;
newX = pdep_u32(x, mask);
newY = pdep_u32(y, mask);
newSampleX = pext_u32(sampleNum, sampleMaskX);
newSampleY = pext_u32(sampleNum, sampleMaskY);
}
break;
case 8:
{
assert(pState->type == SURFACE_2D);
static const uint32_t xMask = 0xFFFFFFF9;
static const uint32_t yMask = 0xFFFFFFFD;
static const uint32_t sampleMaskX = 0x5;
static const uint32_t sampleMaskY = 0x2;
newX = pdep_u32(x, xMask);
newY = pdep_u32(y, yMask);
newSampleX = pext_u32(sampleNum, sampleMaskX);
newSampleY = pext_u32(sampleNum, sampleMaskY);
}
break;
case 16:
{
assert(pState->type == SURFACE_2D);
static const uint32_t mask = 0xFFFFFFF9;
static const uint32_t sampleMaskX = 0x5;
static const uint32_t sampleMaskY = 0xA;
newX = pdep_u32(x, mask);
newY = pdep_u32(y, mask);
newSampleX = pext_u32(sampleNum, sampleMaskX);
newSampleY = pext_u32(sampleNum, sampleMaskY);
}
break;
default:
assert(0 && "Unsupported sample count");
newX = newY = 0;
newSampleX = newSampleY = 0;
break;
}
x = newX | (newSampleX << 1);
y = newY | (newSampleY << 1);
}
else if(pState->tileMode == SWR_TILE_MODE_YMAJOR ||
pState->tileMode == SWR_TILE_NONE)
{
uint32_t sampleShift;
switch(pState->numSamples)
{
case 1:
assert(sampleNum == 0);
sampleShift = 0;
break;
case 2:
assert(pState->type == SURFACE_2D);
sampleShift = 1;
break;
case 4:
assert(pState->type == SURFACE_2D);
sampleShift = 2;
break;
case 8:
assert(pState->type == SURFACE_2D);
sampleShift = 3;
break;
case 16:
assert(pState->type == SURFACE_2D);
sampleShift = 4;
break;
default:
assert(0 && "Unsupported sample count");
sampleShift = 0;
break;
}
arrayIndex = (arrayIndex << sampleShift) | sampleNum;
}
}
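// Worked example of the interleaved-sample remap above (editor note; 4x MSAA,
// x = 3, y = 1, sampleNum = 2): pdep_u32(3, 0xFFFFFFFD) opens a hole at bit 1,
// giving newX = 5 and newY = 1, while pext_u32(2, 0x1) = 0 and
// pext_u32(2, 0x2) = 1 extract the per-axis sample bits, so the swizzled
// result is x = 5 | (0 << 1) = 5 and y = 1 | (1 << 1) = 3.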
//////////////////////////////////////////////////////////////////////////
/// @brief Computes 2D surface offset
/// @param x - horizontal offset from start of array slice and lod.
/// @param y - vertical offset from start of array slice and lod.
/// @param array - array slice index
/// @param lod - lod index
/// @param pState - surface state
/// @param xOffsetBytes - output x offset in bytes.
/// @param yOffsetRows - output y offset in rows.
template<bool UseCachedOffsets>
INLINE void ComputeSurfaceOffset2D(uint32_t x, uint32_t y, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState, uint32_t &xOffsetBytes, uint32_t &yOffsetRows)
{
const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format);
uint32_t lodOffsetX, lodOffsetY;
if (UseCachedOffsets)
{
lodOffsetX = pState->lodOffsets[0][lod];
lodOffsetY = pState->lodOffsets[1][lod];
}
else
{
ComputeLODOffsetX(info, pState->width, pState->halign, lod, lodOffsetX);
ComputeLODOffsetY(info, pState->height, pState->valign, lod, lodOffsetY);
}
AdjustCoordsForMSAA(pState, x, y, array, sampleNum);
xOffsetBytes = (x + lodOffsetX + pState->xOffset) * info.Bpp;
yOffsetRows = (array * pState->qpitch) + lodOffsetY + y + pState->yOffset;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Computes 3D surface offset
/// @param x - horizontal offset from start of array slice and lod.
/// @param y - vertical offset from start of array slice and lod.
/// @param z - depth offset from start of array slice and lod.
/// @param lod - lod index
/// @param pState - surface state
/// @param xOffsetBytes - output x offset in bytes.
/// @param yOffsetRows - output y offset in rows.
/// @param zOffsetSlices - output z offset in slices.
template<bool UseCachedOffsets>
INLINE void ComputeSurfaceOffset3D(uint32_t x, uint32_t y, uint32_t z, uint32_t lod, const SWR_SURFACE_STATE *pState, uint32_t &xOffsetBytes, uint32_t &yOffsetRows, uint32_t &zOffsetSlices)
{
const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format);
uint32_t lodOffsetX, lodOffsetY;
if (UseCachedOffsets)
{
lodOffsetX = pState->lodOffsets[0][lod];
lodOffsetY = pState->lodOffsets[1][lod];
}
else
{
ComputeLODOffsetX(info, pState->width, pState->halign, lod, lodOffsetX);
ComputeLODOffsetY(info, pState->height, pState->valign, lod, lodOffsetY);
}
xOffsetBytes = (x + lodOffsetX) * info.Bpp;
yOffsetRows = lodOffsetY + y;
zOffsetSlices = z;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Swizzles the linear x,y offsets depending on surface tiling mode
/// and returns final surface address
/// @param xOffsetBytes - x offset from base of surface in bytes
/// @param yOffsetRows - y offset from base of surface in rows
/// @param pState - pointer to the surface state
template<typename TTraits>
INLINE uint32_t ComputeTileSwizzle2D(uint32_t xOffsetBytes, uint32_t yOffsetRows, const SWR_SURFACE_STATE *pState)
{
return ComputeOffset2D<TTraits>(pState->pitch, xOffsetBytes, yOffsetRows);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Swizzles the linear x,y offsets depending on surface tiling mode
/// and returns final surface address
/// @param xOffsetBytes - x offset from base of surface in bytes
/// @param yOffsetRows - y offset from base of surface in rows
/// @param zOffsetSlices - z offset from base of surface in slices
/// @param pState - pointer to the surface state
template<typename TTraits>
INLINE uint32_t ComputeTileSwizzle3D(uint32_t xOffsetBytes, uint32_t yOffsetRows, uint32_t zOffsetSlices, const SWR_SURFACE_STATE *pState)
{
return ComputeOffset3D<TTraits>(pState->qpitch, pState->pitch, xOffsetBytes, yOffsetRows, zOffsetSlices);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Swizzles the linear x,y offsets depending on surface tiling mode
/// and returns final surface address
/// @param xOffsetBytes - x offset from base of surface in bytes
/// @param yOffsetRows - y offset from base of surface in rows
/// @param pState - pointer to the surface state
INLINE
uint32_t TileSwizzle2D(uint32_t xOffsetBytes, uint32_t yOffsetRows, const SWR_SURFACE_STATE *pState)
{
switch (pState->tileMode)
{
case SWR_TILE_NONE: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_NONE, 32> >(xOffsetBytes, yOffsetRows, pState);
case SWR_TILE_SWRZ: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_SWRZ, 32> >(xOffsetBytes, yOffsetRows, pState);
case SWR_TILE_MODE_XMAJOR: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_MODE_XMAJOR, 8> >(xOffsetBytes, yOffsetRows, pState);
case SWR_TILE_MODE_YMAJOR: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_MODE_YMAJOR, 32> >(xOffsetBytes, yOffsetRows, pState);
case SWR_TILE_MODE_WMAJOR: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_MODE_WMAJOR, 8> >(xOffsetBytes, yOffsetRows, pState);
default: SWR_ASSERT(0, "Unsupported tiling mode");
}
return (uint32_t) NULL;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Swizzles the linear x,y,z offsets depending on surface tiling mode
/// and returns final surface address
/// @param xOffsetBytes - x offset from base of surface in bytes
/// @param yOffsetRows - y offset from base of surface in rows
/// @param zOffsetSlices - z offset from base of surface in slices
/// @param pState - pointer to the surface state
INLINE
uint32_t TileSwizzle3D(uint32_t xOffsetBytes, uint32_t yOffsetRows, uint32_t zOffsetSlices, const SWR_SURFACE_STATE *pState)
{
switch (pState->tileMode)
{
case SWR_TILE_NONE: return ComputeTileSwizzle3D<TilingTraits<SWR_TILE_NONE, 32> >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState);
case SWR_TILE_SWRZ: return ComputeTileSwizzle3D<TilingTraits<SWR_TILE_SWRZ, 32> >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState);
case SWR_TILE_MODE_YMAJOR: return ComputeTileSwizzle3D<TilingTraits<SWR_TILE_MODE_YMAJOR, 32> >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState);
default: SWR_ASSERT(0, "Unsupported tiling mode");
}
return (uint32_t) NULL;
}
template<bool UseCachedOffsets>
INLINE
uint32_t ComputeSurfaceOffset(uint32_t x, uint32_t y, uint32_t z, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState)
{
uint32_t offsetX = 0, offsetY = 0, offsetZ = 0;
switch (pState->type)
{
case SURFACE_BUFFER:
case SURFACE_STRUCTURED_BUFFER:
offsetX = x * pState->pitch;
return offsetX;
break;
case SURFACE_1D:
ComputeSurfaceOffset1D<UseCachedOffsets>(x, array, lod, pState, offsetX);
return TileSwizzle2D(offsetX, 0, pState);
break;
case SURFACE_2D:
ComputeSurfaceOffset2D<UseCachedOffsets>(x, y, array, sampleNum, lod, pState, offsetX, offsetY);
return TileSwizzle2D(offsetX, offsetY, pState);
case SURFACE_3D:
ComputeSurfaceOffset3D<UseCachedOffsets>(x, y, z, lod, pState, offsetX, offsetY, offsetZ);
return TileSwizzle3D(offsetX, offsetY, offsetZ, pState);
break;
case SURFACE_CUBE:
ComputeSurfaceOffset2D<UseCachedOffsets>(x, y, array, sampleNum, lod, pState, offsetX, offsetY);
return TileSwizzle2D(offsetX, offsetY, pState);
break;
default: SWR_ASSERT(0, "Unsupported format");
}
return (uint32_t) NULL;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Computes surface address at the given location and lod
/// @param x - x location in pixels
/// @param y - y location in rows
/// @param z - z location for 3D surfaces
/// @param array - array slice for 1D and 2D surfaces
/// @param sampleNum - sample slice for MSAA surfaces
/// @param lod - level of detail
/// @param pState - pointer to the surface state
template<bool UseCachedOffsets>
INLINE
void* ComputeSurfaceAddress(uint32_t x, uint32_t y, uint32_t z, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState)
{
return pState->pBaseAddress + ComputeSurfaceOffset<UseCachedOffsets>(x, y, z, array, sampleNum, lod, pState);
}
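// Editor's usage sketch (not part of this commit): resolving the address of
// texel (x, y) at mip 0, slice 0 of a single-sampled 2D surface, using the
// cached per-lod offsets stored in the surface state.
//
//   uint8_t* pTexel = (uint8_t*)ComputeSurfaceAddress<true>(
//       x, y, 0 /*z*/, 0 /*array*/, 0 /*sample*/, 0 /*lod*/, pState);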

View File

@ -0,0 +1,263 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file tilingtraits.h
*
* @brief Tiling traits.
*
******************************************************************************/
#pragma once
#include "core/state.h"
template<SWR_TILE_MODE mode, int>
struct TilingTraits
{
static const SWR_TILE_MODE TileMode{ mode };
static UINT GetCu() { SWR_ASSERT(0); return 0; }
static UINT GetCv() { SWR_ASSERT(0); return 0; }
static UINT GetCr() { SWR_ASSERT(0); return 0; }
static UINT GetTileIDShift() { SWR_ASSERT(0); return 0; }
/// @todo correct pdep shifts for all rastertile dims. Unused for now
static UINT GetPdepX() { SWR_ASSERT(0); return 0x37; }
static UINT GetPdepY() { SWR_ASSERT(0); return 0xC8; }
};
template<int X> struct TilingTraits <SWR_TILE_NONE, X>
{
static const SWR_TILE_MODE TileMode{ SWR_TILE_NONE };
static UINT GetCu() { return 0; }
static UINT GetCv() { return 0; }
static UINT GetCr() { return 0; }
static UINT GetTileIDShift() { return 0; }
static UINT GetPdepX() { return 0x00; }
static UINT GetPdepY() { return 0x00; }
};
template<> struct TilingTraits <SWR_TILE_SWRZ, 8>
{
static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ };
static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT; }
static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; }
static UINT GetCr() { return 0; }
static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT; }
/// @todo correct pdep shifts for all rastertile dims. Unused for now
static UINT GetPdepX() { SWR_ASSERT(0); return 0x00; }
static UINT GetPdepY() { SWR_ASSERT(0); return 0x00; }
};
template<> struct TilingTraits <SWR_TILE_SWRZ, 32>
{
static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ };
static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT + 2; }
static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; }
static UINT GetCr() { return 0; }
static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT + 2; }
static UINT GetPdepX() { return 0x37; }
static UINT GetPdepY() { return 0xC8; }
};
template<> struct TilingTraits <SWR_TILE_SWRZ, 128>
{
static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ };
static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT + 4; }
static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; }
static UINT GetCr() { return 0; }
static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT + 4; }
/// @todo correct pdep shifts for all rastertile dims. Unused for now
static UINT GetPdepX() { SWR_ASSERT(0); return 0x37; }
static UINT GetPdepY() { SWR_ASSERT(0); return 0xC8; }
};
// y-major tiling layout unaffected by element size
template<int X> struct TilingTraits <SWR_TILE_MODE_YMAJOR, X>
{
static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_YMAJOR };
static UINT GetCu() { return 7; }
static UINT GetCv() { return 5; }
static UINT GetCr() { return 0; }
static UINT GetTileIDShift() { return 12; }
static UINT GetPdepX() { return 0xe0f; }
static UINT GetPdepY() { return 0x1f0; }
};
// x-major tiling layout unaffected by element size
template<int X> struct TilingTraits <SWR_TILE_MODE_XMAJOR, X>
{
static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_XMAJOR };
static UINT GetCu() { return 9; }
static UINT GetCv() { return 3; }
static UINT GetCr() { return 0; }
static UINT GetTileIDShift() { return 12; }
static UINT GetPdepX() { return 0x1ff; }
static UINT GetPdepY() { return 0xe00; }
};
template<int X> struct TilingTraits <SWR_TILE_MODE_WMAJOR, X>
{
static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_WMAJOR };
static UINT GetCu() { return 6; }
static UINT GetCv() { return 6; }
static UINT GetCr() { return 0; }
static UINT GetTileIDShift() { return 12; }
static UINT GetPdepX() { return 0xe15; }
static UINT GetPdepY() { return 0x1ea; }
};
INLINE
UINT pdep_u32(UINT a, UINT mask)
{
#if KNOB_ARCH==KNOB_ARCH_AVX2
return _pdep_u32(a, mask);
#else
UINT result = 0;
// copied from http://wm.ite.pl/articles/pdep-soft-emu.html
// using bsf instead of funky loop
DWORD maskIndex;
while (_BitScanForward(&maskIndex, mask))
{
// 1. isolate lowest set bit of mask
const UINT lowest = 1 << maskIndex;
// 2. populate LSB from src
const UINT LSB = (UINT)((int)(a << 31) >> 31);
// 3. copy bit from mask
result |= LSB & lowest;
// 4. clear lowest bit
mask &= ~lowest;
// 5. prepare for next iteration
a >>= 1;
}
return result;
#endif
}
INLINE
UINT pext_u32(UINT a, UINT mask)
{
#if KNOB_ARCH==KNOB_ARCH_AVX2
return _pext_u32(a, mask);
#else
UINT result = 0;
DWORD maskIndex;
uint32_t currentBit = 0;
while (_BitScanForward(&maskIndex, mask))
{
// 1. isolate lowest set bit of mask
const UINT lowest = 1 << maskIndex;
// 2. copy bit from mask
result |= ((a & lowest) > 0) << currentBit++;
// 3. clear lowest bit
mask &= ~lowest;
}
return result;
#endif
}
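// Quick reference for the emulations above (editor note): pdep scatters the
// low bits of 'a' into the set bit positions of 'mask', low to high, and
// pext is the inverse gather, e.g.
//   pdep_u32(0b101, 0b11100) == 0b10100  // bits land at positions 2 and 4
//   pext_u32(0b10100, 0b11100) == 0b101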
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the tileID for 2D tiled surfaces
/// @param pitch - surface pitch in bytes
/// @param tileX - x offset in tiles
/// @param tileY - y offset in tiles
template<typename TTraits>
INLINE UINT ComputeTileOffset2D(UINT pitch, UINT tileX, UINT tileY)
{
UINT tileID = tileY * (pitch >> TTraits::GetCu()) + tileX;
return tileID << TTraits::GetTileIDShift();
}
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the tileID for 3D tiled surfaces
/// @param qpitch - surface qpitch in rows
/// @param pitch - surface pitch in bytes
/// @param tileX - x offset in tiles
/// @param tileY - y offset in tiles
/// @param tileZ - z offset in tiles
template<typename TTraits>
INLINE UINT ComputeTileOffset3D(UINT qpitch, UINT pitch, UINT tileX, UINT tileY, UINT tileZ)
{
UINT tileID = (tileZ * (qpitch >> TTraits::GetCv()) + tileY) * (pitch >> TTraits::GetCu()) + tileX;
return tileID << TTraits::GetTileIDShift();
}
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the byte offset for 2D tiled surfaces
/// @param pitch - surface pitch in bytes
/// @param x - x offset in bytes
/// @param y - y offset in rows
template<typename TTraits>
INLINE UINT ComputeOffset2D(UINT pitch, UINT x, UINT y)
{
UINT tileID = ComputeTileOffset2D<TTraits>(pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv());
UINT xSwizzle = pdep_u32(x, TTraits::GetPdepX());
UINT ySwizzle = pdep_u32(y, TTraits::GetPdepY());
return (tileID | xSwizzle | ySwizzle);
}
#if KNOB_ARCH <= KNOB_ARCH_AVX
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the byte offset for 2D tiled surfaces. Specialization
/// for tile-y surfaces that uses bit twiddling instead of pdep emulation.
/// @param pitch - surface pitch in bytes
/// @param x - x offset in bytes
/// @param y - y offset in rows
template<>
INLINE UINT ComputeOffset2D<TilingTraits<SWR_TILE_MODE_YMAJOR, 32> >(UINT pitch, UINT x, UINT y)
{
typedef TilingTraits<SWR_TILE_MODE_YMAJOR, 32> TTraits;
UINT tileID = ComputeTileOffset2D<TTraits>(pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv());
UINT xSwizzle = ((x << 5) & 0xe00) | (x & 0xf);
UINT ySwizzle = (y << 4) & 0x1f0;
return (tileID | xSwizzle | ySwizzle);
}
#endif
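// Editor check of the equivalence above: GetPdepX() = 0xe0f keeps x bits 0-3
// in place and lifts x bits 4-6 to bits 9-11, exactly ((x << 5) & 0xe00) |
// (x & 0xf); GetPdepY() = 0x1f0 moves y bits 0-4 to bits 4-8, i.e.
// (y << 4) & 0x1f0. For example x = 0x35 swizzles to 0x605 either way.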
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the byte offset for 3D tiled surfaces
/// @param qpitch - depth pitch in rows
/// @param pitch - surface pitch in bytes
/// @param x - x offset in bytes
/// @param y - y offset in rows
/// @param z - z offset in slices
template<typename TTraits>
INLINE UINT ComputeOffset3D(UINT qpitch, UINT pitch, UINT x, UINT y, UINT z)
{
UINT tileID = ComputeTileOffset3D<TTraits>(qpitch, pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv(), z >> TTraits::GetCr());
UINT xSwizzle = pdep_u32(x, TTraits::GetPdepX());
UINT ySwizzle = pdep_u32(y, TTraits::GetPdepY());
return (tileID | xSwizzle | ySwizzle);
}

View File

@ -0,0 +1,79 @@
# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
# Python source
from __future__ import print_function
import os
import sys
import knob_defs
from mako.template import Template
from mako.exceptions import RichTraceback
def write_template_to_string(template_filename, **kwargs):
try:
template = Template(filename=template_filename)
# Split + Join fixes line-endings for whatever platform you are using
return '\n'.join(template.render(**kwargs).splitlines())
except Exception:
traceback = RichTraceback()
for (filename, lineno, function, line) in traceback.traceback:
print("File %s, line %s, in %s" % (filename, lineno, function))
print(line, "\n")
print("%s: %s" % (str(traceback.error.__class__.__name__), traceback.error))
def write_template_to_file(template_filename, output_filename, **kwargs):
with open(output_filename, "w") as outfile:
print(write_template_to_string(template_filename, **kwargs), file=outfile)
def main(args=sys.argv[1:]):
if len(args) != 1:
print('Usage:', sys.argv[0], '<output_directory>', file=sys.stderr)
return 1
output_dir = args[0]
if not os.path.isdir(output_dir):
if os.path.exists(output_dir):
print('ERROR: Invalid output directory:', output_dir, file=sys.stderr)
return 1
try:
os.makedirs(output_dir)
except Exception:
print('ERROR: Could not create output directory:', output_dir, file=sys.stderr)
return 1
# Output path exists, now just run the template
template_file = os.sep.join([sys.path[0], 'templates', 'knobs.template'])
output_file = os.sep.join([output_dir, 'gen_knobs.cpp'])
output_header = os.sep.join([output_dir, 'gen_knobs.h'])
for f in [output_header, output_file]:
write_template_to_file(template_file, f,
filename='gen_knobs',
knobs=knob_defs.KNOBS,
includes=['core/knobs_init.h', 'common/os.h', 'sstream', 'iomanip'],
gen_header=(f == output_header))
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -0,0 +1,226 @@
# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
# Python source
KNOBS = [
['ENABLE_ASSERT_DIALOGS', {
'type' : 'bool',
'default' : 'true',
'desc' : ['Use dialogs when asserts fire.',
'Asserts are only enabled in debug builds'],
}],
['SINGLE_THREADED', {
'type' : 'bool',
'default' : 'false',
'desc' : ['If enabled will perform all rendering on the API thread.',
'This is useful mainly for debugging purposes.'],
}],
['DUMP_SHADER_IR', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Dumps shader LLVM IR at various stages of jit compilation.'],
}],
['USE_GENERIC_STORETILE', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Always use generic function for performing StoreTile.',
'Will be slightly slower than using optimized (jitted) path'],
}],
['FAST_CLEAR', {
'type' : 'bool',
'default' : 'true',
'desc' : ['Replace 3D primitive execute with a SWRClearRT operation and',
'defer clear execution to first backend op on hottile, or hottile store'],
}],
['MAX_NUMA_NODES', {
'type' : 'uint32_t',
'default' : '0',
'desc' : ['Maximum # of NUMA-nodes per system used for worker threads',
' 0 == ALL NUMA-nodes in the system',
' N == Use at most N NUMA-nodes for rendering'],
}],
['MAX_CORES_PER_NUMA_NODE', {
'type' : 'uint32_t',
'default' : '0',
'desc' : ['Maximum # of cores per NUMA-node used for worker threads.',
' 0 == ALL non-API thread cores per NUMA-node',
' N == Use at most N cores per NUMA-node'],
}],
['MAX_THREADS_PER_CORE', {
'type' : 'uint32_t',
'default' : '1',
'desc' : ['Maximum # of (hyper)threads per physical core used for worker threads.',
' 0 == ALL hyper-threads per core',
' N == Use at most N hyper-threads per physical core'],
}],
['MAX_WORKER_THREADS', {
'type' : 'uint32_t',
'default' : '0',
'desc' : ['Maximum worker threads to spawn.',
'',
'IMPORTANT: If this is non-zero, no worker threads will be bound to',
'specific HW threads. They will all be "floating" SW threads.',
'In this case, the above 3 KNOBS will be ignored.'],
}],
['BUCKETS_START_FRAME', {
'type' : 'uint32_t',
'default' : '1200',
'desc' : ['Frame at which to start saving buckets data.',
'',
'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
'for this to have an effect.'],
}],
['BUCKETS_END_FRAME', {
'type' : 'uint32_t',
'default' : '1400',
'desc' : ['Frame at which to stop saving buckets data.',
'',
'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
'for this to have an effect.'],
}],
['WORKER_SPIN_LOOP_COUNT', {
'type' : 'uint32_t',
'default' : '5000',
'desc' : ['Number of spin-loop iterations worker threads will perform',
'before going to sleep when waiting for work'],
}],
['MAX_DRAWS_IN_FLIGHT', {
'type' : 'uint32_t',
'default' : '160',
'desc' : ['Maximum number of draws outstanding before API thread blocks.'],
}],
['MAX_PRIMS_PER_DRAW', {
'type' : 'uint32_t',
'default' : '2040',
'desc' : ['Maximum primitives in a single Draw().',
'Larger primitives are split into smaller Draw calls.',
'Should be a multiple of (3 * vectorWidth).'],
}],
['MAX_TESS_PRIMS_PER_DRAW', {
'type' : 'uint32_t',
'default' : '16',
'desc' : ['Maximum primitives in a single Draw() with tessellation enabled.',
'Larger primitives are split into smaller Draw calls.',
'Should be a multiple of (vectorWidth).'],
}],
['MAX_FRAC_ODD_TESS_FACTOR', {
'type' : 'float',
'default' : '63.0f',
'desc' : ['(DEBUG) Maximum tessellation factor for fractional-odd partitioning.'],
}],
['MAX_FRAC_EVEN_TESS_FACTOR', {
'type' : 'float',
'default' : '64.0f',
'desc' : ['(DEBUG) Maximum tessellation factor for fractional-even partitioning.'],
}],
['MAX_INTEGER_TESS_FACTOR', {
'type' : 'uint32_t',
'default' : '64',
'desc' : ['(DEBUG) Maximum tessellation factor for integer partitioning.'],
}],
['BUCKETS_ENABLE_THREADVIZ', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Enable threadviz output.'],
}],
['TOSS_DRAW', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Disable per-draw/dispatch execution'],
}],
['TOSS_QUEUE_FE', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Stop per-draw execution at worker FE',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
}],
['TOSS_FETCH', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Stop per-draw execution at vertex fetch',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
}],
['TOSS_IA', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Stop per-draw execution at input assembler',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
}],
['TOSS_VS', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Stop per-draw execution at vertex shader',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
}],
['TOSS_SETUP_TRIS', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Stop per-draw execution at primitive setup',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
}],
['TOSS_BIN_TRIS', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Stop per-draw execution at primitive binning',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
}],
['TOSS_RS', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Stop per-draw execution at rasterizer',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
}],
]
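# Illustrative sketch: each entry above pairs a knob name with its
# metadata dict, so the table can be walked directly. The KNOB_ prefix
# mirrors how the generated code exposes knobs and is an assumption
# here, not something this file defines.
if __name__ == '__main__':
    for name, meta in KNOBS:
        print('KNOB_%s (%s, default %s)' % (name, meta['type'], meta['default']))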

View File

@ -0,0 +1,8 @@
# mako/__init__.py
# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
__version__ = '1.0.1'

View File

@ -0,0 +1,845 @@
# mako/_ast_util.py
# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
"""
ast
~~~
The `ast` module helps Python applications to process trees of the Python
abstract syntax grammar. The abstract syntax itself might change with
each Python release; this module helps to find out programmatically what
the current grammar looks like and allows modifications of it.
An abstract syntax tree can be generated by passing `ast.PyCF_ONLY_AST` as
a flag to the `compile()` builtin function or by using the `parse()`
function from this module. The result will be a tree of objects whose
classes all inherit from `ast.AST`.
A modified abstract syntax tree can be compiled into a Python code object
using the built-in `compile()` function.
Additionally various helper functions are provided that make working with
the trees simpler. The main intention of the helper functions and this
module in general is to provide an easy to use interface for libraries
that work tightly with the python syntax (template engines for example).
:copyright: Copyright 2008 by Armin Ronacher.
:license: Python License.
"""
from _ast import *
from mako.compat import arg_stringname
BOOLOP_SYMBOLS = {
And: 'and',
Or: 'or'
}
BINOP_SYMBOLS = {
Add: '+',
Sub: '-',
Mult: '*',
Div: '/',
FloorDiv: '//',
Mod: '%',
LShift: '<<',
RShift: '>>',
BitOr: '|',
BitAnd: '&',
BitXor: '^'
}
CMPOP_SYMBOLS = {
Eq: '==',
Gt: '>',
GtE: '>=',
In: 'in',
Is: 'is',
IsNot: 'is not',
Lt: '<',
LtE: '<=',
NotEq: '!=',
NotIn: 'not in'
}
UNARYOP_SYMBOLS = {
Invert: '~',
Not: 'not',
UAdd: '+',
USub: '-'
}
ALL_SYMBOLS = {}
ALL_SYMBOLS.update(BOOLOP_SYMBOLS)
ALL_SYMBOLS.update(BINOP_SYMBOLS)
ALL_SYMBOLS.update(CMPOP_SYMBOLS)
ALL_SYMBOLS.update(UNARYOP_SYMBOLS)
def parse(expr, filename='<unknown>', mode='exec'):
"""Parse an expression into an AST node."""
return compile(expr, filename, mode, PyCF_ONLY_AST)
def to_source(node, indent_with=' ' * 4):
"""
This function can convert a node tree back into python sourcecode. This
is useful for debugging purposes, especially if you're dealing with custom
asts not generated by python itself.
It could be that the sourcecode is evaluable when the AST itself is not
compilable / evaluable. The reason for this is that the AST contains some
more data than regular sourcecode does, which is dropped during
conversion.
Each level of indentation is replaced with `indent_with`. By default this
parameter is equal to four spaces, as suggested by PEP 8, but it may be
adjusted to match the application's style guide.
"""
generator = SourceGenerator(indent_with)
generator.visit(node)
return ''.join(generator.result)
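# A short round trip, for orientation; this assumes a Python version
# contemporary with this module (where parse() yields Num nodes rather
# than Constant).
if __name__ == '__main__':
    _tree = parse("x = 1 + 2")
    print(to_source(_tree))   # -> x = (1 + 2)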
def dump(node):
"""
A very verbose representation of the node passed. This is useful for
debugging purposes.
"""
def _format(node):
if isinstance(node, AST):
return '%s(%s)' % (node.__class__.__name__,
', '.join('%s=%s' % (a, _format(b))
for a, b in iter_fields(node)))
elif isinstance(node, list):
return '[%s]' % ', '.join(_format(x) for x in node)
return repr(node)
if not isinstance(node, AST):
raise TypeError('expected AST, got %r' % node.__class__.__name__)
return _format(node)
def copy_location(new_node, old_node):
"""
Copy the source location hint (`lineno` and `col_offset`) from the
old to the new node if possible and return the new one.
"""
for attr in 'lineno', 'col_offset':
if attr in old_node._attributes and attr in new_node._attributes \
and hasattr(old_node, attr):
setattr(new_node, attr, getattr(old_node, attr))
return new_node
def fix_missing_locations(node):
"""
Some nodes require a line number and the column offset. Without that
information the compiler will abort the compilation. Because it can be
a dull task to add appropriate line numbers and column offsets when
adding new nodes this function can help. It copies the line number and
column offset of the parent node to the child nodes without this
information.
Unlike `copy_location` this works recursively and won't touch nodes that
already have location information.
"""
def _fix(node, lineno, col_offset):
if 'lineno' in node._attributes:
if not hasattr(node, 'lineno'):
node.lineno = lineno
else:
lineno = node.lineno
if 'col_offset' in node._attributes:
if not hasattr(node, 'col_offset'):
node.col_offset = col_offset
else:
col_offset = node.col_offset
for child in iter_child_nodes(node):
_fix(child, lineno, col_offset)
_fix(node, 1, 0)
return node
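# For instance, a synthesized node can be made compilable like so
# (sketch; assumes a Python version where Num nodes exist):
if __name__ == '__main__':
    _expr = fix_missing_locations(Expression(body=Num(n=42)))
    print(eval(compile(_expr, '<synth>', 'eval')))   # -> 42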
def increment_lineno(node, n=1):
"""
Increment the line numbers of all nodes by `n` if they have line number
attributes. This is useful to "move code" to a different location in a
file.
"""
for node in walk(node):
if 'lineno' in node._attributes:
node.lineno = getattr(node, 'lineno', 0) + n
def iter_fields(node):
"""Iterate over all fields of a node, only yielding existing fields."""
# CPython 2.5 compat
if not hasattr(node, '_fields') or not node._fields:
return
for field in node._fields:
try:
yield field, getattr(node, field)
except AttributeError:
pass
def get_fields(node):
"""Like `iter_fiels` but returns a dict."""
return dict(iter_fields(node))
def iter_child_nodes(node):
"""Iterate over all child nodes or a node."""
for name, field in iter_fields(node):
if isinstance(field, AST):
yield field
elif isinstance(field, list):
for item in field:
if isinstance(item, AST):
yield item
def get_child_nodes(node):
"""Like `iter_child_nodes` but returns a list."""
return list(iter_child_nodes(node))
def get_compile_mode(node):
"""
Get the mode for `compile` of a given node. If the node is not a `mod`
node (`Expression`, `Module` etc.) a `TypeError` is thrown.
"""
if not isinstance(node, mod):
raise TypeError('expected mod node, got %r' % node.__class__.__name__)
return {
Expression: 'eval',
Interactive: 'single'
}.get(node.__class__, 'expr')
def get_docstring(node):
"""
Return the docstring for the given node or `None` if no docstring can be
found. If the node provided does not accept docstrings a `TypeError`
will be raised.
"""
if not isinstance(node, (FunctionDef, ClassDef, Module)):
raise TypeError("%r can't have docstrings" % node.__class__.__name__)
if node.body and isinstance(node.body[0], Str):
return node.body[0].s
def walk(node):
"""
Iterate over all nodes. This is useful if you only want to modify nodes in
place and don't care about the context or the order the nodes are returned.
"""
from collections import deque
todo = deque([node])
while todo:
node = todo.popleft()
todo.extend(iter_child_nodes(node))
yield node
class NodeVisitor(object):
"""
Walks the abstract syntax tree and calls visitor functions for every node
found. The visitor functions may return values which will be forwarded
by the `visit` method.
By default the visitor functions for the nodes are ``'visit_'`` +
class name of the node. So a `TryFinally` node visit function would
be `visit_TryFinally`. This behavior can be changed by overriding
the `get_visitor` function. If no visitor function exists for a node
(return value `None`) the `generic_visit` visitor is used instead.
Don't use the `NodeVisitor` if you want to apply changes to nodes during
traversing. For this a special visitor exists (`NodeTransformer`) that
allows modifications.
"""
def get_visitor(self, node):
"""
Return the visitor function for this node or `None` if no visitor
exists for this node. In that case the generic visit function is
used instead.
"""
method = 'visit_' + node.__class__.__name__
return getattr(self, method, None)
def visit(self, node):
"""Visit a node."""
f = self.get_visitor(node)
if f is not None:
return f(node)
return self.generic_visit(node)
def generic_visit(self, node):
"""Called if no explicit visitor function exists for a node."""
for field, value in iter_fields(node):
if isinstance(value, list):
for item in value:
if isinstance(item, AST):
self.visit(item)
elif isinstance(value, AST):
self.visit(value)
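# A minimal concrete visitor in the spirit described above
# (illustrative; not part of Mako):
class NameCollector(NodeVisitor):
    """Collect the id of every Name node in a tree."""
    def __init__(self):
        self.names = set()

    def visit_Name(self, node):
        self.names.add(node.id)

# Usage:
#     nc = NameCollector()
#     nc.visit(parse("a = b + c"))
#     nc.names  -> set(['a', 'b', 'c'])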
class NodeTransformer(NodeVisitor):
"""
Walks the abstract syntax tree and allows modifications of nodes.
The `NodeTransformer` will walk the AST and use the return value of the
visitor functions to replace or remove the old node. If the return
value of the visitor function is `None` the node will be removed
from the previous location otherwise it's replaced with the return
value. The return value may be the original node in which case no
replacement takes place.
Here is an example transformer that rewrites all `foo` to `data['foo']`::
class RewriteName(NodeTransformer):
def visit_Name(self, node):
return copy_location(Subscript(
value=Name(id='data', ctx=Load()),
slice=Index(value=Str(s=node.id)),
ctx=node.ctx
), node)
Keep in mind that if the node you're operating on has child nodes
you must either transform the child nodes yourself or call the generic
visit function for the node first.
Nodes that were part of a collection of statements (that applies to
all statement nodes) may also return a list of nodes rather than just
a single node.
Usually you use the transformer like this::
node = YourTransformer().visit(node)
"""
def generic_visit(self, node):
for field, old_value in iter_fields(node):
old_value = getattr(node, field, None)
if isinstance(old_value, list):
new_values = []
for value in old_value:
if isinstance(value, AST):
value = self.visit(value)
if value is None:
continue
elif not isinstance(value, AST):
new_values.extend(value)
continue
new_values.append(value)
old_value[:] = new_values
elif isinstance(old_value, AST):
new_node = self.visit(old_value)
if new_node is None:
delattr(node, field)
else:
setattr(node, field, new_node)
return node
class SourceGenerator(NodeVisitor):
"""
This visitor is able to transform a well-formed syntax tree into Python
sourcecode. For more details, have a look at the docstring of the
`to_source` function.
"""
def __init__(self, indent_with):
self.result = []
self.indent_with = indent_with
self.indentation = 0
self.new_lines = 0
def write(self, x):
if self.new_lines:
if self.result:
self.result.append('\n' * self.new_lines)
self.result.append(self.indent_with * self.indentation)
self.new_lines = 0
self.result.append(x)
def newline(self, n=1):
self.new_lines = max(self.new_lines, n)
def body(self, statements):
self.newline()
self.indentation += 1
for stmt in statements:
self.visit(stmt)
self.indentation -= 1
def body_or_else(self, node):
self.body(node.body)
if node.orelse:
self.newline()
self.write('else:')
self.body(node.orelse)
def signature(self, node):
want_comma = []
def write_comma():
if want_comma:
self.write(', ')
else:
want_comma.append(True)
padding = [None] * (len(node.args) - len(node.defaults))
for arg, default in zip(node.args, padding + node.defaults):
write_comma()
self.visit(arg)
if default is not None:
self.write('=')
self.visit(default)
if node.vararg is not None:
write_comma()
self.write('*' + arg_stringname(node.vararg))
if node.kwarg is not None:
write_comma()
self.write('**' + arg_stringname(node.kwarg))
def decorators(self, node):
for decorator in node.decorator_list:
self.newline()
self.write('@')
self.visit(decorator)
# Statements
def visit_Assign(self, node):
self.newline()
for idx, target in enumerate(node.targets):
if idx:
self.write(', ')
self.visit(target)
self.write(' = ')
self.visit(node.value)
def visit_AugAssign(self, node):
self.newline()
self.visit(node.target)
self.write(BINOP_SYMBOLS[type(node.op)] + '=')
self.visit(node.value)
def visit_ImportFrom(self, node):
self.newline()
self.write('from %s%s import ' % ('.' * node.level, node.module))
for idx, item in enumerate(node.names):
if idx:
self.write(', ')
self.visit(item)
def visit_Import(self, node):
self.newline()
for item in node.names:
self.write('import ')
self.visit(item)
def visit_Expr(self, node):
self.newline()
self.generic_visit(node)
def visit_FunctionDef(self, node):
self.newline(n=2)
self.decorators(node)
self.newline()
self.write('def %s(' % node.name)
self.signature(node.args)
self.write('):')
self.body(node.body)
def visit_ClassDef(self, node):
have_args = []
def paren_or_comma():
if have_args:
self.write(', ')
else:
have_args.append(True)
self.write('(')
self.newline(n=3)
self.decorators(node)
self.newline()
self.write('class %s' % node.name)
for base in node.bases:
paren_or_comma()
self.visit(base)
# XXX: the if here is used to keep this module compatible
# with python 2.6.
if hasattr(node, 'keywords'):
for keyword in node.keywords:
paren_or_comma()
self.write(keyword.arg + '=')
self.visit(keyword.value)
if node.starargs is not None:
paren_or_comma()
self.write('*')
self.visit(node.starargs)
if node.kwargs is not None:
paren_or_comma()
self.write('**')
self.visit(node.kwargs)
self.write(have_args and '):' or ':')
self.body(node.body)
def visit_If(self, node):
self.newline()
self.write('if ')
self.visit(node.test)
self.write(':')
self.body(node.body)
while True:
else_ = node.orelse
if len(else_) == 1 and isinstance(else_[0], If):
node = else_[0]
self.newline()
self.write('elif ')
self.visit(node.test)
self.write(':')
self.body(node.body)
else:
self.newline()
self.write('else:')
self.body(else_)
break
def visit_For(self, node):
self.newline()
self.write('for ')
self.visit(node.target)
self.write(' in ')
self.visit(node.iter)
self.write(':')
self.body_or_else(node)
def visit_While(self, node):
self.newline()
self.write('while ')
self.visit(node.test)
self.write(':')
self.body_or_else(node)
def visit_With(self, node):
self.newline()
self.write('with ')
self.visit(node.context_expr)
if node.optional_vars is not None:
self.write(' as ')
self.visit(node.optional_vars)
self.write(':')
self.body(node.body)
def visit_Pass(self, node):
self.newline()
self.write('pass')
def visit_Print(self, node):
# XXX: python 2.6 only
self.newline()
self.write('print ')
want_comma = False
if node.dest is not None:
self.write(' >> ')
self.visit(node.dest)
want_comma = True
for value in node.values:
if want_comma:
self.write(', ')
self.visit(value)
want_comma = True
if not node.nl:
self.write(',')
def visit_Delete(self, node):
self.newline()
self.write('del ')
for idx, target in enumerate(node.targets):
if idx:
self.write(', ')
self.visit(target)
def visit_TryExcept(self, node):
self.newline()
self.write('try:')
self.body(node.body)
for handler in node.handlers:
self.visit(handler)
def visit_TryFinally(self, node):
self.newline()
self.write('try:')
self.body(node.body)
self.newline()
self.write('finally:')
self.body(node.finalbody)
def visit_Global(self, node):
self.newline()
self.write('global ' + ', '.join(node.names))
def visit_Nonlocal(self, node):
self.newline()
self.write('nonlocal ' + ', '.join(node.names))
def visit_Return(self, node):
    self.newline()
    if node.value is None:
        self.write('return')
    else:
        self.write('return ')
        self.visit(node.value)
def visit_Break(self, node):
self.newline()
self.write('break')
def visit_Continue(self, node):
self.newline()
self.write('continue')
def visit_Raise(self, node):
# XXX: Python 2.6 / 3.0 compatibility
self.newline()
self.write('raise')
if hasattr(node, 'exc') and node.exc is not None:
self.write(' ')
self.visit(node.exc)
if node.cause is not None:
self.write(' from ')
self.visit(node.cause)
elif hasattr(node, 'type') and node.type is not None:
self.visit(node.type)
if node.inst is not None:
self.write(', ')
self.visit(node.inst)
if node.tback is not None:
self.write(', ')
self.visit(node.tback)
# Expressions
def visit_Attribute(self, node):
self.visit(node.value)
self.write('.' + node.attr)
def visit_Call(self, node):
want_comma = []
def write_comma():
if want_comma:
self.write(', ')
else:
want_comma.append(True)
self.visit(node.func)
self.write('(')
for arg in node.args:
write_comma()
self.visit(arg)
for keyword in node.keywords:
write_comma()
self.write(keyword.arg + '=')
self.visit(keyword.value)
if node.starargs is not None:
write_comma()
self.write('*')
self.visit(node.starargs)
if node.kwargs is not None:
write_comma()
self.write('**')
self.visit(node.kwargs)
self.write(')')
def visit_Name(self, node):
self.write(node.id)
def visit_NameConstant(self, node):
self.write(str(node.value))
def visit_arg(self, node):
self.write(node.arg)
def visit_Str(self, node):
self.write(repr(node.s))
def visit_Bytes(self, node):
self.write(repr(node.s))
def visit_Num(self, node):
self.write(repr(node.n))
def visit_Tuple(self, node):
self.write('(')
idx = -1
for idx, item in enumerate(node.elts):
if idx:
self.write(', ')
self.visit(item)
self.write(idx and ')' or ',)')
def sequence_visit(left, right):
def visit(self, node):
self.write(left)
for idx, item in enumerate(node.elts):
if idx:
self.write(', ')
self.visit(item)
self.write(right)
return visit
visit_List = sequence_visit('[', ']')
visit_Set = sequence_visit('{', '}')
del sequence_visit
def visit_Dict(self, node):
self.write('{')
for idx, (key, value) in enumerate(zip(node.keys, node.values)):
if idx:
self.write(', ')
self.visit(key)
self.write(': ')
self.visit(value)
self.write('}')
def visit_BinOp(self, node):
self.write('(')
self.visit(node.left)
self.write(' %s ' % BINOP_SYMBOLS[type(node.op)])
self.visit(node.right)
self.write(')')
def visit_BoolOp(self, node):
self.write('(')
for idx, value in enumerate(node.values):
if idx:
self.write(' %s ' % BOOLOP_SYMBOLS[type(node.op)])
self.visit(value)
self.write(')')
def visit_Compare(self, node):
self.write('(')
self.visit(node.left)
for op, right in zip(node.ops, node.comparators):
self.write(' %s ' % CMPOP_SYMBOLS[type(op)])
self.visit(right)
self.write(')')
def visit_UnaryOp(self, node):
self.write('(')
op = UNARYOP_SYMBOLS[type(node.op)]
self.write(op)
if op == 'not':
self.write(' ')
self.visit(node.operand)
self.write(')')
def visit_Subscript(self, node):
self.visit(node.value)
self.write('[')
self.visit(node.slice)
self.write(']')
def visit_Slice(self, node):
if node.lower is not None:
self.visit(node.lower)
self.write(':')
if node.upper is not None:
self.visit(node.upper)
if node.step is not None:
self.write(':')
if not (isinstance(node.step, Name) and node.step.id == 'None'):
self.visit(node.step)
def visit_ExtSlice(self, node):
for idx, item in enumerate(node.dims):
if idx:
self.write(', ')
self.visit(item)
def visit_Yield(self, node):
    self.write('yield')
    if node.value is not None:
        self.write(' ')
        self.visit(node.value)
def visit_Lambda(self, node):
self.write('lambda ')
self.signature(node.args)
self.write(': ')
self.visit(node.body)
def visit_Ellipsis(self, node):
self.write('Ellipsis')
def generator_visit(left, right):
def visit(self, node):
self.write(left)
self.visit(node.elt)
for comprehension in node.generators:
self.visit(comprehension)
self.write(right)
return visit
visit_ListComp = generator_visit('[', ']')
visit_GeneratorExp = generator_visit('(', ')')
visit_SetComp = generator_visit('{', '}')
del generator_visit
def visit_DictComp(self, node):
self.write('{')
self.visit(node.key)
self.write(': ')
self.visit(node.value)
for comprehension in node.generators:
self.visit(comprehension)
self.write('}')
def visit_IfExp(self, node):
self.visit(node.body)
self.write(' if ')
self.visit(node.test)
self.write(' else ')
self.visit(node.orelse)
def visit_Starred(self, node):
self.write('*')
self.visit(node.value)
def visit_Repr(self, node):
# XXX: python 2.6 only
self.write('`')
self.visit(node.value)
self.write('`')
# Helper Nodes
def visit_alias(self, node):
self.write(node.name)
if node.asname is not None:
self.write(' as ' + node.asname)
def visit_comprehension(self, node):
self.write(' for ')
self.visit(node.target)
self.write(' in ')
self.visit(node.iter)
if node.ifs:
for if_ in node.ifs:
self.write(' if ')
self.visit(if_)
def visit_excepthandler(self, node):
self.newline()
self.write('except')
if node.type is not None:
self.write(' ')
self.visit(node.type)
if node.name is not None:
self.write(' as ')
self.visit(node.name)
self.write(':')
self.body(node.body)

View File

@ -0,0 +1,178 @@
# mako/ast.py
# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
"""utilities for analyzing expressions and blocks of Python
code, as well as generating Python from AST nodes"""
from mako import exceptions, pyparser, compat
import re
class PythonCode(object):
"""represents information about a string containing Python code"""
def __init__(self, code, **exception_kwargs):
self.code = code
# represents all identifiers which are assigned to at some point in
# the code
self.declared_identifiers = set()
# represents all identifiers which are referenced before their
# assignment, if any
self.undeclared_identifiers = set()
# note that an identifier can be in both the undeclared and declared
# lists.
# using AST to parse instead of using code.co_varnames,
# code.co_names has several advantages:
# - we can locate an identifier as "undeclared" even if
it's declared later in the same block of code
# - AST is less likely to break with version changes
# (for example, the behavior of co_names changed a little bit
# in python version 2.5)
if isinstance(code, compat.string_types):
expr = pyparser.parse(code.lstrip(), "exec", **exception_kwargs)
else:
expr = code
f = pyparser.FindIdentifiers(self, **exception_kwargs)
f.visit(expr)
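# Concretely (sketch; pyparser.FindIdentifiers performs the walk, and
# note that builtins such as 'sum' are not filtered out at this layer):
if __name__ == '__main__':
    _info = PythonCode("total = sum(items)")
    print(_info.declared_identifiers)     # -> set(['total'])
    print(_info.undeclared_identifiers)   # -> set(['items', 'sum'])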
class ArgumentList(object):
"""parses a fragment of code as a comma-separated list of expressions"""
def __init__(self, code, **exception_kwargs):
self.codeargs = []
self.args = []
self.declared_identifiers = set()
self.undeclared_identifiers = set()
if isinstance(code, compat.string_types):
if re.match(r"\S", code) and not re.match(r",\s*$", code):
# if there's text and no trailing comma, ensure it's parsed
# as a tuple by adding a trailing comma
code += ","
expr = pyparser.parse(code, "exec", **exception_kwargs)
else:
expr = code
f = pyparser.FindTuple(self, PythonCode, **exception_kwargs)
f.visit(expr)
class PythonFragment(PythonCode):
"""extends PythonCode to provide identifier lookups in partial control
statements
e.g.
for x in 5:
elif y==9:
except (MyException, e):
etc.
"""
def __init__(self, code, **exception_kwargs):
m = re.match(r'^(\w+)(?:\s+(.*?))?:\s*(#|$)', code.strip(), re.S)
if not m:
raise exceptions.CompileException(
"Fragment '%s' is not a partial control statement" %
code, **exception_kwargs)
if m.group(3):
code = code[:m.start(3)]
(keyword, expr) = m.group(1,2)
if keyword in ['for','if', 'while']:
code = code + "pass"
elif keyword == 'try':
code = code + "pass\nexcept:pass"
elif keyword == 'elif' or keyword == 'else':
code = "if False:pass\n" + code + "pass"
elif keyword == 'except':
code = "try:pass\n" + code + "pass"
elif keyword == 'with':
code = code + "pass"
else:
raise exceptions.CompileException(
"Unsupported control keyword: '%s'" %
keyword, **exception_kwargs)
super(PythonFragment, self).__init__(code, **exception_kwargs)
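# For example, an elif fragment is padded into a complete statement
# before analysis (sketch): internally it becomes
# "if False:pass\nelif y==9:pass", so the parser accepts it and 'y'
# surfaces as an undeclared identifier.
if __name__ == '__main__':
    _frag = PythonFragment("elif y==9:")
    print(_frag.undeclared_identifiers)   # -> set(['y'])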
class FunctionDecl(object):
"""function declaration"""
def __init__(self, code, allow_kwargs=True, **exception_kwargs):
self.code = code
expr = pyparser.parse(code, "exec", **exception_kwargs)
f = pyparser.ParseFunc(self, **exception_kwargs)
f.visit(expr)
if not hasattr(self, 'funcname'):
raise exceptions.CompileException(
"Code '%s' is not a function declaration" % code,
**exception_kwargs)
if not allow_kwargs and self.kwargs:
raise exceptions.CompileException(
"'**%s' keyword argument not allowed here" %
self.kwargnames[-1], **exception_kwargs)
def get_argument_expressions(self, as_call=False):
"""Return the argument declarations of this FunctionDecl as a printable
list.
By default the return value is appropriate for writing in a ``def``;
set `as_call` to true to build arguments to be passed to the function
instead (assuming locals with the same names as the arguments exist).
"""
namedecls = []
# Build in reverse order, since defaults and slurpy args come last
argnames = self.argnames[::-1]
kwargnames = self.kwargnames[::-1]
defaults = self.defaults[::-1]
kwdefaults = self.kwdefaults[::-1]
# Named arguments
if self.kwargs:
namedecls.append("**" + kwargnames.pop(0))
for name in kwargnames:
# Keyword-only arguments must always be used by name, so even if
# this is a call, print out `foo=foo`
if as_call:
namedecls.append("%s=%s" % (name, name))
elif kwdefaults:
default = kwdefaults.pop(0)
if default is None:
# The AST always gives kwargs a default, since you can do
# `def foo(*, a=1, b, c=3)`
namedecls.append(name)
else:
namedecls.append("%s=%s" % (
name, pyparser.ExpressionGenerator(default).value()))
else:
namedecls.append(name)
# Positional arguments
if self.varargs:
namedecls.append("*" + argnames.pop(0))
for name in argnames:
if as_call or not defaults:
namedecls.append(name)
else:
default = defaults.pop(0)
namedecls.append("%s=%s" % (
name, pyparser.ExpressionGenerator(default).value()))
namedecls.reverse()
return namedecls
@property
def allargnames(self):
return tuple(self.argnames) + tuple(self.kwargnames)
class FunctionArgs(FunctionDecl):
"""the argument portion of a function declaration"""
def __init__(self, code, **kwargs):
super(FunctionArgs, self).__init__("def ANON(%s):pass" % code,
**kwargs)
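# A short illustration of the def-versus-call duality implemented by
# get_argument_expressions (illustrative):
if __name__ == '__main__':
    _args = FunctionArgs("a, b=5, *rest, **kw")
    print(_args.get_argument_expressions())
    # -> ['a', 'b=5', '*rest', '**kw']   (declaration form)
    print(_args.get_argument_expressions(as_call=True))
    # -> ['a', 'b', '*rest', '**kw']     (call form, forwarding locals)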

View File

@ -0,0 +1,238 @@
# mako/cache.py
# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
from mako import compat, util
_cache_plugins = util.PluginLoader("mako.cache")
register_plugin = _cache_plugins.register
register_plugin("beaker", "mako.ext.beaker_cache", "BeakerCacheImpl")
class Cache(object):
"""Represents a data content cache made available to the module
space of a specific :class:`.Template` object.
.. versionadded:: 0.6
:class:`.Cache` by itself is mostly a
container for a :class:`.CacheImpl` object, which implements
a fixed API to provide caching services; specific subclasses exist to
implement different
caching strategies. Mako includes a backend that works with
the Beaker caching system. Beaker itself then supports
a number of backends (e.g. file, memory, memcached).
The construction of a :class:`.Cache` is part of the mechanics
of a :class:`.Template`, and programmatic access to this
cache is typically via the :attr:`.Template.cache` attribute.
"""
impl = None
"""Provide the :class:`.CacheImpl` in use by this :class:`.Cache`.
This accessor allows a :class:`.CacheImpl` with additional
methods beyond that of :class:`.Cache` to be used programmatically.
"""
id = None
"""Return the 'id' that identifies this cache.
This is a value that should be globally unique to the
:class:`.Template` associated with this cache, and can
be used by a caching system to name a local container
for data specific to this template.
"""
starttime = None
"""Epochal time value for when the owning :class:`.Template` was
first compiled.
A cache implementation may wish to invalidate data earlier than
this timestamp; this has the effect of the cache for a specific
:class:`.Template` starting clean any time the :class:`.Template`
is recompiled, such as when the original template file changed on
the filesystem.
"""
def __init__(self, template, *args):
# check for a stale template calling the
# constructor
if isinstance(template, compat.string_types) and args:
return
self.template = template
self.id = template.module.__name__
self.starttime = template.module._modified_time
self._def_regions = {}
self.impl = self._load_impl(self.template.cache_impl)
def _load_impl(self, name):
return _cache_plugins.load(name)(self)
def get_or_create(self, key, creation_function, **kw):
"""Retrieve a value from the cache, using the given creation function
to generate a new value."""
return self._ctx_get_or_create(key, creation_function, None, **kw)
def _ctx_get_or_create(self, key, creation_function, context, **kw):
"""Retrieve a value from the cache, using the given creation function
to generate a new value."""
if not self.template.cache_enabled:
return creation_function()
return self.impl.get_or_create(
key,
creation_function,
**self._get_cache_kw(kw, context))
def set(self, key, value, **kw):
"""Place a value in the cache.
:param key: the value's key.
:param value: the value.
:param \**kw: cache configuration arguments.
"""
self.impl.set(key, value, **self._get_cache_kw(kw, None))
put = set
"""A synonym for :meth:`.Cache.set`.
This is here for backwards compatibility.
"""
def get(self, key, **kw):
"""Retrieve a value from the cache.
:param key: the value's key.
:param \**kw: cache configuration arguments. The
backend is configured using these arguments upon first request.
Subsequent requests that use the same series of configuration
values will use that same backend.
"""
return self.impl.get(key, **self._get_cache_kw(kw, None))
def invalidate(self, key, **kw):
"""Invalidate a value in the cache.
:param key: the value's key.
:param \**kw: cache configuration arguments. The
backend is configured using these arguments upon first request.
Subsequent requests that use the same series of configuration
values will use that same backend.
"""
self.impl.invalidate(key, **self._get_cache_kw(kw, None))
def invalidate_body(self):
"""Invalidate the cached content of the "body" method for this
template.
"""
self.invalidate('render_body', __M_defname='render_body')
def invalidate_def(self, name):
"""Invalidate the cached content of a particular ``<%def>`` within this
template.
"""
self.invalidate('render_%s' % name, __M_defname='render_%s' % name)
def invalidate_closure(self, name):
"""Invalidate a nested ``<%def>`` within this template.
Caching of nested defs is a blunt tool as there is no
management of scope -- nested defs that use cache tags
need to have names unique of all other nested defs in the
template, else their content will be overwritten by
each other.
"""
self.invalidate(name, __M_defname=name)
def _get_cache_kw(self, kw, context):
defname = kw.pop('__M_defname', None)
if not defname:
tmpl_kw = self.template.cache_args.copy()
tmpl_kw.update(kw)
elif defname in self._def_regions:
tmpl_kw = self._def_regions[defname]
else:
tmpl_kw = self.template.cache_args.copy()
tmpl_kw.update(kw)
self._def_regions[defname] = tmpl_kw
if context and self.impl.pass_context:
tmpl_kw = tmpl_kw.copy()
tmpl_kw.setdefault('context', context)
return tmpl_kw
class CacheImpl(object):
"""Provide a cache implementation for use by :class:`.Cache`."""
def __init__(self, cache):
self.cache = cache
pass_context = False
"""If ``True``, the :class:`.Context` will be passed to
:meth:`get_or_create <.CacheImpl.get_or_create>` as the name ``'context'``.
"""
def get_or_create(self, key, creation_function, **kw):
"""Retrieve a value from the cache, using the given creation function
to generate a new value.
This function *must* return a value, either from
the cache, or via the given creation function.
If the creation function is called, the newly
created value should be populated into the cache
under the given key before being returned.
:param key: the value's key.
:param creation_function: function that when called generates
a new value.
:param \**kw: cache configuration arguments.
"""
raise NotImplementedError()
def set(self, key, value, **kw):
"""Place a value in the cache.
:param key: the value's key.
:param value: the value.
:param \**kw: cache configuration arguments.
"""
raise NotImplementedError()
def get(self, key, **kw):
"""Retrieve a value from the cache.
:param key: the value's key.
:param \**kw: cache configuration arguments.
"""
raise NotImplementedError()
def invalidate(self, key, **kw):
"""Invalidate a value in the cache.
:param key: the value's key.
:param \**kw: cache configuration arguments.
"""
raise NotImplementedError()
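# A sketch of this contract: an in-memory backend needs little more
# than a dict (illustrative; not a backend shipped with Mako).
class DictCacheImpl(CacheImpl):
    """Toy in-memory CacheImpl; ignores all cache configuration kwargs."""
    def __init__(self, cache):
        super(DictCacheImpl, self).__init__(cache)
        self._data = {}

    def get_or_create(self, key, creation_function, **kw):
        if key not in self._data:
            self._data[key] = creation_function()
        return self._data[key]

    def set(self, key, value, **kw):
        self._data[key] = value

    def get(self, key, **kw):
        return self._data.get(key)

    def invalidate(self, key, **kw):
        self._data.pop(key, None)

# It could be registered like the beaker plugin above, e.g.
# register_plugin("dict", "myapp.cache", "DictCacheImpl") -- the module
# path here is hypothetical.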

View File

@ -0,0 +1,62 @@
# mako/cmd.py
# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
from argparse import ArgumentParser
from os.path import isfile, dirname
import sys
from mako.template import Template
from mako.lookup import TemplateLookup
from mako import exceptions
def varsplit(var):
if "=" not in var:
return (var, "")
return var.split("=", 1)
def _exit():
sys.stderr.write(exceptions.text_error_template().render())
sys.exit(1)
def cmdline(argv=None):
parser = ArgumentParser(usage="%(prog)s [FILENAME]")
parser.add_argument("--var", default=[], action="append",
help="variable (can be used multiple times, use name=value)")
parser.add_argument("--template-dir", default=[], action="append",
help="Directory to use for template lookup (multiple "
"directories may be provided). If not given then if the "
"template is read from stdin, the value defaults to be "
"the current directory, otherwise it defaults to be the "
"parent directory of the file provided.")
parser.add_argument('input', nargs='?', default='-')
options = parser.parse_args(argv)
if options.input == '-':
lookup_dirs = options.template_dir or ["."]
lookup = TemplateLookup(lookup_dirs)
try:
template = Template(sys.stdin.read(), lookup=lookup)
except:
_exit()
else:
filename = options.input
if not isfile(filename):
raise SystemExit("error: can't find %s" % filename)
lookup_dirs = options.template_dir or [dirname(filename)]
lookup = TemplateLookup(lookup_dirs)
try:
template = Template(filename=filename, lookup=lookup)
except:
_exit()
kw = dict([varsplit(var) for var in options.var])
try:
print(template.render(**kw))
except:
_exit()
if __name__ == "__main__":
cmdline()
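# Usage sketch -- the template filename and variable are hypothetical:
#
#     python -m mako.cmd --var name=World hello.mako
#
# or programmatically:
#
#     cmdline(["--var", "name=World", "hello.mako"])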

File diff suppressed because it is too large

View File

@ -0,0 +1,174 @@
import sys
import time
py3k = sys.version_info >= (3, 0)
py33 = sys.version_info >= (3, 3)
py2k = sys.version_info < (3,)
py26 = sys.version_info >= (2, 6)
jython = sys.platform.startswith('java')
win32 = sys.platform.startswith('win')
pypy = hasattr(sys, 'pypy_version_info')
if py3k:
from io import StringIO
import builtins as compat_builtins
from urllib.parse import quote_plus, unquote_plus
from html.entities import codepoint2name, name2codepoint
string_types = str,
binary_type = bytes
text_type = str
from io import BytesIO as byte_buffer
def u(s):
return s
def b(s):
return s.encode("latin-1")
def octal(lit):
return eval("0o" + lit)
else:
import __builtin__ as compat_builtins
try:
from cStringIO import StringIO
except:
from StringIO import StringIO
byte_buffer = StringIO
from urllib import quote_plus, unquote_plus
from htmlentitydefs import codepoint2name, name2codepoint
string_types = basestring,
binary_type = str
text_type = unicode
def u(s):
return unicode(s, "utf-8")
def b(s):
return s
def octal(lit):
return eval("0" + lit)
if py33:
from importlib import machinery
def load_module(module_id, path):
return machinery.SourceFileLoader(module_id, path).load_module()
else:
import imp
def load_module(module_id, path):
fp = open(path, 'rb')
try:
return imp.load_source(module_id, path, fp)
finally:
fp.close()
if py3k:
def reraise(tp, value, tb=None, cause=None):
if cause is not None:
value.__cause__ = cause
if value.__traceback__ is not tb:
raise value.with_traceback(tb)
raise value
else:
exec("def reraise(tp, value, tb=None, cause=None):\n"
" raise tp, value, tb\n")
def exception_as():
return sys.exc_info()[1]
try:
import threading
if py3k:
import _thread as thread
else:
import thread
except ImportError:
import dummy_threading as threading
if py3k:
import _dummy_thread as thread
else:
import dummy_thread as thread
if win32 or jython:
time_func = time.clock
else:
time_func = time.time
try:
from functools import partial
except:
def partial(func, *args, **keywords):
def newfunc(*fargs, **fkeywords):
newkeywords = keywords.copy()
newkeywords.update(fkeywords)
return func(*(args + fargs), **newkeywords)
return newfunc
all = all
import json
def exception_name(exc):
return exc.__class__.__name__
try:
from inspect import CO_VARKEYWORDS, CO_VARARGS
def inspect_func_args(fn):
if py3k:
co = fn.__code__
else:
co = fn.func_code
nargs = co.co_argcount
names = co.co_varnames
args = list(names[:nargs])
varargs = None
if co.co_flags & CO_VARARGS:
varargs = co.co_varnames[nargs]
nargs = nargs + 1
varkw = None
if co.co_flags & CO_VARKEYWORDS:
varkw = co.co_varnames[nargs]
if py3k:
return args, varargs, varkw, fn.__defaults__
else:
return args, varargs, varkw, fn.func_defaults
except ImportError:
import inspect
def inspect_func_args(fn):
return inspect.getargspec(fn)
if py3k:
def callable(fn):
return hasattr(fn, '__call__')
else:
callable = callable
################################################
# cross-compatible metaclass implementation
# Copyright (c) 2010-2012 Benjamin Peterson
def with_metaclass(meta, base=object):
"""Create a base class with a metaclass."""
return meta("%sBase" % meta.__name__, (base,), {})
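# For example (illustrative), the same class body then works under both
# the Python 2 and Python 3 metaclass syntaxes:
if __name__ == '__main__':
    class Registered(type):
        """Toy metaclass recording every class it creates."""
        created = []
        def __init__(cls, name, bases, d):
            type.__init__(cls, name, bases, d)
            Registered.created.append(name)

    class Plugin(with_metaclass(Registered)):
        pass

    print(Registered.created)   # -> ['RegisteredBase', 'Plugin']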
################################################
def arg_stringname(func_arg):
"""Gets the string name of a kwarg or vararg
In Python3.4 a function's args are
of _ast.arg type not _ast.name
"""
if hasattr(func_arg, 'arg'):
return func_arg.arg
else:
return str(func_arg)

View File

@ -0,0 +1,373 @@
# mako/exceptions.py
# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
"""exception classes"""
import traceback
import sys
from mako import util, compat
class MakoException(Exception):
pass
class RuntimeException(MakoException):
pass
def _format_filepos(lineno, pos, filename):
if filename is None:
return " at line: %d char: %d" % (lineno, pos)
else:
return " in file '%s' at line: %d char: %d" % (filename, lineno, pos)
class CompileException(MakoException):
def __init__(self, message, source, lineno, pos, filename):
MakoException.__init__(self,
message + _format_filepos(lineno, pos, filename))
self.lineno = lineno
self.pos = pos
self.filename = filename
self.source = source
class SyntaxException(MakoException):
def __init__(self, message, source, lineno, pos, filename):
MakoException.__init__(self,
message + _format_filepos(lineno, pos, filename))
self.lineno = lineno
self.pos = pos
self.filename = filename
self.source = source
class UnsupportedError(MakoException):
"""raised when a retired feature is used."""
class NameConflictError(MakoException):
"""raised when a reserved word is used inappropriately"""
class TemplateLookupException(MakoException):
pass
class TopLevelLookupException(TemplateLookupException):
pass
class RichTraceback(object):
"""Pull the current exception from the ``sys`` traceback and extracts
Mako-specific template information.
See the usage examples in :ref:`handling_exceptions`.
"""
def __init__(self, error=None, traceback=None):
self.source, self.lineno = "", 0
if error is None or traceback is None:
t, value, tback = sys.exc_info()
if error is None:
error = value or t
if traceback is None:
traceback = tback
self.error = error
self.records = self._init(traceback)
if isinstance(self.error, (CompileException, SyntaxException)):
self.source = self.error.source
self.lineno = self.error.lineno
self._has_source = True
self._init_message()
@property
def errorname(self):
return compat.exception_name(self.error)
def _init_message(self):
"""Find a unicode representation of self.error"""
try:
self.message = compat.text_type(self.error)
except UnicodeError:
try:
self.message = str(self.error)
except UnicodeEncodeError:
# Fallback to args as neither unicode nor
# str(Exception(u'\xe6')) work in Python < 2.6
self.message = self.error.args[0]
if not isinstance(self.message, compat.text_type):
self.message = compat.text_type(self.message, 'ascii', 'replace')
def _get_reformatted_records(self, records):
for rec in records:
if rec[6] is not None:
yield (rec[4], rec[5], rec[2], rec[6])
else:
yield tuple(rec[0:4])
@property
def traceback(self):
"""Return a list of 4-tuple traceback records (i.e. normal python
format) with template-corresponding lines remapped to the originating
template.
"""
return list(self._get_reformatted_records(self.records))
@property
def reverse_records(self):
return reversed(self.records)
@property
def reverse_traceback(self):
"""Return the same data as traceback, except in reverse order.
"""
return list(self._get_reformatted_records(self.reverse_records))
def _init(self, trcback):
"""format a traceback from sys.exc_info() into 7-item tuples,
containing the regular four traceback tuple items, plus the original
template filename, the line number adjusted relative to the template
source, and code line from that line number of the template."""
import mako.template
mods = {}
rawrecords = traceback.extract_tb(trcback)
new_trcback = []
for filename, lineno, function, line in rawrecords:
if not line:
line = ''
try:
(line_map, template_lines) = mods[filename]
except KeyError:
try:
info = mako.template._get_module_info(filename)
module_source = info.code
template_source = info.source
template_filename = info.template_filename or filename
except KeyError:
# A normal .py file (not a Template)
if not compat.py3k:
try:
fp = open(filename, 'rb')
encoding = util.parse_encoding(fp)
fp.close()
except IOError:
encoding = None
if encoding:
line = line.decode(encoding)
else:
line = line.decode('ascii', 'replace')
new_trcback.append((filename, lineno, function, line,
None, None, None, None))
continue
template_ln = 1
source_map = mako.template.ModuleInfo.\
get_module_source_metadata(
module_source, full_line_map=True)
line_map = source_map['full_line_map']
template_lines = [line for line in
template_source.split("\n")]
mods[filename] = (line_map, template_lines)
template_ln = line_map[lineno - 1]
if template_ln <= len(template_lines):
template_line = template_lines[template_ln - 1]
else:
template_line = None
new_trcback.append((filename, lineno, function,
line, template_filename, template_ln,
template_line, template_source))
if not self.source:
for l in range(len(new_trcback) - 1, 0, -1):
if new_trcback[l][5]:
self.source = new_trcback[l][7]
self.lineno = new_trcback[l][5]
break
else:
if new_trcback:
try:
# A normal .py file (not a Template)
fp = open(new_trcback[-1][0], 'rb')
encoding = util.parse_encoding(fp)
fp.seek(0)
self.source = fp.read()
fp.close()
if encoding:
self.source = self.source.decode(encoding)
except IOError:
self.source = ''
self.lineno = new_trcback[-1][1]
return new_trcback
def text_error_template(lookup=None):
"""Provides a template that renders a stack trace in a similar format to
the Python interpreter, substituting source template filenames, line
numbers and code for that of the originating source template, as
applicable.
"""
import mako.template
return mako.template.Template(r"""
<%page args="error=None, traceback=None"/>
<%!
from mako.exceptions import RichTraceback
%>\
<%
tback = RichTraceback(error=error, traceback=traceback)
%>\
Traceback (most recent call last):
% for (filename, lineno, function, line) in tback.traceback:
File "${filename}", line ${lineno}, in ${function or '?'}
${line | trim}
% endfor
${tback.errorname}: ${tback.message}
""")
def _install_pygments():
global syntax_highlight, pygments_html_formatter
from mako.ext.pygmentplugin import syntax_highlight,\
pygments_html_formatter
def _install_fallback():
global syntax_highlight, pygments_html_formatter
from mako.filters import html_escape
pygments_html_formatter = None
def syntax_highlight(filename='', language=None):
return html_escape
def _install_highlighting():
try:
_install_pygments()
except ImportError:
_install_fallback()
_install_highlighting()
def html_error_template():
"""Provides a template that renders a stack trace in an HTML format,
providing an excerpt of code as well as substituting source template
filenames, line numbers and code for that of the originating source
template, as applicable.
The template's default ``encoding_errors`` value is
``'htmlentityreplace'``. The template has two options. With the
``full`` option disabled, only a section of an HTML document is
returned. With the ``css`` option disabled, the default stylesheet
won't be included.
"""
import mako.template
return mako.template.Template(r"""
<%!
from mako.exceptions import RichTraceback, syntax_highlight,\
pygments_html_formatter
%>
<%page args="full=True, css=True, error=None, traceback=None"/>
% if full:
<html>
<head>
<title>Mako Runtime Error</title>
% endif
% if css:
<style>
body { font-family:verdana; margin:10px 30px 10px 30px;}
.stacktrace { margin:5px 5px 5px 5px; }
.highlight { padding:0px 10px 0px 10px; background-color:#9F9FDF; }
.nonhighlight { padding:0px; background-color:#DFDFDF; }
.sample { padding:10px; margin:10px 10px 10px 10px;
font-family:monospace; }
.sampleline { padding:0px 10px 0px 10px; }
.sourceline { margin:5px 5px 10px 5px; font-family:monospace;}
.location { font-size:80%; }
.highlight { white-space:pre; }
.sampleline { white-space:pre; }
% if pygments_html_formatter:
${pygments_html_formatter.get_style_defs()}
.linenos { min-width: 2.5em; text-align: right; }
pre { margin: 0; }
.syntax-highlighted { padding: 0 10px; }
.syntax-highlightedtable { border-spacing: 1px; }
.nonhighlight { border-top: 1px solid #DFDFDF;
border-bottom: 1px solid #DFDFDF; }
.stacktrace .nonhighlight { margin: 5px 15px 10px; }
.sourceline { margin: 0 0; font-family:monospace; }
.code { background-color: #F8F8F8; width: 100%; }
.error .code { background-color: #FFBDBD; }
.error .syntax-highlighted { background-color: #FFBDBD; }
% endif
</style>
% endif
% if full:
</head>
<body>
% endif
<h2>Error !</h2>
<%
tback = RichTraceback(error=error, traceback=traceback)
src = tback.source
line = tback.lineno
if src:
lines = src.split('\n')
else:
lines = None
%>
<h3>${tback.errorname}: ${tback.message|h}</h3>
% if lines:
<div class="sample">
<div class="nonhighlight">
% for index in range(max(0, line-4),min(len(lines), line+5)):
<%
if pygments_html_formatter:
pygments_html_formatter.linenostart = index + 1
%>
% if index + 1 == line:
<%
if pygments_html_formatter:
old_cssclass = pygments_html_formatter.cssclass
pygments_html_formatter.cssclass = 'error ' + old_cssclass
%>
${lines[index] | syntax_highlight(language='mako')}
<%
if pygments_html_formatter:
pygments_html_formatter.cssclass = old_cssclass
%>
% else:
${lines[index] | syntax_highlight(language='mako')}
% endif
% endfor
</div>
</div>
% endif
<div class="stacktrace">
% for (filename, lineno, function, line) in tback.reverse_traceback:
<div class="location">${filename}, line ${lineno}:</div>
<div class="nonhighlight">
<%
if pygments_html_formatter:
pygments_html_formatter.linenostart = lineno
%>
<div class="sourceline">${line | syntax_highlight(filename)}</div>
</div>
% endfor
</div>
% if full:
</body>
</html>
% endif
""", output_encoding=sys.getdefaultencoding(),
encoding_errors='htmlentityreplace')

View File

@ -0,0 +1,201 @@
# mako/filters.py
# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
import re
import codecs
from mako.compat import quote_plus, unquote_plus, codepoint2name, \
name2codepoint
from mako import compat
xml_escapes = {
'&': '&amp;',
'>': '&gt;',
'<': '&lt;',
'"': '&#34;', # also &quot; in html-only
"'": '&#39;' # also &apos; in html-only
}
# XXX: &quot; is valid in HTML and XML
# &apos; is not valid HTML, but is valid XML
def legacy_html_escape(s):
"""legacy HTML escape for non-unicode mode."""
s = s.replace("&", "&amp;")
s = s.replace(">", "&gt;")
s = s.replace("<", "&lt;")
s = s.replace('"', "&#34;")
s = s.replace("'", "&#39;")
return s
try:
import markupsafe
html_escape = markupsafe.escape
except ImportError:
html_escape = legacy_html_escape
def xml_escape(string):
return re.sub(r'([&<"\'>])', lambda m: xml_escapes[m.group()], string)
def url_escape(string):
# convert into a list of octets
string = string.encode("utf8")
return quote_plus(string)
def legacy_url_escape(string):
# convert into a list of octets
return quote_plus(string)
def url_unescape(string):
text = unquote_plus(string)
if not is_ascii_str(text):
text = text.decode("utf8")
return text
def trim(string):
return string.strip()
class Decode(object):
def __getattr__(self, key):
def decode(x):
if isinstance(x, compat.text_type):
return x
elif not isinstance(x, compat.binary_type):
return decode(str(x))
else:
return compat.text_type(x, encoding=key)
return decode
decode = Decode()
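# The attribute name doubles as the codec name (sketch); any codec
# Python knows is valid as the attribute on the helper above.
if __name__ == '__main__':
    print(decode.latin1(compat.b("caf\xe9")))   # -> u'caf\xe9'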
_ASCII_re = re.compile(r'\A[\x00-\x7f]*\Z')
def is_ascii_str(text):
return isinstance(text, str) and _ASCII_re.match(text)
################################################################
class XMLEntityEscaper(object):
def __init__(self, codepoint2name, name2codepoint):
self.codepoint2entity = dict([(c, compat.text_type('&%s;' % n))
for c, n in codepoint2name.items()])
self.name2codepoint = name2codepoint
def escape_entities(self, text):
"""Replace characters with their character entity references.
Only characters corresponding to a named entity are replaced.
"""
return compat.text_type(text).translate(self.codepoint2entity)
def __escape(self, m):
codepoint = ord(m.group())
try:
return self.codepoint2entity[codepoint]
except (KeyError, IndexError):
return '&#x%X;' % codepoint
__escapable = re.compile(r'["&<>]|[^\x00-\x7f]')
def escape(self, text):
"""Replace characters with their character references.
Replace characters by their named entity references.
Non-ASCII characters, if they do not have a named entity reference,
are replaced by numerical character references.
The return value is guaranteed to be ASCII.
"""
return self.__escapable.sub(self.__escape, compat.text_type(text)
).encode('ascii')
# XXX: This regexp will not match all valid XML entity names__.
# (It punts on details involving CombiningChars and Extenders.)
#
# .. __: http://www.w3.org/TR/2000/REC-xml-20001006#NT-EntityRef
__characterrefs = re.compile(r'''& (?:
\#(\d+)
| \#x([\da-f]+)
| ( (?!\d) [:\w] [-.:\w]+ )
) ;''',
re.X | re.UNICODE)
def __unescape(self, m):
dval, hval, name = m.groups()
if dval:
codepoint = int(dval)
elif hval:
codepoint = int(hval, 16)
else:
codepoint = self.name2codepoint.get(name, 0xfffd)
# U+FFFD = "REPLACEMENT CHARACTER"
return chr(codepoint)
def unescape(self, text):
"""Unescape character references.
All character references (both entity references and numerical
character references) are unescaped.
"""
return self.__characterrefs.sub(self.__unescape, text)
_html_entities_escaper = XMLEntityEscaper(codepoint2name, name2codepoint)
html_entities_escape = _html_entities_escaper.escape_entities
html_entities_unescape = _html_entities_escaper.unescape
def htmlentityreplace_errors(ex):
"""An encoding error handler.
This python `codecs`_ error handler replaces unencodable
characters with HTML entities, or, if no HTML entity exists for
the character, XML character references.
>>> u'The cost was \u20ac12.'.encode('latin1', 'htmlentityreplace')
'The cost was &euro;12.'
"""
if isinstance(ex, UnicodeEncodeError):
# Handle encoding errors
bad_text = ex.object[ex.start:ex.end]
text = _html_entities_escaper.escape(bad_text)
return (compat.text_type(text), ex.end)
raise ex
codecs.register_error('htmlentityreplace', htmlentityreplace_errors)
# TODO: options to make this dynamic per-compilation will be added in a later
# release
DEFAULT_ESCAPES = {
'x': 'filters.xml_escape',
'h': 'filters.html_escape',
'u': 'filters.url_escape',
'trim': 'filters.trim',
'entity': 'filters.html_entities_escape',
'unicode': 'unicode',
'decode': 'decode',
'str': 'str',
'n': 'n'
}
if compat.py3k:
DEFAULT_ESCAPES.update({
'unicode': 'str'
})
NON_UNICODE_ESCAPES = DEFAULT_ESCAPES.copy()
NON_UNICODE_ESCAPES['h'] = 'filters.legacy_html_escape'
NON_UNICODE_ESCAPES['u'] = 'filters.legacy_url_escape'
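# These are the names a template references after the '|' in an
# expression escape; for instance (sketch):
if __name__ == '__main__':
    from mako.template import Template
    # '|h' routes the value through filters.html_escape before output.
    print(Template("${msg | h}").render(msg="<b>hi</b>"))
    # -> &lt;b&gt;hi&lt;/b&gt;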

View File

@ -0,0 +1,441 @@
# mako/lexer.py
# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
"""provides the Lexer class for parsing template strings into parse trees."""
import re
import codecs
from mako import parsetree, exceptions, compat
from mako.pygen import adjust_whitespace
_regexp_cache = {}
class Lexer(object):
def __init__(self, text, filename=None,
disable_unicode=False,
input_encoding=None, preprocessor=None):
self.text = text
self.filename = filename
self.template = parsetree.TemplateNode(self.filename)
self.matched_lineno = 1
self.matched_charpos = 0
self.lineno = 1
self.match_position = 0
self.tag = []
self.control_line = []
self.ternary_stack = []
self.disable_unicode = disable_unicode
self.encoding = input_encoding
if compat.py3k and disable_unicode:
raise exceptions.UnsupportedError(
"Mako for Python 3 does not "
"support disabling Unicode")
if preprocessor is None:
self.preprocessor = []
elif not hasattr(preprocessor, '__iter__'):
self.preprocessor = [preprocessor]
else:
self.preprocessor = preprocessor
@property
def exception_kwargs(self):
return {'source': self.text,
'lineno': self.matched_lineno,
'pos': self.matched_charpos,
'filename': self.filename}
def match(self, regexp, flags=None):
"""compile the given regexp, cache the reg, and call match_reg()."""
try:
reg = _regexp_cache[(regexp, flags)]
except KeyError:
if flags:
reg = re.compile(regexp, flags)
else:
reg = re.compile(regexp)
_regexp_cache[(regexp, flags)] = reg
return self.match_reg(reg)
def match_reg(self, reg):
"""match the given regular expression object to the current text
position.
if a match occurs, update the current text and line position.
"""
mp = self.match_position
match = reg.match(self.text, self.match_position)
if match:
(start, end) = match.span()
if end == start:
self.match_position = end + 1
else:
self.match_position = end
self.matched_lineno = self.lineno
lines = re.findall(r"\n", self.text[mp:self.match_position])
cp = mp - 1
while (cp >= 0 and cp < self.textlength and self.text[cp] != '\n'):
cp -= 1
self.matched_charpos = mp - cp
self.lineno += len(lines)
#print "MATCHED:", match.group(0), "LINE START:",
# self.matched_lineno, "LINE END:", self.lineno
#print "MATCH:", regexp, "\n", self.text[mp : mp + 15], \
# (match and "TRUE" or "FALSE")
return match
def parse_until_text(self, *text):
startpos = self.match_position
text_re = r'|'.join(text)
brace_level = 0
while True:
match = self.match(r'#.*\n')
if match:
continue
match = self.match(r'(\"\"\"|\'\'\'|\"|\')((?<!\\)\\\1|.)*?\1',
re.S)
if match:
continue
match = self.match(r'(%s)' % text_re)
if match:
if match.group(1) == '}' and brace_level > 0:
brace_level -= 1
continue
return \
self.text[startpos:
self.match_position - len(match.group(1))],\
match.group(1)
match = self.match(r"(.*?)(?=\"|\'|#|%s)" % text_re, re.S)
if match:
brace_level += match.group(1).count('{')
brace_level -= match.group(1).count('}')
continue
raise exceptions.SyntaxException(
"Expected: %s" %
','.join(text),
**self.exception_kwargs)
def append_node(self, nodecls, *args, **kwargs):
kwargs.setdefault('source', self.text)
kwargs.setdefault('lineno', self.matched_lineno)
kwargs.setdefault('pos', self.matched_charpos)
kwargs['filename'] = self.filename
node = nodecls(*args, **kwargs)
if len(self.tag):
self.tag[-1].nodes.append(node)
else:
self.template.nodes.append(node)
# build a set of child nodes for the control line
# (used for loop variable detection)
# also build a set of child nodes on ternary control lines
# (used for determining if a pass needs to be auto-inserted)
if self.control_line:
control_frame = self.control_line[-1]
control_frame.nodes.append(node)
if not (isinstance(node, parsetree.ControlLine) and
control_frame.is_ternary(node.keyword)):
if self.ternary_stack and self.ternary_stack[-1]:
self.ternary_stack[-1][-1].nodes.append(node)
if isinstance(node, parsetree.Tag):
if len(self.tag):
node.parent = self.tag[-1]
self.tag.append(node)
elif isinstance(node, parsetree.ControlLine):
if node.isend:
self.control_line.pop()
self.ternary_stack.pop()
elif node.is_primary:
self.control_line.append(node)
self.ternary_stack.append([])
elif self.control_line and \
self.control_line[-1].is_ternary(node.keyword):
self.ternary_stack[-1].append(node)
elif self.control_line and \
not self.control_line[-1].is_ternary(node.keyword):
raise exceptions.SyntaxException(
"Keyword '%s' not a legal ternary for keyword '%s'" %
(node.keyword, self.control_line[-1].keyword),
**self.exception_kwargs)
_coding_re = re.compile(r'#.*coding[:=]\s*([-\w.]+).*\r?\n')
def decode_raw_stream(self, text, decode_raw, known_encoding, filename):
"""given string/unicode or bytes/string, determine encoding
from magic encoding comment, return body as unicode
or raw if decode_raw=False
"""
if isinstance(text, compat.text_type):
m = self._coding_re.match(text)
encoding = m and m.group(1) or known_encoding or 'ascii'
return encoding, text
if text.startswith(codecs.BOM_UTF8):
text = text[len(codecs.BOM_UTF8):]
parsed_encoding = 'utf-8'
m = self._coding_re.match(text.decode('utf-8', 'ignore'))
if m is not None and m.group(1) != 'utf-8':
raise exceptions.CompileException(
"Found utf-8 BOM in file, with conflicting "
"magic encoding comment of '%s'" % m.group(1),
text.decode('utf-8', 'ignore'),
0, 0, filename)
else:
m = self._coding_re.match(text.decode('utf-8', 'ignore'))
if m:
parsed_encoding = m.group(1)
else:
parsed_encoding = known_encoding or 'ascii'
if decode_raw:
try:
text = text.decode(parsed_encoding)
except UnicodeDecodeError:
raise exceptions.CompileException(
"Unicode decode operation of encoding '%s' failed" %
parsed_encoding,
text.decode('utf-8', 'ignore'),
0, 0, filename)
return parsed_encoding, text
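# Sketch (not part of the original module): given a byte string with a
# magic encoding comment, the method above reports the declared encoding
# and decodes the body:
#
#   Lexer(u'').decode_raw_stream(
#       b'# -*- coding: utf-8 -*-\nhola \xc2\xa1mundo!\n', True, None, None)
#   -> ('utf-8', u'# -*- coding: utf-8 -*-\nhola \xa1mundo!\n')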
def parse(self):
self.encoding, self.text = self.decode_raw_stream(self.text,
not self.disable_unicode,
self.encoding,
self.filename,)
for preproc in self.preprocessor:
self.text = preproc(self.text)
# push the match marker past the
# encoding comment.
self.match_reg(self._coding_re)
self.textlength = len(self.text)
while (True):
if self.match_position > self.textlength:
break
if self.match_end():
break
if self.match_expression():
continue
if self.match_control_line():
continue
if self.match_comment():
continue
if self.match_tag_start():
continue
if self.match_tag_end():
continue
if self.match_python_block():
continue
if self.match_text():
continue
if self.match_position > self.textlength:
break
raise exceptions.CompileException("assertion failed")
if len(self.tag):
raise exceptions.SyntaxException("Unclosed tag: <%%%s>" %
self.tag[-1].keyword,
**self.exception_kwargs)
if len(self.control_line):
raise exceptions.SyntaxException(
"Unterminated control keyword: '%s'" %
self.control_line[-1].keyword,
self.text,
self.control_line[-1].lineno,
self.control_line[-1].pos, self.filename)
return self.template
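# Sketch (not part of the original module): end to end, the lexer turns
# template source into a TemplateNode whose children are parsetree nodes:
#
#   tmpl = Lexer(u'hello, ${name}!\n').parse()
#   [type(n).__name__ for n in tmpl.nodes]
#   -> ['Text', 'Expression', 'Text']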
def match_tag_start(self):
match = self.match(r'''
\<% # opening tag
([\w\.\:]+) # keyword
((?:\s+\w+|\s*=\s*|".*?"|'.*?')*) # attrname, = \
# sign, string expression
\s* # more whitespace
(/)?> # closing
''',
re.I | re.S | re.X)
if match:
keyword, attr, isend = match.groups()
self.keyword = keyword
attributes = {}
if attr:
for att in re.findall(
r"\s*(\w+)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")", attr):
key, val1, val2 = att
text = val1 or val2
text = text.replace('\r\n', '\n')
attributes[key] = text
self.append_node(parsetree.Tag, keyword, attributes)
if isend:
self.tag.pop()
else:
if keyword == 'text':
match = self.match(r'(.*?)(?=\</%text>)', re.S)
if not match:
raise exceptions.SyntaxException(
"Unclosed tag: <%%%s>" %
self.tag[-1].keyword,
**self.exception_kwargs)
self.append_node(parsetree.Text, match.group(1))
return self.match_tag_end()
return True
else:
return False
def match_tag_end(self):
match = self.match(r'\</%[\t ]*(.+?)[\t ]*>')
if match:
if not len(self.tag):
raise exceptions.SyntaxException(
"Closing tag without opening tag: </%%%s>" %
match.group(1),
**self.exception_kwargs)
elif self.tag[-1].keyword != match.group(1):
raise exceptions.SyntaxException(
"Closing tag </%%%s> does not match tag: <%%%s>" %
(match.group(1), self.tag[-1].keyword),
**self.exception_kwargs)
self.tag.pop()
return True
else:
return False
def match_end(self):
match = self.match(r'\Z', re.S)
if match:
string = match.group()
if string:
return string
else:
return True
else:
return False
def match_text(self):
match = self.match(r"""
(.*?) # anything, followed by:
(
(?<=\n)(?=[ \t]*(?=%|\#\#)) # an eval or line-based
# comment preceded by a
# consumed newline and whitespace
|
(?=\${) # an expression
|
(?=</?[%&]) # a substitution or block or call start or end
# - don't consume
|
(\\\r?\n) # an escaped newline - throw away
|
\Z # end of string
)""", re.X | re.S)
if match:
text = match.group(1)
if text:
self.append_node(parsetree.Text, text)
return True
else:
return False
def match_python_block(self):
match = self.match(r"<%(!)?")
if match:
line, pos = self.matched_lineno, self.matched_charpos
text, end = self.parse_until_text(r'%>')
# the trailing newline helps
# compiler.parse() not complain about indentation
text = adjust_whitespace(text) + "\n"
self.append_node(
parsetree.Code,
text,
match.group(1) == '!', lineno=line, pos=pos)
return True
else:
return False
def match_expression(self):
match = self.match(r"\${")
if match:
line, pos = self.matched_lineno, self.matched_charpos
text, end = self.parse_until_text(r'\|', r'}')
if end == '|':
escapes, end = self.parse_until_text(r'}')
else:
escapes = ""
text = text.replace('\r\n', '\n')
self.append_node(
parsetree.Expression,
text, escapes.strip(),
lineno=line, pos=pos)
return True
else:
return False
def match_control_line(self):
match = self.match(
r"(?<=^)[\t ]*(%(?!%)|##)[\t ]*((?:(?:\\r?\n)|[^\r\n])*)"
r"(?:\r?\n|\Z)", re.M)
if match:
operator = match.group(1)
text = match.group(2)
if operator == '%':
m2 = re.match(r'(end)?(\w+)\s*(.*)', text)
if not m2:
raise exceptions.SyntaxException(
"Invalid control line: '%s'" %
text,
**self.exception_kwargs)
isend, keyword = m2.group(1, 2)
isend = (isend is not None)
if isend:
if not len(self.control_line):
raise exceptions.SyntaxException(
"No starting keyword '%s' for '%s'" %
(keyword, text),
**self.exception_kwargs)
elif self.control_line[-1].keyword != keyword:
raise exceptions.SyntaxException(
"Keyword '%s' doesn't match keyword '%s'" %
(text, self.control_line[-1].keyword),
**self.exception_kwargs)
self.append_node(parsetree.ControlLine, keyword, isend, text)
else:
self.append_node(parsetree.Comment, text)
return True
else:
return False
def match_comment(self):
"""matches the multiline version of a comment"""
match = self.match(r"<%doc>(.*?)</%doc>", re.S)
if match:
self.append_node(parsetree.Comment, match.group(1))
return True
else:
return False

View File

@ -0,0 +1,359 @@
# mako/lookup.py
# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
import os, stat, posixpath, re
from mako import exceptions, util
from mako.template import Template
try:
import threading
except ImportError:
import dummy_threading as threading
class TemplateCollection(object):
"""Represent a collection of :class:`.Template` objects,
identifiable via URI.
A :class:`.TemplateCollection` is linked to the usage of
all template tags that address other templates, such
as ``<%include>``, ``<%namespace>``, and ``<%inherit>``.
The ``file`` attribute of each of those tags refers
to a string URI that is passed to that :class:`.Template`
object's :class:`.TemplateCollection` for resolution.
:class:`.TemplateCollection` is an abstract class,
with the usual default implementation being :class:`.TemplateLookup`.
"""
def has_template(self, uri):
"""Return ``True`` if this :class:`.TemplateLookup` is
capable of returning a :class:`.Template` object for the
given ``uri``.
:param uri: String URI of the template to be resolved.
"""
try:
self.get_template(uri)
return True
except exceptions.TemplateLookupException:
return False
def get_template(self, uri, relativeto=None):
"""Return a :class:`.Template` object corresponding to the given
``uri``.
The default implementation raises
:class:`.NotImplementedError`. Implementations should
raise :class:`.TemplateLookupException` if the given ``uri``
cannot be resolved.
:param uri: String URI of the template to be resolved.
:param relativeto: if present, the given ``uri`` is assumed to
be relative to this URI.
"""
raise NotImplementedError()
def filename_to_uri(self, uri, filename):
"""Convert the given ``filename`` to a URI relative to
this :class:`.TemplateCollection`."""
return uri
def adjust_uri(self, uri, filename):
"""Adjust the given ``uri`` based on the calling ``filename``.
When this method is called from the runtime, the
``filename`` parameter is taken directly from the ``filename``
attribute of the calling template. Therefore a custom
:class:`.TemplateCollection` subclass can place any string
identifier desired in the ``filename`` parameter of the
:class:`.Template` objects it constructs and have them come back
here.
"""
return uri
class TemplateLookup(TemplateCollection):
"""Represent a collection of templates that locates template source files
from the local filesystem.
The primary argument is the ``directories`` argument, the list of
directories to search:
.. sourcecode:: python
lookup = TemplateLookup(["/path/to/templates"])
some_template = lookup.get_template("/index.html")
The :class:`.TemplateLookup` can also be given :class:`.Template` objects
programmatically using :meth:`.put_string` or :meth:`.put_template`:
.. sourcecode:: python
lookup = TemplateLookup()
lookup.put_string("base.html", '''
<html><body>${self.next()}</body></html>
''')
lookup.put_string("hello.html", '''
<%include file='base.html'/>
Hello, world !
''')
:param directories: A list of directory names which will be
searched for a particular template URI. The URI is appended
to each directory and the filesystem checked.
:param collection_size: Approximate size of the collection used
to store templates. If left at its default of ``-1``, the size
is unbounded, and a plain Python dictionary is used to
relate URI strings to :class:`.Template` instances.
Otherwise, a least-recently-used cache object is used which
will maintain the size of the collection approximately to
the number given.
:param filesystem_checks: When at its default value of ``True``,
each call to :meth:`.TemplateLookup.get_template()` will
compare the filesystem last modified time to the time in
which an existing :class:`.Template` object was created.
This allows the :class:`.TemplateLookup` to regenerate a
new :class:`.Template` whenever the original source has
been updated. Set this to ``False`` for a very minor
performance increase.
:param modulename_callable: A callable which, when present,
is passed the path of the source file as well as the
requested URI, and then returns the full path of the
generated Python module file. This is used to inject
alternate schemes for Python module location. If left at
its default of ``None``, the built-in system of generation
based on ``module_directory`` plus ``uri`` is used.
All other keyword parameters available for
:class:`.Template` are mirrored here. When new
:class:`.Template` objects are created, the keywords
established with this :class:`.TemplateLookup` are passed on
to each new :class:`.Template`.
"""
def __init__(self,
directories=None,
module_directory=None,
filesystem_checks=True,
collection_size=-1,
format_exceptions=False,
error_handler=None,
disable_unicode=False,
bytestring_passthrough=False,
output_encoding=None,
encoding_errors='strict',
cache_args=None,
cache_impl='beaker',
cache_enabled=True,
cache_type=None,
cache_dir=None,
cache_url=None,
modulename_callable=None,
module_writer=None,
default_filters=None,
buffer_filters=(),
strict_undefined=False,
imports=None,
future_imports=None,
enable_loop=True,
input_encoding=None,
preprocessor=None,
lexer_cls=None):
self.directories = [posixpath.normpath(d) for d in
util.to_list(directories, ())
]
self.module_directory = module_directory
self.modulename_callable = modulename_callable
self.filesystem_checks = filesystem_checks
self.collection_size = collection_size
if cache_args is None:
cache_args = {}
# transfer deprecated cache_* args
if cache_dir:
cache_args.setdefault('dir', cache_dir)
if cache_url:
cache_args.setdefault('url', cache_url)
if cache_type:
cache_args.setdefault('type', cache_type)
self.template_args = {
'format_exceptions':format_exceptions,
'error_handler':error_handler,
'disable_unicode':disable_unicode,
'bytestring_passthrough':bytestring_passthrough,
'output_encoding':output_encoding,
'cache_impl':cache_impl,
'encoding_errors':encoding_errors,
'input_encoding':input_encoding,
'module_directory':module_directory,
'module_writer':module_writer,
'cache_args':cache_args,
'cache_enabled':cache_enabled,
'default_filters':default_filters,
'buffer_filters':buffer_filters,
'strict_undefined':strict_undefined,
'imports':imports,
'future_imports':future_imports,
'enable_loop':enable_loop,
'preprocessor':preprocessor,
'lexer_cls':lexer_cls
}
if collection_size == -1:
self._collection = {}
self._uri_cache = {}
else:
self._collection = util.LRUCache(collection_size)
self._uri_cache = util.LRUCache(collection_size)
self._mutex = threading.Lock()
def get_template(self, uri):
"""Return a :class:`.Template` object corresponding to the given
``uri``.
.. note:: The ``relativeto`` argument is not supported here at the moment.
"""
try:
if self.filesystem_checks:
return self._check(uri, self._collection[uri])
else:
return self._collection[uri]
except KeyError:
u = re.sub(r'^\/+', '', uri)
for dir in self.directories:
srcfile = posixpath.normpath(posixpath.join(dir, u))
if os.path.isfile(srcfile):
return self._load(srcfile, uri)
else:
raise exceptions.TopLevelLookupException(
"Cant locate template for uri %r" % uri)
def adjust_uri(self, uri, relativeto):
"""Adjust the given ``uri`` based on the given relative URI."""
key = (uri, relativeto)
if key in self._uri_cache:
return self._uri_cache[key]
if uri[0] != '/':
if relativeto is not None:
v = self._uri_cache[key] = posixpath.join(
posixpath.dirname(relativeto), uri)
else:
v = self._uri_cache[key] = '/' + uri
else:
v = self._uri_cache[key] = uri
return v
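# Illustration (not part of the original module): relative URIs resolve
# against the directory of the calling template, while URIs that already
# begin with '/' pass through unchanged:
#
#   lookup.adjust_uri('widgets.html', '/pages/index.html')
#   -> '/pages/widgets.html'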
def filename_to_uri(self, filename):
"""Convert the given ``filename`` to a URI relative to
this :class:`.TemplateCollection`."""
try:
return self._uri_cache[filename]
except KeyError:
value = self._relativeize(filename)
self._uri_cache[filename] = value
return value
def _relativeize(self, filename):
"""Return the portion of a filename that is 'relative'
to the directories in this lookup.
"""
filename = posixpath.normpath(filename)
for dir in self.directories:
if filename[0:len(dir)] == dir:
return filename[len(dir):]
else:
return None
def _load(self, filename, uri):
self._mutex.acquire()
try:
try:
# try returning from collection one
# more time in case concurrent thread already loaded
return self._collection[uri]
except KeyError:
pass
try:
if self.modulename_callable is not None:
module_filename = self.modulename_callable(filename, uri)
else:
module_filename = None
self._collection[uri] = template = Template(
uri=uri,
filename=posixpath.normpath(filename),
lookup=self,
module_filename=module_filename,
**self.template_args)
return template
except:
# if compilation fails etc, ensure
# template is removed from collection,
# re-raise
self._collection.pop(uri, None)
raise
finally:
self._mutex.release()
def _check(self, uri, template):
if template.filename is None:
return template
try:
template_stat = os.stat(template.filename)
if template.module._modified_time < \
template_stat[stat.ST_MTIME]:
self._collection.pop(uri, None)
return self._load(template.filename, uri)
else:
return template
except OSError:
self._collection.pop(uri, None)
raise exceptions.TemplateLookupException(
"Cant locate template for uri %r" % uri)
def put_string(self, uri, text):
"""Place a new :class:`.Template` object into this
:class:`.TemplateLookup`, based on the given string of
``text``.
"""
self._collection[uri] = Template(
text,
lookup=self,
uri=uri,
**self.template_args)
def put_template(self, uri, template):
"""Place a new :class:`.Template` object into this
:class:`.TemplateLookup`, based on the given
:class:`.Template` object.
"""
self._collection[uri] = template
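# Sketch (not part of the original module): the in-memory path, combining
# put_string() with get_template():
#
#   lookup = TemplateLookup()
#   lookup.put_string('/hello.html', u'hello, ${name}!')
#   lookup.get_template('/hello.html').render(name='mako')
#   -> u'hello, mako!'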

View File

@ -0,0 +1,594 @@
# mako/parsetree.py
# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
"""defines the parse tree components for Mako templates."""
from mako import exceptions, ast, util, filters, compat
import re
class Node(object):
"""base class for a Node in the parse tree."""
def __init__(self, source, lineno, pos, filename):
self.source = source
self.lineno = lineno
self.pos = pos
self.filename = filename
@property
def exception_kwargs(self):
return {'source': self.source, 'lineno': self.lineno,
'pos': self.pos, 'filename': self.filename}
def get_children(self):
return []
def accept_visitor(self, visitor):
def traverse(node):
for n in node.get_children():
n.accept_visitor(visitor)
method = getattr(visitor, "visit" + self.__class__.__name__, traverse)
method(self)
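# Sketch (not part of the original module): accept_visitor() dispatches on
# "visit" + the node's class name, so a minimal visitor that collects the
# plain-text chunks of a parsed template could look like:
#
#   class TextCollector(object):
#       def __init__(self):
#           self.chunks = []
#       def visitText(self, node):
#           self.chunks.append(node.content)
#
# Node classes without a matching visitXXX method simply have their
# children traversed.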
class TemplateNode(Node):
"""a 'container' node that stores the overall collection of nodes."""
def __init__(self, filename):
super(TemplateNode, self).__init__('', 0, 0, filename)
self.nodes = []
self.page_attributes = {}
def get_children(self):
return self.nodes
def __repr__(self):
return "TemplateNode(%s, %r)" % (
util.sorted_dict_repr(self.page_attributes),
self.nodes)
class ControlLine(Node):
"""defines a control line, a line-oriented python line or end tag.
e.g.::
% if foo:
(markup)
% endif
"""
has_loop_context = False
def __init__(self, keyword, isend, text, **kwargs):
super(ControlLine, self).__init__(**kwargs)
self.text = text
self.keyword = keyword
self.isend = isend
self.is_primary = keyword in ['for', 'if', 'while', 'try', 'with']
self.nodes = []
if self.isend:
self._declared_identifiers = []
self._undeclared_identifiers = []
else:
code = ast.PythonFragment(text, **self.exception_kwargs)
self._declared_identifiers = code.declared_identifiers
self._undeclared_identifiers = code.undeclared_identifiers
def get_children(self):
return self.nodes
def declared_identifiers(self):
return self._declared_identifiers
def undeclared_identifiers(self):
return self._undeclared_identifiers
def is_ternary(self, keyword):
"""return true if the given keyword is a ternary keyword
for this ControlLine"""
return keyword in {
'if':set(['else', 'elif']),
'try':set(['except', 'finally']),
'for':set(['else'])
}.get(self.keyword, [])
def __repr__(self):
return "ControlLine(%r, %r, %r, %r)" % (
self.keyword,
self.text,
self.isend,
(self.lineno, self.pos)
)
class Text(Node):
"""defines plain text in the template."""
def __init__(self, content, **kwargs):
super(Text, self).__init__(**kwargs)
self.content = content
def __repr__(self):
return "Text(%r, %r)" % (self.content, (self.lineno, self.pos))
class Code(Node):
"""defines a Python code block, either inline or module level.
e.g.::
inline:
<%
x = 12
%>
module level:
<%!
import logger
%>
"""
def __init__(self, text, ismodule, **kwargs):
super(Code, self).__init__(**kwargs)
self.text = text
self.ismodule = ismodule
self.code = ast.PythonCode(text, **self.exception_kwargs)
def declared_identifiers(self):
return self.code.declared_identifiers
def undeclared_identifiers(self):
return self.code.undeclared_identifiers
def __repr__(self):
return "Code(%r, %r, %r)" % (
self.text,
self.ismodule,
(self.lineno, self.pos)
)
class Comment(Node):
"""defines a comment line.
# this is a comment
"""
def __init__(self, text, **kwargs):
super(Comment, self).__init__(**kwargs)
self.text = text
def __repr__(self):
return "Comment(%r, %r)" % (self.text, (self.lineno, self.pos))
class Expression(Node):
"""defines an inline expression.
${x+y}
"""
def __init__(self, text, escapes, **kwargs):
super(Expression, self).__init__(**kwargs)
self.text = text
self.escapes = escapes
self.escapes_code = ast.ArgumentList(escapes, **self.exception_kwargs)
self.code = ast.PythonCode(text, **self.exception_kwargs)
def declared_identifiers(self):
return []
def undeclared_identifiers(self):
# TODO: make the "filter" shortcut list configurable at parse/gen time
return self.code.undeclared_identifiers.union(
self.escapes_code.undeclared_identifiers.difference(
set(filters.DEFAULT_ESCAPES.keys())
)
).difference(self.code.declared_identifiers)
def __repr__(self):
return "Expression(%r, %r, %r)" % (
self.text,
self.escapes_code.args,
(self.lineno, self.pos)
)
class _TagMeta(type):
"""metaclass to allow Tag to produce a subclass according to
its keyword"""
_classmap = {}
def __init__(cls, clsname, bases, dict):
if getattr(cls, '__keyword__', None) is not None:
cls._classmap[cls.__keyword__] = cls
super(_TagMeta, cls).__init__(clsname, bases, dict)
def __call__(cls, keyword, attributes, **kwargs):
if ":" in keyword:
ns, defname = keyword.split(':')
return type.__call__(CallNamespaceTag, ns, defname,
attributes, **kwargs)
try:
cls = _TagMeta._classmap[keyword]
except KeyError:
raise exceptions.CompileException(
"No such tag: '%s'" % keyword,
source=kwargs['source'],
lineno=kwargs['lineno'],
pos=kwargs['pos'],
filename=kwargs['filename']
)
return type.__call__(cls, keyword, attributes, **kwargs)
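# Note (not part of the original module): the metaclass above routes
# construction by keyword, so instantiating Tag with keyword 'include'
# actually returns an IncludeTag, and a namespaced keyword such as
# 'ns:defname' constructs a CallNamespaceTag.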
class Tag(compat.with_metaclass(_TagMeta, Node)):
"""abstract base class for tags.
<%sometag/>
<%someothertag>
stuff
</%someothertag>
"""
__keyword__ = None
def __init__(self, keyword, attributes, expressions,
nonexpressions, required, **kwargs):
"""construct a new Tag instance.
this constructor is not called directly; it is only called
by subclasses.
:param keyword: the tag keyword
:param attributes: raw dictionary of attribute key/value pairs
:param expressions: a set of identifiers that are legal attributes,
which can also contain embedded expressions
:param nonexpressions: a set of identifiers that are legal
attributes, which cannot contain embedded expressions
:param \**kwargs:
other arguments passed to the Node superclass (lineno, pos)
"""
super(Tag, self).__init__(**kwargs)
self.keyword = keyword
self.attributes = attributes
self._parse_attributes(expressions, nonexpressions)
missing = [r for r in required if r not in self.parsed_attributes]
if len(missing):
raise exceptions.CompileException(
"Missing attribute(s): %s" %
",".join([repr(m) for m in missing]),
**self.exception_kwargs)
self.parent = None
self.nodes = []
def is_root(self):
return self.parent is None
def get_children(self):
return self.nodes
def _parse_attributes(self, expressions, nonexpressions):
undeclared_identifiers = set()
self.parsed_attributes = {}
for key in self.attributes:
if key in expressions:
expr = []
for x in re.compile(r'(\${.+?})',
re.S).split(self.attributes[key]):
m = re.compile(r'^\${(.+?)}$', re.S).match(x)
if m:
code = ast.PythonCode(m.group(1).rstrip(),
**self.exception_kwargs)
# we aren't discarding "declared_identifiers" here, as we
# do elsewhere so that list comprehension-declared
# variables aren't counted. As yet, no condition has been
# found that requires discarding them here.
undeclared_identifiers = \
undeclared_identifiers.union(
code.undeclared_identifiers)
expr.append('(%s)' % m.group(1))
else:
if x:
expr.append(repr(x))
self.parsed_attributes[key] = " + ".join(expr) or repr('')
elif key in nonexpressions:
if re.search(r'\${.+?}', self.attributes[key]):
raise exceptions.CompileException(
"Attibute '%s' in tag '%s' does not allow embedded "
"expressions" % (key, self.keyword),
**self.exception_kwargs)
self.parsed_attributes[key] = repr(self.attributes[key])
else:
raise exceptions.CompileException(
"Invalid attribute for tag '%s': '%s'" %
(self.keyword, key),
**self.exception_kwargs)
self.expression_undeclared_identifiers = undeclared_identifiers
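# Illustration (not part of the original module): for an expression-capable
# attribute, the raw value compiles to a concatenation of string literals
# and parenthesized expressions, e.g. {'file': 'foo/${name}.html'} yields
# parsed_attributes['file'] == "'foo/' + (name) + '.html'".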
def declared_identifiers(self):
return []
def undeclared_identifiers(self):
return self.expression_undeclared_identifiers
def __repr__(self):
return "%s(%r, %s, %r, %r)" % (self.__class__.__name__,
self.keyword,
util.sorted_dict_repr(self.attributes),
(self.lineno, self.pos),
self.nodes
)
class IncludeTag(Tag):
__keyword__ = 'include'
def __init__(self, keyword, attributes, **kwargs):
super(IncludeTag, self).__init__(
keyword,
attributes,
('file', 'import', 'args'),
(), ('file',), **kwargs)
self.page_args = ast.PythonCode(
"__DUMMY(%s)" % attributes.get('args', ''),
**self.exception_kwargs)
def declared_identifiers(self):
return []
def undeclared_identifiers(self):
identifiers = self.page_args.undeclared_identifiers.\
difference(set(["__DUMMY"])).\
difference(self.page_args.declared_identifiers)
return identifiers.union(super(IncludeTag, self).
undeclared_identifiers())
class NamespaceTag(Tag):
__keyword__ = 'namespace'
def __init__(self, keyword, attributes, **kwargs):
super(NamespaceTag, self).__init__(
keyword, attributes,
('file',),
('name','inheritable',
'import','module'),
(), **kwargs)
self.name = attributes.get('name', '__anon_%s' % hex(abs(id(self))))
if 'name' not in attributes and 'import' not in attributes:
raise exceptions.CompileException(
"'name' and/or 'import' attributes are required "
"for <%namespace>",
**self.exception_kwargs)
if 'file' in attributes and 'module' in attributes:
raise exceptions.CompileException(
"<%namespace> may only have one of 'file' or 'module'",
**self.exception_kwargs
)
def declared_identifiers(self):
return []
class TextTag(Tag):
__keyword__ = 'text'
def __init__(self, keyword, attributes, **kwargs):
super(TextTag, self).__init__(
keyword,
attributes, (),
('filter',), (), **kwargs)  # one-element tuple; a bare ('filter') is just a string
self.filter_args = ast.ArgumentList(
attributes.get('filter', ''),
**self.exception_kwargs)
def undeclared_identifiers(self):
return self.filter_args.\
undeclared_identifiers.\
difference(filters.DEFAULT_ESCAPES.keys()).union(
self.expression_undeclared_identifiers
)
class DefTag(Tag):
__keyword__ = 'def'
def __init__(self, keyword, attributes, **kwargs):
expressions = ['buffered', 'cached'] + [
c for c in attributes if c.startswith('cache_')]
super(DefTag, self).__init__(
keyword,
attributes,
expressions,
('name', 'filter', 'decorator'),
('name',),
**kwargs)
name = attributes['name']
if re.match(r'^[\w_]+$', name):
raise exceptions.CompileException(
"Missing parenthesis in %def",
**self.exception_kwargs)
self.function_decl = ast.FunctionDecl("def " + name + ":pass",
**self.exception_kwargs)
self.name = self.function_decl.funcname
self.decorator = attributes.get('decorator', '')
self.filter_args = ast.ArgumentList(
attributes.get('filter', ''),
**self.exception_kwargs)
is_anonymous = False
is_block = False
@property
def funcname(self):
return self.function_decl.funcname
def get_argument_expressions(self, **kw):
return self.function_decl.get_argument_expressions(**kw)
def declared_identifiers(self):
return self.function_decl.allargnames
def undeclared_identifiers(self):
res = []
for c in self.function_decl.defaults:
res += list(ast.PythonCode(c, **self.exception_kwargs).
undeclared_identifiers)
return set(res).union(
self.filter_args.\
undeclared_identifiers.\
difference(filters.DEFAULT_ESCAPES.keys())
).union(
self.expression_undeclared_identifiers
).difference(
self.function_decl.allargnames
)
class BlockTag(Tag):
__keyword__ = 'block'
def __init__(self, keyword, attributes, **kwargs):
expressions = ['buffered', 'cached', 'args'] + [
c for c in attributes if c.startswith('cache_')]
super(BlockTag, self).__init__(
keyword,
attributes,
expressions,
('name','filter', 'decorator'),
(),
**kwargs)
name = attributes.get('name')
if name and not re.match(r'^[\w_]+$',name):
raise exceptions.CompileException(
"%block may not specify an argument signature",
**self.exception_kwargs)
if not name and attributes.get('args', None):
raise exceptions.CompileException(
"Only named %blocks may specify args",
**self.exception_kwargs
)
self.body_decl = ast.FunctionArgs(attributes.get('args', ''),
**self.exception_kwargs)
self.name = name
self.decorator = attributes.get('decorator', '')
self.filter_args = ast.ArgumentList(
attributes.get('filter', ''),
**self.exception_kwargs)
is_block = True
@property
def is_anonymous(self):
return self.name is None
@property
def funcname(self):
return self.name or "__M_anon_%d" % (self.lineno, )
def get_argument_expressions(self, **kw):
return self.body_decl.get_argument_expressions(**kw)
def declared_identifiers(self):
return self.body_decl.allargnames
def undeclared_identifiers(self):
return (self.filter_args.\
undeclared_identifiers.\
difference(filters.DEFAULT_ESCAPES.keys())
).union(self.expression_undeclared_identifiers)
class CallTag(Tag):
__keyword__ = 'call'
def __init__(self, keyword, attributes, **kwargs):
super(CallTag, self).__init__(keyword, attributes,
('args',), ('expr',), ('expr',), **kwargs)
self.expression = attributes['expr']
self.code = ast.PythonCode(self.expression, **self.exception_kwargs)
self.body_decl = ast.FunctionArgs(attributes.get('args', ''),
**self.exception_kwargs)
def declared_identifiers(self):
return self.code.declared_identifiers.union(self.body_decl.allargnames)
def undeclared_identifiers(self):
return self.code.undeclared_identifiers.\
difference(self.code.declared_identifiers)
class CallNamespaceTag(Tag):
def __init__(self, namespace, defname, attributes, **kwargs):
super(CallNamespaceTag, self).__init__(
namespace + ":" + defname,
attributes,
tuple(attributes.keys()) + ('args', ),
(),
(),
**kwargs)
self.expression = "%s.%s(%s)" % (
namespace,
defname,
",".join(["%s=%s" % (k, v) for k, v in
self.parsed_attributes.items()
if k != 'args'])
)
self.code = ast.PythonCode(self.expression, **self.exception_kwargs)
self.body_decl = ast.FunctionArgs(
attributes.get('args', ''),
**self.exception_kwargs)
def declared_identifiers(self):
return self.code.declared_identifiers.union(self.body_decl.allargnames)
def undeclared_identifiers(self):
return self.code.undeclared_identifiers.\
difference(self.code.declared_identifiers)
class InheritTag(Tag):
__keyword__ = 'inherit'
def __init__(self, keyword, attributes, **kwargs):
super(InheritTag, self).__init__(
keyword, attributes,
('file',), (), ('file',), **kwargs)
class PageTag(Tag):
__keyword__ = 'page'
def __init__(self, keyword, attributes, **kwargs):
expressions = ['cached', 'args', 'expression_filter', 'enable_loop'] + [
c for c in attributes if c.startswith('cache_')]
super(PageTag, self).__init__(
keyword,
attributes,
expressions,
(),
(),
**kwargs)
self.body_decl = ast.FunctionArgs(attributes.get('args', ''),
**self.exception_kwargs)
self.filter_args = ast.ArgumentList(
attributes.get('expression_filter', ''),
**self.exception_kwargs)
def declared_identifiers(self):
return self.body_decl.allargnames

View File

@ -0,0 +1,299 @@
# mako/pygen.py
# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
"""utilities for generating and formatting literal Python code."""
import re
from mako import exceptions
class PythonPrinter(object):
def __init__(self, stream):
# indentation counter
self.indent = 0
# a stack storing information about why we incremented
# the indentation counter, to help us determine if we
# should decrement it
self.indent_detail = []
# the string of whitespace multiplied by the indent
# counter to produce a line
self.indentstring = " "
# the stream we are writing to
self.stream = stream
# current line number
self.lineno = 1
# a list of lines that represents a buffered "block" of code,
# which can be later printed relative to an indent level
self.line_buffer = []
self.in_indent_lines = False
self._reset_multi_line_flags()
# mapping of generated python lines to template
# source lines
self.source_map = {}
def _update_lineno(self, num):
self.lineno += num
def start_source(self, lineno):
if self.lineno not in self.source_map:
self.source_map[self.lineno] = lineno
def write_blanks(self, num):
self.stream.write("\n" * num)
self._update_lineno(num)
def write_indented_block(self, block):
"""print a line or lines of python which already contain indentation.
The indentation of the total block of lines will be adjusted to that of
the current indent level."""
self.in_indent_lines = False
for l in re.split(r'\r?\n', block):
self.line_buffer.append(l)
self._update_lineno(1)
def writelines(self, *lines):
"""print a series of lines of python."""
for line in lines:
self.writeline(line)
def writeline(self, line):
"""print a line of python, indenting it according to the current
indent level.
this also adjusts the indentation counter according to the
content of the line.
"""
if not self.in_indent_lines:
self._flush_adjusted_lines()
self.in_indent_lines = True
if (line is None or
re.match(r"^\s*#",line) or
re.match(r"^\s*$", line)
):
hastext = False
else:
hastext = True
is_comment = bool(line) and line[0] == '#'
# see if this line should decrease the indentation level
if (not is_comment and
(not hastext or self._is_unindentor(line))
):
if self.indent > 0:
self.indent -= 1
# if the indent_detail stack is empty, the user
# probably put extra closures - the resulting
# module won't compile.
if len(self.indent_detail) == 0:
raise exceptions.SyntaxException(
"Too many whitespace closures")
self.indent_detail.pop()
if line is None:
return
# write the line
self.stream.write(self._indent_line(line) + "\n")
self._update_lineno(len(line.split("\n")))
# see if this line should increase the indentation level.
# note that a line can both decrease (before printing) and
# then increase (after printing) the indentation level.
if re.search(r":[ \t]*(?:#.*)?$", line):
# increment indentation count, and also
# keep track of what the keyword was that indented us,
# if it is a python compound statement keyword
# where we might have to look for an "unindent" keyword
match = re.match(r"^\s*(if|try|elif|while|for|with)", line)
if match:
# its a "compound" keyword, so we will check for "unindentors"
indentor = match.group(1)
self.indent += 1
self.indent_detail.append(indentor)
else:
indentor = None
# its not a "compound" keyword. but lets also
# test for valid Python keywords that might be indenting us,
# else assume its a non-indenting line
m2 = re.match(r"^\s*(def|class|else|elif|except|finally)",
line)
if m2:
self.indent += 1
self.indent_detail.append(indentor)
def close(self):
"""close this printer, flushing any remaining lines."""
self._flush_adjusted_lines()
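# Sketch (not part of the original module): the printer infers indentation
# from line content; a writeline(None) pops one indent level (Python 3):
#
#   from io import StringIO
#   buf = StringIO()
#   printer = PythonPrinter(buf)
#   printer.writelines('if x:', 'y = 1', None, 'z = 2')
#   buf.getvalue()
#   -> 'if x:\n    y = 1\nz = 2\n'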
def _is_unindentor(self, line):
"""return true if the given line is an 'unindentor',
relative to the last 'indent' event received.
"""
# no indentation detail has been pushed on; return False
if len(self.indent_detail) == 0:
return False
indentor = self.indent_detail[-1]
# the last indent keyword we grabbed is not a
# compound statement keyword; return False
if indentor is None:
return False
# if the current line doesn't have one of the "unindentor" keywords,
# return False
match = re.match(r"^\s*(else|elif|except|finally).*\:", line)
if not match:
return False
# whitespace matches up, we have a compound indentor,
# and this line has an unindentor, this
# is probably good enough
return True
# should we decide that it's not good enough, here's
# more stuff to check.
#keyword = match.group(1)
# match the original indent keyword
#for crit in [
# (r'if|elif', r'else|elif'),
# (r'try', r'except|finally|else'),
# (r'while|for', r'else'),
#]:
# if re.match(crit[0], indentor) and re.match(crit[1], keyword):
# return True
#return False
def _indent_line(self, line, stripspace=''):
"""indent the given line according to the current indent level.
stripspace is a string of space that will be truncated from the
start of the line before indenting."""
return re.sub(r"^%s" % stripspace, self.indentstring
* self.indent, line)
def _reset_multi_line_flags(self):
"""reset the flags which would indicate we are in a backslashed
or triple-quoted section."""
self.backslashed, self.triplequoted = False, False
def _in_multi_line(self, line):
"""return true if the given line is part of a multi-line block,
via backslash or triple-quote."""
# we are only looking for explicitly joined lines here, not
# implicit ones (i.e. brackets, braces etc.). this is just to
# guard against the possibility of modifying the space inside of
# a literal multiline string with unfortunately placed
# whitespace
current_state = (self.backslashed or self.triplequoted)
if re.search(r"\\$", line):
self.backslashed = True
else:
self.backslashed = False
triples = len(re.findall(r"\"\"\"|\'\'\'", line))
if triples % 2 != 0:
self.triplequoted = not self.triplequoted
return current_state
def _flush_adjusted_lines(self):
stripspace = None
self._reset_multi_line_flags()
for entry in self.line_buffer:
if self._in_multi_line(entry):
self.stream.write(entry + "\n")
else:
entry = entry.expandtabs()
if stripspace is None and re.search(r"^[ \t]*[^# \t]", entry):
stripspace = re.match(r"^([ \t]*)", entry).group(1)
self.stream.write(self._indent_line(entry, stripspace) + "\n")
self.line_buffer = []
self._reset_multi_line_flags()
def adjust_whitespace(text):
"""remove the left-whitespace margin of a block of Python code."""
state = [False, False]
(backslashed, triplequoted) = (0, 1)
def in_multi_line(line):
start_state = (state[backslashed] or state[triplequoted])
if re.search(r"\\$", line):
state[backslashed] = True
else:
state[backslashed] = False
def match(reg, t):
m = re.match(reg, t)
if m:
return m, t[len(m.group(0)):]
else:
return None, t
while line:
if state[triplequoted]:
m, line = match(r"%s" % state[triplequoted], line)
if m:
state[triplequoted] = False
else:
m, line = match(r".*?(?=%s|$)" % state[triplequoted], line)
else:
m, line = match(r'#', line)
if m:
return start_state
m, line = match(r"\"\"\"|\'\'\'", line)
if m:
state[triplequoted] = m.group(0)
continue
m, line = match(r".*?(?=\"\"\"|\'\'\'|#|$)", line)
return start_state
def _indent_line(line, stripspace=''):
return re.sub(r"^%s" % stripspace, '', line)
lines = []
stripspace = None
for line in re.split(r'\r?\n', text):
if in_multi_line(line):
lines.append(line)
else:
line = line.expandtabs()
if stripspace is None and re.search(r"^[ \t]*[^# \t]", line):
stripspace = re.match(r"^([ \t]*)", line).group(1)
lines.append(_indent_line(line, stripspace))
return "\n".join(lines)

View File

@ -0,0 +1,232 @@
# mako/pyparser.py
# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
"""Handles parsing of Python code.
Parsing to AST is done via the _ast module.
"""
from mako import exceptions, util, compat
from mako.compat import arg_stringname
import operator
if compat.py3k:
# words that cannot be assigned to (notably
# smaller than the total keys in __builtins__)
reserved = set(['True', 'False', 'None', 'print'])
# the "id" attribute on a function node
arg_id = operator.attrgetter('arg')
else:
# words that cannot be assigned to (notably
# smaller than the total keys in __builtins__)
reserved = set(['True', 'False', 'None'])
# the "id" attribute on a function node
arg_id = operator.attrgetter('id')
import _ast
util.restore__ast(_ast)
from mako import _ast_util
def parse(code, mode='exec', **exception_kwargs):
"""Parse an expression into AST"""
try:
return _ast_util.parse(code, '<unknown>', mode)
except Exception:
raise exceptions.SyntaxException(
"(%s) %s (%r)" % (
compat.exception_as().__class__.__name__,
compat.exception_as(),
code[0:50]
), **exception_kwargs)
class FindIdentifiers(_ast_util.NodeVisitor):
def __init__(self, listener, **exception_kwargs):
self.in_function = False
self.in_assign_targets = False
self.local_ident_stack = set()
self.listener = listener
self.exception_kwargs = exception_kwargs
def _add_declared(self, name):
if not self.in_function:
self.listener.declared_identifiers.add(name)
else:
self.local_ident_stack.add(name)
def visit_ClassDef(self, node):
self._add_declared(node.name)
def visit_Assign(self, node):
# flip around the visiting of Assign so the expression gets
# evaluated first, in the case of a clause like "x=x+5" (x
# is undeclared)
self.visit(node.value)
in_a = self.in_assign_targets
self.in_assign_targets = True
for n in node.targets:
self.visit(n)
self.in_assign_targets = in_a
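# Note (not part of the original module): because the value is visited
# first, in "x = x + 5" the right-hand x is recorded as undeclared before
# the target x is recorded as declared, so both result sets contain 'x'.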
if compat.py3k:
# ExceptHandler is in Python 2, but this block only works in
# Python 3 (and is required there)
def visit_ExceptHandler(self, node):
if node.name is not None:
self._add_declared(node.name)
if node.type is not None:
self.visit(node.type)
for statement in node.body:
self.visit(statement)
def visit_Lambda(self, node, *args):
self._visit_function(node, True)
def visit_FunctionDef(self, node):
self._add_declared(node.name)
self._visit_function(node, False)
def _expand_tuples(self, args):
for arg in args:
if isinstance(arg, _ast.Tuple):
for n in arg.elts:
yield n
else:
yield arg
def _visit_function(self, node, islambda):
# push function state onto stack. don't log any more
# identifiers as "declared" until outside of the function,
# but keep logging identifiers as "undeclared". track
# argument names in each function header so they aren't
# counted as "undeclared"
inf = self.in_function
self.in_function = True
local_ident_stack = self.local_ident_stack
self.local_ident_stack = local_ident_stack.union([
arg_id(arg) for arg in self._expand_tuples(node.args.args)
])
if islambda:
self.visit(node.body)
else:
for n in node.body:
self.visit(n)
self.in_function = inf
self.local_ident_stack = local_ident_stack
def visit_For(self, node):
# flip around visit
self.visit(node.iter)
self.visit(node.target)
for statement in node.body:
self.visit(statement)
for statement in node.orelse:
self.visit(statement)
def visit_Name(self, node):
if isinstance(node.ctx, _ast.Store):
# this is equivalent to visit_AssName in
# compiler
self._add_declared(node.id)
elif node.id not in reserved and node.id \
not in self.listener.declared_identifiers and node.id \
not in self.local_ident_stack:
self.listener.undeclared_identifiers.add(node.id)
def visit_Import(self, node):
for name in node.names:
if name.asname is not None:
self._add_declared(name.asname)
else:
self._add_declared(name.name.split('.')[0])
def visit_ImportFrom(self, node):
for name in node.names:
if name.asname is not None:
self._add_declared(name.asname)
else:
if name.name == '*':
raise exceptions.CompileException(
"'import *' is not supported, since all identifier "
"names must be explicitly declared. Please use the "
"form 'from <modulename> import <name1>, <name2>, "
"...' instead.", **self.exception_kwargs)
self._add_declared(name.name)
class FindTuple(_ast_util.NodeVisitor):
def __init__(self, listener, code_factory, **exception_kwargs):
self.listener = listener
self.exception_kwargs = exception_kwargs
self.code_factory = code_factory
def visit_Tuple(self, node):
for n in node.elts:
p = self.code_factory(n, **self.exception_kwargs)
self.listener.codeargs.append(p)
self.listener.args.append(ExpressionGenerator(n).value())
self.listener.declared_identifiers = \
self.listener.declared_identifiers.union(
p.declared_identifiers)
self.listener.undeclared_identifiers = \
self.listener.undeclared_identifiers.union(
p.undeclared_identifiers)
class ParseFunc(_ast_util.NodeVisitor):
def __init__(self, listener, **exception_kwargs):
self.listener = listener
self.exception_kwargs = exception_kwargs
def visit_FunctionDef(self, node):
self.listener.funcname = node.name
argnames = [arg_id(arg) for arg in node.args.args]
if node.args.vararg:
argnames.append(arg_stringname(node.args.vararg))
if compat.py2k:
# kw-only args don't exist in Python 2
kwargnames = []
else:
kwargnames = [arg_id(arg) for arg in node.args.kwonlyargs]
if node.args.kwarg:
kwargnames.append(arg_stringname(node.args.kwarg))
self.listener.argnames = argnames
self.listener.defaults = node.args.defaults # ast
self.listener.kwargnames = kwargnames
if compat.py2k:
self.listener.kwdefaults = []
else:
self.listener.kwdefaults = node.args.kw_defaults
self.listener.varargs = node.args.vararg
self.listener.kwargs = node.args.kwarg
class ExpressionGenerator(object):
def __init__(self, astnode):
self.generator = _ast_util.SourceGenerator(' ' * 4)
self.generator.visit(astnode)
def value(self):
return ''.join(self.generator.result)

View File

@ -0,0 +1,878 @@
# mako/runtime.py
# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
"""provides runtime services for templates, including Context,
Namespace, and various helper functions."""
from mako import exceptions, util, compat
from mako.compat import compat_builtins
import sys
class Context(object):
"""Provides runtime namespace, output buffer, and various
callstacks for templates.
See :ref:`runtime_toplevel` for detail on the usage of
:class:`.Context`.
"""
def __init__(self, buffer, **data):
self._buffer_stack = [buffer]
self._data = data
self._kwargs = data.copy()
self._with_template = None
self._outputting_as_unicode = None
self.namespaces = {}
# "capture" function which proxies to the
# generic "capture" function
self._data['capture'] = compat.partial(capture, self)
# "caller" stack used by def calls with content
self.caller_stack = self._data['caller'] = CallerStack()
def _set_with_template(self, t):
self._with_template = t
illegal_names = t.reserved_names.intersection(self._data)
if illegal_names:
raise exceptions.NameConflictError(
"Reserved words passed to render(): %s" %
", ".join(illegal_names))
@property
def lookup(self):
"""Return the :class:`.TemplateLookup` associated
with this :class:`.Context`.
"""
return self._with_template.lookup
@property
def kwargs(self):
"""Return the dictionary of top level keyword arguments associated
with this :class:`.Context`.
This dictionary only includes the top-level arguments passed to
:meth:`.Template.render`. It does not include names produced within
the template execution such as local variable names or special names
such as ``self``, ``next``, etc.
The purpose of this dictionary is primarily for the case that
a :class:`.Template` accepts arguments via its ``<%page>`` tag,
which are normally expected to be passed via :meth:`.Template.render`,
except the template is being called in an inheritance context,
using the ``body()`` method. :attr:`.Context.kwargs` can then be
used to propagate these arguments to the inheriting template::
${next.body(**context.kwargs)}
"""
return self._kwargs.copy()
def push_caller(self, caller):
"""Push a ``caller`` callable onto the callstack for
this :class:`.Context`."""
self.caller_stack.append(caller)
def pop_caller(self):
"""Pop a ``caller`` callable onto the callstack for this
:class:`.Context`."""
del self.caller_stack[-1]
def keys(self):
"""Return a list of all names established in this :class:`.Context`."""
return list(self._data.keys())
def __getitem__(self, key):
if key in self._data:
return self._data[key]
else:
return compat_builtins.__dict__[key]
def _push_writer(self):
"""push a capturing buffer onto this Context and return
the new writer function."""
buf = util.FastEncodingBuffer()
self._buffer_stack.append(buf)
return buf.write
def _pop_buffer_and_writer(self):
"""pop the most recent capturing buffer from this Context
and return the current writer after the pop.
"""
buf = self._buffer_stack.pop()
return buf, self._buffer_stack[-1].write
def _push_buffer(self):
"""push a capturing buffer onto this Context."""
self._push_writer()
def _pop_buffer(self):
"""pop the most recent capturing buffer from this Context."""
return self._buffer_stack.pop()
def get(self, key, default=None):
"""Return a value from this :class:`.Context`."""
return self._data.get(key, compat_builtins.__dict__.get(key, default))
def write(self, string):
"""Write a string to this :class:`.Context` object's
underlying output buffer."""
self._buffer_stack[-1].write(string)
def writer(self):
"""Return the current writer function."""
return self._buffer_stack[-1].write
def _copy(self):
c = Context.__new__(Context)
c._buffer_stack = self._buffer_stack
c._data = self._data.copy()
c._kwargs = self._kwargs
c._with_template = self._with_template
c._outputting_as_unicode = self._outputting_as_unicode
c.namespaces = self.namespaces
c.caller_stack = self.caller_stack
return c
def _locals(self, d):
"""Create a new :class:`.Context` with a copy of this
:class:`.Context`'s current state,
updated with the given dictionary.
The :attr:`.Context.kwargs` collection remains
unaffected.
"""
if not d:
return self
c = self._copy()
c._data.update(d)
return c
def _clean_inheritance_tokens(self):
"""create a new copy of this :class:`.Context`. with
tokens related to inheritance state removed."""
c = self._copy()
x = c._data
x.pop('self', None)
x.pop('parent', None)
x.pop('next', None)
return c
class CallerStack(list):
def __init__(self):
self.nextcaller = None
def __nonzero__(self):
return self.__bool__()
def __bool__(self):
return len(self) and self._get_caller() and True or False
def _get_caller(self):
# this method can be removed once
# codegen MAGIC_NUMBER moves past 7
return self[-1]
def __getattr__(self, key):
return getattr(self._get_caller(), key)
def _push_frame(self):
frame = self.nextcaller or None
self.append(frame)
self.nextcaller = None
return frame
def _pop_frame(self):
self.nextcaller = self.pop()
class Undefined(object):
"""Represents an undefined value in a template.
All template modules have a constant value
``UNDEFINED`` present which is an instance of this
object.
"""
def __str__(self):
raise NameError("Undefined")
def __nonzero__(self):
return self.__bool__()
def __bool__(self):
return False
UNDEFINED = Undefined()
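# Illustration (not part of the original module): UNDEFINED is falsy, but
# rendering it raises, which is how a missing template variable surfaces:
#
#   bool(UNDEFINED)  -> False
#   str(UNDEFINED)   -> raises NameError("Undefined")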
class LoopStack(object):
"""a stack for LoopContexts that implements the context manager protocol
to automatically pop off the top of the stack on context exit
"""
def __init__(self):
self.stack = []
def _enter(self, iterable):
self._push(iterable)
return self._top
def _exit(self):
self._pop()
return self._top
@property
def _top(self):
if self.stack:
return self.stack[-1]
else:
return self
def _pop(self):
return self.stack.pop()
def _push(self, iterable):
new = LoopContext(iterable)
if self.stack:
new.parent = self.stack[-1]
return self.stack.append(new)
def __getattr__(self, key):
raise exceptions.RuntimeException("No loop context is established")
def __iter__(self):
return iter(self._top)
class LoopContext(object):
"""A magic loop variable.
Automatically accessible in any ``% for`` block.
See the section :ref:`loop_context` for usage
notes.
:attr:`parent` -> :class:`.LoopContext` or ``None``
The parent loop, if one exists.
:attr:`index` -> `int`
The 0-based iteration count.
:attr:`reverse_index` -> `int`
The number of iterations remaining.
:attr:`first` -> `bool`
``True`` on the first iteration, ``False`` otherwise.
:attr:`last` -> `bool`
``True`` on the last iteration, ``False`` otherwise.
:attr:`even` -> `bool`
``True`` when ``index`` is even.
:attr:`odd` -> `bool`
``True`` when ``index`` is odd.
"""
def __init__(self, iterable):
self._iterable = iterable
self.index = 0
self.parent = None
def __iter__(self):
for i in self._iterable:
yield i
self.index += 1
@util.memoized_instancemethod
def __len__(self):
return len(self._iterable)
@property
def reverse_index(self):
return len(self) - self.index - 1
@property
def first(self):
return self.index == 0
@property
def last(self):
return self.index == len(self) - 1
@property
def even(self):
return not self.odd
@property
def odd(self):
return bool(self.index % 2)
def cycle(self, *values):
"""Cycle through values as the loop progresses.
"""
if not values:
raise ValueError("You must provide values to cycle through")
return values[self.index % len(values)]
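# Illustration (not part of the original module): inside a "% for" block
# the magic loop variable exposes cycle(), e.g. in template source:
#
#   % for row in rows:
#       <tr class="${loop.cycle('even', 'odd')}">...</tr>
#   % endfor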
class _NSAttr(object):
def __init__(self, parent):
self.__parent = parent
def __getattr__(self, key):
ns = self.__parent
while ns:
if hasattr(ns.module, key):
return getattr(ns.module, key)
else:
ns = ns.inherits
raise AttributeError(key)
class Namespace(object):
"""Provides access to collections of rendering methods, which
can be local, from other templates, or from imported modules.
To access a particular rendering method referenced by a
:class:`.Namespace`, use plain attribute access:
.. sourcecode:: mako
${some_namespace.foo(x, y, z)}
:class:`.Namespace` also contains several built-in attributes
described here.
"""
def __init__(self, name, context,
callables=None, inherits=None,
populate_self=True, calling_uri=None):
self.name = name
self.context = context
self.inherits = inherits
if callables is not None:
self.callables = dict([(c.__name__, c) for c in callables])
callables = ()
module = None
"""The Python module referenced by this :class:`.Namespace`.
If the namespace references a :class:`.Template`, then
this module is the equivalent of ``template.module``,
i.e. the generated module for the template.
"""
template = None
"""The :class:`.Template` object referenced by this
:class:`.Namespace`, if any.
"""
context = None
"""The :class:`.Context` object for this :class:`.Namespace`.
Namespaces are often created with copies of contexts that
contain slightly different data, particularly in inheritance
scenarios. Using the :class:`.Context` off of a :class:`.Namespace` one
can traverse an entire chain of templates that inherit from
one another.
"""
filename = None
"""The path of the filesystem file used for this
:class:`.Namespace`'s module or template.
If this is a pure module-based
:class:`.Namespace`, this evaluates to ``module.__file__``. If a
template-based namespace, it evaluates to the original
template file location.
"""
uri = None
"""The URI for this :class:`.Namespace`'s template.
I.e. whatever was sent to :meth:`.TemplateLookup.get_template()`.
This is the equivalent of :attr:`.Template.uri`.
"""
_templateuri = None
@util.memoized_property
def attr(self):
"""Access module level attributes by name.
This accessor allows templates to supply "scalar"
attributes which are particularly handy in inheritance
relationships.
.. seealso::
:ref:`inheritance_attr`
:ref:`namespace_attr_for_includes`
"""
return _NSAttr(self)
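    # Illustrative sketch (attribute name hypothetical): a parent template
    # can declare a module-level attribute in a <%! %> block, e.g.
    # <%! title = "Page" %>, which an inheriting template then reads as
    # ${self.attr.title}.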
def get_namespace(self, uri):
"""Return a :class:`.Namespace` corresponding to the given ``uri``.
If the given ``uri`` is a relative URI (i.e. it does not
contain a leading slash ``/``), the ``uri`` is adjusted to
be relative to the ``uri`` of the namespace itself. This
method is therefore mostly useful off of the built-in
``local`` namespace, described in :ref:`namespace_local`.
In
most cases, a template wouldn't need this function, and
should instead use the ``<%namespace>`` tag to load
namespaces. However, since all ``<%namespace>`` tags are
evaluated before the body of a template ever runs,
this method can be used to locate namespaces using
expressions that were generated within the body code of
the template, or to conditionally use a particular
namespace.
"""
key = (self, uri)
if key in self.context.namespaces:
return self.context.namespaces[key]
else:
ns = TemplateNamespace(uri, self.context._copy(),
templateuri=uri,
calling_uri=self._templateuri)
self.context.namespaces[key] = ns
return ns
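    # Illustrative sketch (uri and def names are hypothetical): resolve a
    # namespace from an expression computed in the template body:
    #
    #     <% widgets = local.get_namespace("widgets/%s.html" % theme) %>
    #     ${widgets.render_button()}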
def get_template(self, uri):
"""Return a :class:`.Template` from the given ``uri``.
The ``uri`` resolution is relative to the ``uri`` of this
:class:`.Namespace` object's :class:`.Template`.
"""
return _lookup_template(self.context, uri, self._templateuri)
def get_cached(self, key, **kwargs):
"""Return a value from the :class:`.Cache` referenced by this
:class:`.Namespace` object's :class:`.Template`.
The advantage to this method versus direct access to the
:class:`.Cache` is that the configuration parameters
declared in ``<%page>`` take effect here, thereby calling
up the same configured backend as that configured
by ``<%page>``.
"""
return self.cache.get(key, **kwargs)
@property
def cache(self):
"""Return the :class:`.Cache` object referenced
by this :class:`.Namespace` object's
:class:`.Template`.
"""
return self.template.cache
def include_file(self, uri, **kwargs):
"""Include a file at the given ``uri``."""
_include_file(self.context, uri, self._templateuri, **kwargs)
def _populate(self, d, l):
for ident in l:
if ident == '*':
for (k, v) in self._get_star():
d[k] = v
else:
d[ident] = getattr(self, ident)
def _get_star(self):
if self.callables:
for key in self.callables:
yield (key, self.callables[key])
def __getattr__(self, key):
if key in self.callables:
val = self.callables[key]
elif self.inherits:
val = getattr(self.inherits, key)
else:
raise AttributeError(
"Namespace '%s' has no member '%s'" %
(self.name, key))
setattr(self, key, val)
return val
class TemplateNamespace(Namespace):
"""A :class:`.Namespace` specific to a :class:`.Template` instance."""
def __init__(self, name, context, template=None, templateuri=None,
callables=None, inherits=None,
populate_self=True, calling_uri=None):
self.name = name
self.context = context
self.inherits = inherits
if callables is not None:
self.callables = dict([(c.__name__, c) for c in callables])
if templateuri is not None:
self.template = _lookup_template(context, templateuri,
calling_uri)
self._templateuri = self.template.module._template_uri
elif template is not None:
self.template = template
self._templateuri = template.module._template_uri
else:
raise TypeError("'template' argument is required.")
if populate_self:
lclcallable, lclcontext = \
_populate_self_namespace(context, self.template,
self_ns=self)
@property
def module(self):
"""The Python module referenced by this :class:`.Namespace`.
If the namespace references a :class:`.Template`, then
this module is the equivalent of ``template.module``,
i.e. the generated module for the template.
"""
return self.template.module
@property
def filename(self):
"""The path of the filesystem file used for this
:class:`.Namespace`'s module or template.
"""
return self.template.filename
@property
def uri(self):
"""The URI for this :class:`.Namespace`'s template.
I.e. whatever was sent to :meth:`.TemplateLookup.get_template()`.
This is the equivalent of :attr:`.Template.uri`.
"""
return self.template.uri
def _get_star(self):
if self.callables:
for key in self.callables:
yield (key, self.callables[key])
def get(key):
callable_ = self.template._get_def_callable(key)
return compat.partial(callable_, self.context)
for k in self.template.module._exports:
yield (k, get(k))
def __getattr__(self, key):
if key in self.callables:
val = self.callables[key]
elif self.template.has_def(key):
callable_ = self.template._get_def_callable(key)
val = compat.partial(callable_, self.context)
elif self.inherits:
val = getattr(self.inherits, key)
else:
raise AttributeError(
"Namespace '%s' has no member '%s'" %
(self.name, key))
setattr(self, key, val)
return val
class ModuleNamespace(Namespace):
"""A :class:`.Namespace` specific to a Python module instance."""
def __init__(self, name, context, module,
callables=None, inherits=None,
populate_self=True, calling_uri=None):
self.name = name
self.context = context
self.inherits = inherits
if callables is not None:
self.callables = dict([(c.__name__, c) for c in callables])
mod = __import__(module)
for token in module.split('.')[1:]:
mod = getattr(mod, token)
self.module = mod
@property
def filename(self):
"""The path of the filesystem file used for this
:class:`.Namespace`'s module or template.
"""
return self.module.__file__
def _get_star(self):
if self.callables:
for key in self.callables:
yield (key, self.callables[key])
for key in dir(self.module):
if key[0] != '_':
callable_ = getattr(self.module, key)
if compat.callable(callable_):
yield key, compat.partial(callable_, self.context)
def __getattr__(self, key):
if key in self.callables:
val = self.callables[key]
elif hasattr(self.module, key):
callable_ = getattr(self.module, key)
val = compat.partial(callable_, self.context)
elif self.inherits:
val = getattr(self.inherits, key)
else:
raise AttributeError(
"Namespace '%s' has no member '%s'" %
(self.name, key))
setattr(self, key, val)
return val
def supports_caller(func):
"""Apply a caller_stack compatibility decorator to a plain
Python function.
See the example in :ref:`namespaces_python_modules`.
"""
def wrap_stackframe(context, *args, **kwargs):
context.caller_stack._push_frame()
try:
return func(context, *args, **kwargs)
finally:
context.caller_stack._pop_frame()
return wrap_stackframe
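# Illustrative sketch (function and markup are hypothetical): expose a
# plain Python function to templates so it can receive a <%call> body:
#
#     @supports_caller
#     def panel(context, **kw):
#         context.write("<div class='panel'>")
#         context['caller'].body()
#         context.write("</div>")
#         return ''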
def capture(context, callable_, *args, **kwargs):
"""Execute the given template def, capturing the output into
a buffer.
See the example in :ref:`namespaces_python_modules`.
"""
if not compat.callable(callable_):
raise exceptions.RuntimeException(
"capture() function expects a callable as "
"its argument (i.e. capture(func, *args, **kwargs))"
)
context._push_buffer()
try:
callable_(*args, **kwargs)
finally:
buf = context._pop_buffer()
return buf.getvalue()
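# Illustrative sketch: within a template, capture() is pre-bound to the
# current context, so a def's output can be buffered into a variable
# (def name hypothetical):
#
#     <% header_html = capture(self.header, title="Home") %>
#     ${header_html}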
def _decorate_toplevel(fn):
def decorate_render(render_fn):
def go(context, *args, **kw):
def y(*args, **kw):
return render_fn(context, *args, **kw)
try:
y.__name__ = render_fn.__name__[7:]
except TypeError:
# < Python 2.4
pass
return fn(y)(context, *args, **kw)
return go
return decorate_render
def _decorate_inline(context, fn):
def decorate_render(render_fn):
dec = fn(render_fn)
def go(*args, **kw):
return dec(context, *args, **kw)
return go
return decorate_render
def _include_file(context, uri, calling_uri, **kwargs):
"""locate the template from the given uri and include it in
the current output."""
template = _lookup_template(context, uri, calling_uri)
(callable_, ctx) = _populate_self_namespace(
context._clean_inheritance_tokens(),
template)
callable_(ctx, **_kwargs_for_include(callable_, context._data, **kwargs))
def _inherit_from(context, uri, calling_uri):
"""called by the _inherit method in template modules to set
up the inheritance chain at the start of a template's
execution."""
if uri is None:
return None
template = _lookup_template(context, uri, calling_uri)
self_ns = context['self']
ih = self_ns
while ih.inherits is not None:
ih = ih.inherits
lclcontext = context._locals({'next': ih})
ih.inherits = TemplateNamespace("self:%s" % template.uri,
lclcontext,
template=template,
populate_self=False)
context._data['parent'] = lclcontext._data['local'] = ih.inherits
callable_ = getattr(template.module, '_mako_inherit', None)
if callable_ is not None:
ret = callable_(template, lclcontext)
if ret:
return ret
gen_ns = getattr(template.module, '_mako_generate_namespaces', None)
if gen_ns is not None:
gen_ns(context)
return (template.callable_, lclcontext)
def _lookup_template(context, uri, relativeto):
lookup = context._with_template.lookup
if lookup is None:
raise exceptions.TemplateLookupException(
"Template '%s' has no TemplateLookup associated" %
context._with_template.uri)
uri = lookup.adjust_uri(uri, relativeto)
try:
return lookup.get_template(uri)
except exceptions.TopLevelLookupException:
raise exceptions.TemplateLookupException(str(compat.exception_as()))
def _populate_self_namespace(context, template, self_ns=None):
if self_ns is None:
self_ns = TemplateNamespace('self:%s' % template.uri,
context, template=template,
populate_self=False)
context._data['self'] = context._data['local'] = self_ns
if hasattr(template.module, '_mako_inherit'):
ret = template.module._mako_inherit(template, context)
if ret:
return ret
return (template.callable_, context)
def _render(template, callable_, args, data, as_unicode=False):
"""create a Context and return the string
output of the given template and template callable."""
if as_unicode:
buf = util.FastEncodingBuffer(as_unicode=True)
elif template.bytestring_passthrough:
buf = compat.StringIO()
else:
buf = util.FastEncodingBuffer(
as_unicode=as_unicode,
encoding=template.output_encoding,
errors=template.encoding_errors)
context = Context(buf, **data)
context._outputting_as_unicode = as_unicode
context._set_with_template(template)
_render_context(template, callable_, context, *args,
**_kwargs_for_callable(callable_, data))
return context._pop_buffer().getvalue()
def _kwargs_for_callable(callable_, data):
argspec = compat.inspect_func_args(callable_)
# for normal pages, **pageargs is usually present
if argspec[2]:
return data
# for rendering defs from the top level, figure out the args
namedargs = argspec[0] + [v for v in argspec[1:3] if v is not None]
kwargs = {}
for arg in namedargs:
if arg != 'context' and arg in data and arg not in kwargs:
kwargs[arg] = data[arg]
return kwargs
def _kwargs_for_include(callable_, data, **kwargs):
argspec = compat.inspect_func_args(callable_)
namedargs = argspec[0] + [v for v in argspec[1:3] if v is not None]
for arg in namedargs:
if arg != 'context' and arg in data and arg not in kwargs:
kwargs[arg] = data[arg]
return kwargs
def _render_context(tmpl, callable_, context, *args, **kwargs):
import mako.template as template
# create polymorphic 'self' namespace for this
# template with possibly updated context
if not isinstance(tmpl, template.DefTemplate):
# if main render method, call from the base of the inheritance stack
(inherit, lclcontext) = _populate_self_namespace(context, tmpl)
_exec_template(inherit, lclcontext, args=args, kwargs=kwargs)
else:
# otherwise, call the actual rendering method specified
(inherit, lclcontext) = _populate_self_namespace(context, tmpl.parent)
_exec_template(callable_, context, args=args, kwargs=kwargs)
def _exec_template(callable_, context, args=None, kwargs=None):
"""execute a rendering callable given the callable, a
Context, and optional explicit arguments.
The contextual Template will be located if it exists, and
the error handling options specified on that Template will
be interpreted here.
"""
template = context._with_template
if template is not None and \
(template.format_exceptions or template.error_handler):
try:
callable_(context, *args, **kwargs)
except Exception:
_render_error(template, context, compat.exception_as())
except:
e = sys.exc_info()[0]
_render_error(template, context, e)
else:
callable_(context, *args, **kwargs)
def _render_error(template, context, error):
if template.error_handler:
result = template.error_handler(context, error)
if not result:
compat.reraise(*sys.exc_info())
else:
error_template = exceptions.html_error_template()
if context._outputting_as_unicode:
context._buffer_stack[:] = [
util.FastEncodingBuffer(as_unicode=True)]
else:
context._buffer_stack[:] = [util.FastEncodingBuffer(
error_template.output_encoding,
error_template.encoding_errors)]
context._set_with_template(error_template)
error_template.render_context(context, error=error)

View File

@ -0,0 +1,705 @@
# mako/template.py
# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
"""Provides the Template class, a facade for parsing, generating and executing
template strings, as well as template runtime operations."""
from mako.lexer import Lexer
from mako import runtime, util, exceptions, codegen, cache, compat
import os
import re
import shutil
import stat
import sys
import tempfile
import types
import weakref
class Template(object):
"""Represents a compiled template.
:class:`.Template` includes a reference to the original
template source (via the :attr:`.source` attribute)
as well as the source code of the
generated Python module (i.e. the :attr:`.code` attribute),
as well as a reference to an actual Python module.
:class:`.Template` is constructed using either a literal string
representing the template text, or a filename representing a filesystem
path to a source file.
:param text: textual template source. This argument is mutually
exclusive versus the ``filename`` parameter.
:param filename: filename of the source template. This argument is
mutually exclusive versus the ``text`` parameter.
:param buffer_filters: string list of filters to be applied
to the output of ``%def``\ s which are buffered, cached, or otherwise
filtered, after all filters
defined with the ``%def`` itself have been applied. Allows the
creation of default expression filters that let the output
of return-valued ``%def``\ s "opt out" of that filtering via
passing special attributes or objects.
:param bytestring_passthrough: When ``True``, and ``output_encoding`` is
set to ``None``, and :meth:`.Template.render` is used to render,
the `StringIO` or `cStringIO` buffer will be used instead of the
default "fast" buffer. This allows raw bytestrings in the
output stream, such as in expressions, to pass straight
through to the buffer. This flag is forced
to ``True`` if ``disable_unicode`` is also configured.
.. versionadded:: 0.4
Added to provide the same behavior as that of the previous series.
:param cache_args: Dictionary of cache configuration arguments that
will be passed to the :class:`.CacheImpl`. See :ref:`caching_toplevel`.
:param cache_dir:
.. deprecated:: 0.6
Use the ``'dir'`` argument in the ``cache_args`` dictionary.
See :ref:`caching_toplevel`.
:param cache_enabled: Boolean flag which enables caching of this
template. See :ref:`caching_toplevel`.
:param cache_impl: String name of a :class:`.CacheImpl` caching
implementation to use. Defaults to ``'beaker'``.
:param cache_type:
.. deprecated:: 0.6
Use the ``'type'`` argument in the ``cache_args`` dictionary.
See :ref:`caching_toplevel`.
:param cache_url:
.. deprecated:: 0.6
Use the ``'url'`` argument in the ``cache_args`` dictionary.
See :ref:`caching_toplevel`.
:param default_filters: List of string filter names that will
be applied to all expressions. See :ref:`filtering_default_filters`.
:param disable_unicode: Disables all awareness of Python Unicode
objects. See :ref:`unicode_disabled`.
:param enable_loop: When ``True``, enable the ``loop`` context variable.
This can be set to ``False`` to support templates that may
be making usage of the name "``loop``". Individual templates can
re-enable the "loop" context by placing the directive
``enable_loop="True"`` inside the ``<%page>`` tag -- see
:ref:`migrating_loop`.
:param encoding_errors: Error parameter passed to ``encode()`` when
string encoding is performed. See :ref:`usage_unicode`.
:param error_handler: Python callable which is called whenever
compile or runtime exceptions occur. The callable is passed
the current context as well as the exception. If the
callable returns ``True``, the exception is considered to
be handled, else it is re-raised after the function
completes. Is used to provide custom error-rendering
functions.
:param format_exceptions: if ``True``, exceptions which occur during
the render phase of this template will be caught and
formatted into an HTML error page, which then becomes the
rendered result of the :meth:`.render` call. Otherwise,
runtime exceptions are propagated outwards.
:param imports: String list of Python statements, typically individual
"import" lines, which will be placed into the module level
preamble of all generated Python modules. See the example
in :ref:`filtering_default_filters`.
:param future_imports: String list of names to import from `__future__`.
These will be concatenated into a comma-separated string and inserted
into the beginning of the template, e.g. ``future_imports=['FOO',
'BAR']`` results in ``from __future__ import FOO, BAR``. If you're
interested in using features like the new division operator, you must
use future_imports to convey that to the renderer, as otherwise the
import will not appear as the first executed statement in the generated
code and will therefore not have the desired effect.
:param input_encoding: Encoding of the template's source code. Can
be used in lieu of the coding comment. See
:ref:`usage_unicode` as well as :ref:`unicode_toplevel` for
details on source encoding.
:param lookup: a :class:`.TemplateLookup` instance that will be used
for all file lookups via the ``<%namespace>``,
``<%include>``, and ``<%inherit>`` tags. See
:ref:`usage_templatelookup`.
:param module_directory: Filesystem location where generated
Python module files will be placed.
:param module_filename: Overrides the filename of the generated
Python module file. For advanced usage only.
:param module_writer: A callable which overrides how the Python
module is written entirely. The callable is passed the
encoded source content of the module and the destination
path to be written to. The default behavior of module writing
uses a tempfile in conjunction with a file move in order
to make the operation atomic. So a user-defined module
writing function that mimics the default behavior would be:
.. sourcecode:: python
import tempfile
import os
import shutil
def module_writer(source, outputpath):
(dest, name) = \\
tempfile.mkstemp(
dir=os.path.dirname(outputpath)
)
os.write(dest, source)
os.close(dest)
shutil.move(name, outputpath)
from mako.template import Template
mytemplate = Template(
filename="index.html",
module_directory="/path/to/modules",
module_writer=module_writer
)
The function is provided for unusual configurations where
certain platform-specific permissions or other special
steps are needed.
:param output_encoding: The encoding to use when :meth:`.render`
is called.
See :ref:`usage_unicode` as well as :ref:`unicode_toplevel`.
:param preprocessor: Python callable which will be passed
the full template source before it is parsed. The return
result of the callable will be used as the template source
code.
:param lexer_cls: A :class:`.Lexer` class used to parse
the template. The :class:`.Lexer` class is used by
default.
.. versionadded:: 0.7.4
:param strict_undefined: Replaces the automatic usage of
``UNDEFINED`` for any undeclared variables not located in
the :class:`.Context` with an immediate raise of
``NameError``. The advantage is immediate reporting of
missing variables, including the variable name.
.. versionadded:: 0.3.6
:param uri: string URI or other identifier for this template.
If not provided, the ``uri`` is generated from the filesystem
path, or from the in-memory identity of a non-file-based
template. The primary usage of the ``uri`` is to provide a key
within :class:`.TemplateLookup`, as well as to generate the
file path of the generated Python module file, if
``module_directory`` is specified.
"""
lexer_cls = Lexer
def __init__(self,
text=None,
filename=None,
uri=None,
format_exceptions=False,
error_handler=None,
lookup=None,
output_encoding=None,
encoding_errors='strict',
module_directory=None,
cache_args=None,
cache_impl='beaker',
cache_enabled=True,
cache_type=None,
cache_dir=None,
cache_url=None,
module_filename=None,
input_encoding=None,
disable_unicode=False,
module_writer=None,
bytestring_passthrough=False,
default_filters=None,
buffer_filters=(),
strict_undefined=False,
imports=None,
future_imports=None,
enable_loop=True,
preprocessor=None,
lexer_cls=None):
if uri:
self.module_id = re.sub(r'\W', "_", uri)
self.uri = uri
elif filename:
self.module_id = re.sub(r'\W', "_", filename)
drive, path = os.path.splitdrive(filename)
path = os.path.normpath(path).replace(os.path.sep, "/")
self.uri = path
else:
self.module_id = "memory:" + hex(id(self))
self.uri = self.module_id
u_norm = self.uri
if u_norm.startswith("/"):
u_norm = u_norm[1:]
u_norm = os.path.normpath(u_norm)
if u_norm.startswith(".."):
raise exceptions.TemplateLookupException(
"Template uri \"%s\" is invalid - "
"it cannot be relative outside "
"of the root path." % self.uri)
self.input_encoding = input_encoding
self.output_encoding = output_encoding
self.encoding_errors = encoding_errors
self.disable_unicode = disable_unicode
self.bytestring_passthrough = bytestring_passthrough or disable_unicode
self.enable_loop = enable_loop
self.strict_undefined = strict_undefined
self.module_writer = module_writer
if compat.py3k and disable_unicode:
raise exceptions.UnsupportedError(
"Mako for Python 3 does not "
"support disabling Unicode")
elif output_encoding and disable_unicode:
raise exceptions.UnsupportedError(
"output_encoding must be set to "
"None when disable_unicode is used.")
if default_filters is None:
if compat.py3k or self.disable_unicode:
self.default_filters = ['str']
else:
self.default_filters = ['unicode']
else:
self.default_filters = default_filters
self.buffer_filters = buffer_filters
self.imports = imports
self.future_imports = future_imports
self.preprocessor = preprocessor
if lexer_cls is not None:
self.lexer_cls = lexer_cls
# if plain text, compile code in memory only
if text is not None:
(code, module) = _compile_text(self, text, filename)
self._code = code
self._source = text
ModuleInfo(module, None, self, filename, code, text)
elif filename is not None:
# if template filename and a module directory, load
# a filesystem-based module file, generating if needed
if module_filename is not None:
path = module_filename
elif module_directory is not None:
path = os.path.abspath(
os.path.join(
os.path.normpath(module_directory),
u_norm + ".py"
)
)
else:
path = None
module = self._compile_from_file(path, filename)
else:
raise exceptions.RuntimeException(
"Template requires text or filename")
self.module = module
self.filename = filename
self.callable_ = self.module.render_body
self.format_exceptions = format_exceptions
self.error_handler = error_handler
self.lookup = lookup
self.module_directory = module_directory
self._setup_cache_args(
cache_impl, cache_enabled, cache_args,
cache_type, cache_dir, cache_url
)
@util.memoized_property
def reserved_names(self):
if self.enable_loop:
return codegen.RESERVED_NAMES
else:
return codegen.RESERVED_NAMES.difference(['loop'])
def _setup_cache_args(self,
cache_impl, cache_enabled, cache_args,
cache_type, cache_dir, cache_url):
self.cache_impl = cache_impl
self.cache_enabled = cache_enabled
if cache_args:
self.cache_args = cache_args
else:
self.cache_args = {}
# transfer deprecated cache_* args
if cache_type:
self.cache_args['type'] = cache_type
if cache_dir:
self.cache_args['dir'] = cache_dir
if cache_url:
self.cache_args['url'] = cache_url
def _compile_from_file(self, path, filename):
if path is not None:
util.verify_directory(os.path.dirname(path))
filemtime = os.stat(filename)[stat.ST_MTIME]
if not os.path.exists(path) or \
os.stat(path)[stat.ST_MTIME] < filemtime:
data = util.read_file(filename)
_compile_module_file(
self,
data,
filename,
path,
self.module_writer)
module = compat.load_module(self.module_id, path)
del sys.modules[self.module_id]
if module._magic_number != codegen.MAGIC_NUMBER:
data = util.read_file(filename)
_compile_module_file(
self,
data,
filename,
path,
self.module_writer)
module = compat.load_module(self.module_id, path)
del sys.modules[self.module_id]
ModuleInfo(module, path, self, filename, None, None)
else:
# template filename and no module directory, compile code
# in memory
data = util.read_file(filename)
code, module = _compile_text(
self,
data,
filename)
self._source = None
self._code = code
ModuleInfo(module, None, self, filename, code, None)
return module
@property
def source(self):
"""Return the template source code for this :class:`.Template`."""
return _get_module_info_from_callable(self.callable_).source
@property
def code(self):
"""Return the module source code for this :class:`.Template`."""
return _get_module_info_from_callable(self.callable_).code
@util.memoized_property
def cache(self):
return cache.Cache(self)
@property
def cache_dir(self):
return self.cache_args['dir']
@property
def cache_url(self):
return self.cache_args['url']
@property
def cache_type(self):
return self.cache_args['type']
def render(self, *args, **data):
"""Render the output of this template as a string.
If the template specifies an output encoding, the string
will be encoded accordingly, else the output is raw (raw
output uses `cStringIO` and can't handle multibyte
characters). A :class:`.Context` object is created corresponding
to the given data. Arguments that are explicitly declared
by this template's internal rendering method are also
pulled from the given ``*args``, ``**data`` members.
"""
return runtime._render(self, self.callable_, args, data)
def render_unicode(self, *args, **data):
"""Render the output of this template as a unicode object."""
return runtime._render(self,
self.callable_,
args,
data,
as_unicode=True)
def render_context(self, context, *args, **kwargs):
"""Render this :class:`.Template` with the given context.
The data is written to the context's buffer.
"""
if getattr(context, '_with_template', None) is None:
context._set_with_template(self)
runtime._render_context(self,
self.callable_,
context,
*args,
**kwargs)
def has_def(self, name):
return hasattr(self.module, "render_%s" % name)
def get_def(self, name):
"""Return a def of this template as a :class:`.DefTemplate`."""
return DefTemplate(self, getattr(self.module, "render_%s" % name))
def _get_def_callable(self, name):
return getattr(self.module, "render_%s" % name)
@property
def last_modified(self):
return self.module._modified_time
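# Illustrative usage sketch (comment only): compile a template from a
# string and render it with keyword data:
#
#     t = Template(u"hello, ${name}!", output_encoding='utf-8')
#     print(t.render(name=u"world"))   # encoded output: hello, world!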
class ModuleTemplate(Template):
"""A Template which is constructed given an existing Python module.
e.g.::
t = Template("this is a template")
f = open("mymodule.py", "w")
f.write(t.code)
f.close()
import mymodule
t = ModuleTemplate(mymodule)
print(t.render())
"""
def __init__(self, module,
module_filename=None,
template=None,
template_filename=None,
module_source=None,
template_source=None,
output_encoding=None,
encoding_errors='strict',
disable_unicode=False,
bytestring_passthrough=False,
format_exceptions=False,
error_handler=None,
lookup=None,
cache_args=None,
cache_impl='beaker',
cache_enabled=True,
cache_type=None,
cache_dir=None,
cache_url=None,
):
self.module_id = re.sub(r'\W', "_", module._template_uri)
self.uri = module._template_uri
self.input_encoding = module._source_encoding
self.output_encoding = output_encoding
self.encoding_errors = encoding_errors
self.disable_unicode = disable_unicode
self.bytestring_passthrough = bytestring_passthrough or disable_unicode
self.enable_loop = module._enable_loop
if compat.py3k and disable_unicode:
raise exceptions.UnsupportedError(
"Mako for Python 3 does not "
"support disabling Unicode")
elif output_encoding and disable_unicode:
raise exceptions.UnsupportedError(
"output_encoding must be set to "
"None when disable_unicode is used.")
self.module = module
self.filename = template_filename
ModuleInfo(module,
module_filename,
self,
template_filename,
module_source,
template_source)
self.callable_ = self.module.render_body
self.format_exceptions = format_exceptions
self.error_handler = error_handler
self.lookup = lookup
self._setup_cache_args(
cache_impl, cache_enabled, cache_args,
cache_type, cache_dir, cache_url
)
class DefTemplate(Template):
"""A :class:`.Template` which represents a callable def in a parent
template."""
def __init__(self, parent, callable_):
self.parent = parent
self.callable_ = callable_
self.output_encoding = parent.output_encoding
self.module = parent.module
self.encoding_errors = parent.encoding_errors
self.format_exceptions = parent.format_exceptions
self.error_handler = parent.error_handler
self.enable_loop = parent.enable_loop
self.lookup = parent.lookup
self.bytestring_passthrough = parent.bytestring_passthrough
def get_def(self, name):
return self.parent.get_def(name)
class ModuleInfo(object):
"""Stores information about a module currently loaded into
memory, provides reverse lookups of template source, module
source code based on a module's identifier.
"""
_modules = weakref.WeakValueDictionary()
def __init__(self,
module,
module_filename,
template,
template_filename,
module_source,
template_source):
self.module = module
self.module_filename = module_filename
self.template_filename = template_filename
self.module_source = module_source
self.template_source = template_source
self._modules[module.__name__] = template._mmarker = self
if module_filename:
self._modules[module_filename] = self
@classmethod
def get_module_source_metadata(cls, module_source, full_line_map=False):
source_map = re.search(
r"__M_BEGIN_METADATA(.+?)__M_END_METADATA",
module_source, re.S).group(1)
source_map = compat.json.loads(source_map)
source_map['line_map'] = dict((int(k), int(v))
for k, v in source_map['line_map'].items())
if full_line_map:
f_line_map = source_map['full_line_map'] = []
line_map = source_map['line_map']
curr_templ_line = 1
for mod_line in range(1, max(line_map)):
if mod_line in line_map:
curr_templ_line = line_map[mod_line]
f_line_map.append(curr_templ_line)
return source_map
@property
def code(self):
if self.module_source is not None:
return self.module_source
else:
return util.read_python_file(self.module_filename)
@property
def source(self):
if self.template_source is not None:
if self.module._source_encoding and \
not isinstance(self.template_source, compat.text_type):
return self.template_source.decode(
self.module._source_encoding)
else:
return self.template_source
else:
data = util.read_file(self.template_filename)
if self.module._source_encoding:
return data.decode(self.module._source_encoding)
else:
return data
def _compile(template, text, filename, generate_magic_comment):
lexer = template.lexer_cls(text,
filename,
disable_unicode=template.disable_unicode,
input_encoding=template.input_encoding,
preprocessor=template.preprocessor)
node = lexer.parse()
source = codegen.compile(node,
template.uri,
filename,
default_filters=template.default_filters,
buffer_filters=template.buffer_filters,
imports=template.imports,
future_imports=template.future_imports,
source_encoding=lexer.encoding,
generate_magic_comment=generate_magic_comment,
disable_unicode=template.disable_unicode,
strict_undefined=template.strict_undefined,
enable_loop=template.enable_loop,
reserved_names=template.reserved_names)
return source, lexer
def _compile_text(template, text, filename):
identifier = template.module_id
source, lexer = _compile(template, text, filename,
generate_magic_comment=template.disable_unicode)
cid = identifier
if not compat.py3k and isinstance(cid, compat.text_type):
cid = cid.encode()
module = types.ModuleType(cid)
code = compile(source, cid, 'exec')
# this exec() works for 2.4->3.3.
exec(code, module.__dict__, module.__dict__)
return (source, module)
def _compile_module_file(template, text, filename, outputpath, module_writer):
source, lexer = _compile(template, text, filename,
generate_magic_comment=True)
if isinstance(source, compat.text_type):
source = source.encode(lexer.encoding or 'ascii')
if module_writer:
module_writer(source, outputpath)
else:
# make tempfiles in the same location as the ultimate
# location. this ensures they're on the same filesystem,
# avoiding synchronization issues.
(dest, name) = tempfile.mkstemp(dir=os.path.dirname(outputpath))
os.write(dest, source)
os.close(dest)
shutil.move(name, outputpath)
def _get_module_info_from_callable(callable_):
if compat.py3k:
return _get_module_info(callable_.__globals__['__name__'])
else:
return _get_module_info(callable_.func_globals['__name__'])
def _get_module_info(filename):
return ModuleInfo._modules[filename]

View File

@ -0,0 +1,360 @@
# mako/util.py
# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
import re
import collections
import codecs
import os
from mako import compat
import operator
def update_wrapper(decorated, fn):
decorated.__wrapped__ = fn
decorated.__name__ = fn.__name__
return decorated
class PluginLoader(object):
def __init__(self, group):
self.group = group
self.impls = {}
def load(self, name):
if name in self.impls:
return self.impls[name]()
else:
import pkg_resources
for impl in pkg_resources.iter_entry_points(
self.group,
name):
self.impls[name] = impl.load
return impl.load()
else:
from mako import exceptions
raise exceptions.RuntimeException(
"Can't load plugin %s %s" %
(self.group, name))
def register(self, name, modulepath, objname):
def load():
mod = __import__(modulepath)
for token in modulepath.split(".")[1:]:
mod = getattr(mod, token)
return getattr(mod, objname)
self.impls[name] = load
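# Illustrative sketch, mirroring how mako.cache wires up cache backends;
# registered names are returned directly, unknown names fall back to a
# pkg_resources entry-point scan:
#
#     plugins = PluginLoader('mako.cache')
#     plugins.register('beaker', 'mako.ext.beaker_cache', 'BeakerCacheImpl')
#     impl = plugins.load('beaker')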
def verify_directory(dir):
"""create and/or verify a filesystem directory."""
tries = 0
while not os.path.exists(dir):
try:
tries += 1
os.makedirs(dir, compat.octal("0775"))
except:
if tries > 5:
raise
def to_list(x, default=None):
if x is None:
return default
if not isinstance(x, (list, tuple)):
return [x]
else:
return x
class memoized_property(object):
"""A read-only @property that is only evaluated once."""
def __init__(self, fget, doc=None):
self.fget = fget
self.__doc__ = doc or fget.__doc__
self.__name__ = fget.__name__
def __get__(self, obj, cls):
if obj is None:
return self
obj.__dict__[self.__name__] = result = self.fget(obj)
return result
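# Illustrative sketch (class and loader are hypothetical): the first
# access runs fget and stores the result in the instance __dict__,
# which shadows the descriptor on later accesses:
#
#     class Page(object):
#         @memoized_property
#         def source(self):
#             return load_source()   # expensive call, runs once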
class memoized_instancemethod(object):
"""Decorate a method memoize its return value.
Best applied to no-arg methods: memoization is not sensitive to
argument values, and will always return the same value even when
called with different arguments.
"""
def __init__(self, fget, doc=None):
self.fget = fget
self.__doc__ = doc or fget.__doc__
self.__name__ = fget.__name__
def __get__(self, obj, cls):
if obj is None:
return self
def oneshot(*args, **kw):
result = self.fget(obj, *args, **kw)
memo = lambda *a, **kw: result
memo.__name__ = self.__name__
memo.__doc__ = self.__doc__
obj.__dict__[self.__name__] = memo
return result
oneshot.__name__ = self.__name__
oneshot.__doc__ = self.__doc__
return oneshot
class SetLikeDict(dict):
"""a dictionary that has some setlike methods on it"""
def union(self, other):
"""produce a 'union' of this dict and another (at the key level).
values in the second dict take precedence over that of the first"""
x = SetLikeDict(**self)
x.update(other)
return x
class FastEncodingBuffer(object):
"""a very rudimentary buffer that is faster than StringIO,
but doesn't crash on unicode data like cStringIO."""
def __init__(self, encoding=None, errors='strict', as_unicode=False):
self.data = collections.deque()
self.encoding = encoding
if as_unicode:
self.delim = compat.u('')
else:
self.delim = ''
self.as_unicode = as_unicode
self.errors = errors
self.write = self.data.append
def truncate(self):
self.data = collections.deque()
self.write = self.data.append
def getvalue(self):
if self.encoding:
return self.delim.join(self.data).encode(self.encoding,
self.errors)
else:
return self.delim.join(self.data)
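# Illustrative sketch: fragments are joined and encoded only once, when
# getvalue() is called:
#
#     buf = FastEncodingBuffer(encoding='utf-8')
#     buf.write(u'hello ')
#     buf.write(u'world')
#     assert buf.getvalue() == u'hello world'.encode('utf-8')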
class LRUCache(dict):
"""A dictionary-like object that stores a limited number of items,
discarding the least recently used items periodically.
This is a rewrite of LRUCache from Myghty to use a periodic,
timestamp-based paradigm so that synchronization is not really needed;
the size management is inexact.
"""
class _Item(object):
def __init__(self, key, value):
self.key = key
self.value = value
self.timestamp = compat.time_func()
def __repr__(self):
return repr(self.value)
def __init__(self, capacity, threshold=.5):
self.capacity = capacity
self.threshold = threshold
def __getitem__(self, key):
item = dict.__getitem__(self, key)
item.timestamp = compat.time_func()
return item.value
def values(self):
return [i.value for i in dict.values(self)]
def setdefault(self, key, value):
if key in self:
return self[key]
else:
self[key] = value
return value
def __setitem__(self, key, value):
item = dict.get(self, key)
if item is None:
item = self._Item(key, value)
dict.__setitem__(self, key, item)
else:
item.value = value
self._manage_size()
def _manage_size(self):
while len(self) > self.capacity + self.capacity * self.threshold:
bytime = sorted(dict.values(self),
key=operator.attrgetter('timestamp'), reverse=True)
for item in bytime[self.capacity:]:
try:
del self[item.key]
except KeyError:
# if we couldn't find a key, most likely some other thread
# broke in on us. loop around and try again
break
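# Illustrative sketch: with capacity 2 and the default .5 threshold,
# pruning kicks in once more than 3 items are stored, keeping the most
# recently touched entries:
#
#     cache = LRUCache(2)
#     for k in 'abcd':
#         cache[k] = k.upper()
#     assert len(cache) == 2   # the least recently used keys were dropped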
# Regexp to match python magic encoding line
_PYTHON_MAGIC_COMMENT_re = re.compile(
r'[ \t\f]* \# .* coding[=:][ \t]*([-\w.]+)',
re.VERBOSE)
def parse_encoding(fp):
"""Deduce the encoding of a Python source file (binary mode) from magic
comment.
It does this in the same way as the `Python interpreter`__
.. __: http://docs.python.org/ref/encodings.html
The ``fp`` argument should be a seekable file object in binary mode.
"""
pos = fp.tell()
fp.seek(0)
try:
line1 = fp.readline()
has_bom = line1.startswith(codecs.BOM_UTF8)
if has_bom:
line1 = line1[len(codecs.BOM_UTF8):]
m = _PYTHON_MAGIC_COMMENT_re.match(line1.decode('ascii', 'ignore'))
if not m:
try:
import parser
parser.suite(line1.decode('ascii', 'ignore'))
except (ImportError, SyntaxError):
# Either it's a real syntax error, in which case the source
# is not valid python source, or line2 is a continuation of
# line1, in which case we don't want to scan line2 for a magic
# comment.
pass
else:
line2 = fp.readline()
m = _PYTHON_MAGIC_COMMENT_re.match(
line2.decode('ascii', 'ignore'))
if has_bom:
if m:
raise SyntaxError("python refuses to compile code with both a UTF8" \
" byte-order-mark and a magic encoding comment")
return 'utf_8'
elif m:
return m.group(1)
else:
return None
finally:
fp.seek(pos)
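# Illustrative sketch (path is hypothetical): the file position is
# restored afterwards, so the caller can go on to read the source:
#
#     fp = open('some_module.py', 'rb')
#     try:
#         enc = parse_encoding(fp)   # e.g. 'utf-8', or None if absent
#     finally:
#         fp.close()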
def sorted_dict_repr(d):
"""repr() a dictionary with the keys in order.
Used by the lexer unit test to compare parse trees based on strings.
"""
keys = list(d.keys())
keys.sort()
return "{" + ", ".join(["%r: %r" % (k, d[k]) for k in keys]) + "}"
def restore__ast(_ast):
"""Attempt to restore the required classes to the _ast module if it
appears to be missing them
"""
if hasattr(_ast, 'AST'):
return
_ast.PyCF_ONLY_AST = 2 << 9
m = compile("""\
def foo(): pass
class Bar(object): pass
if False: pass
baz = 'mako'
1 + 2 - 3 * 4 / 5
6 // 7 % 8 << 9 >> 10
11 & 12 ^ 13 | 14
15 and 16 or 17
-baz + (not +18) - ~17
baz and 'foo' or 'bar'
(mako is baz == baz) is not baz != mako
mako > baz < mako >= baz <= mako
mako in baz not in mako""", '<unknown>', 'exec', _ast.PyCF_ONLY_AST)
_ast.Module = type(m)
for cls in _ast.Module.__mro__:
if cls.__name__ == 'mod':
_ast.mod = cls
elif cls.__name__ == 'AST':
_ast.AST = cls
_ast.FunctionDef = type(m.body[0])
_ast.ClassDef = type(m.body[1])
_ast.If = type(m.body[2])
_ast.Name = type(m.body[3].targets[0])
_ast.Store = type(m.body[3].targets[0].ctx)
_ast.Str = type(m.body[3].value)
_ast.Sub = type(m.body[4].value.op)
_ast.Add = type(m.body[4].value.left.op)
_ast.Div = type(m.body[4].value.right.op)
_ast.Mult = type(m.body[4].value.right.left.op)
_ast.RShift = type(m.body[5].value.op)
_ast.LShift = type(m.body[5].value.left.op)
_ast.Mod = type(m.body[5].value.left.left.op)
_ast.FloorDiv = type(m.body[5].value.left.left.left.op)
_ast.BitOr = type(m.body[6].value.op)
_ast.BitXor = type(m.body[6].value.left.op)
_ast.BitAnd = type(m.body[6].value.left.left.op)
_ast.Or = type(m.body[7].value.op)
_ast.And = type(m.body[7].value.values[0].op)
_ast.Invert = type(m.body[8].value.right.op)
_ast.Not = type(m.body[8].value.left.right.op)
_ast.UAdd = type(m.body[8].value.left.right.operand.op)
_ast.USub = type(m.body[8].value.left.left.op)
_ast.Or = type(m.body[9].value.op)
_ast.And = type(m.body[9].value.values[0].op)
_ast.IsNot = type(m.body[10].value.ops[0])
_ast.NotEq = type(m.body[10].value.ops[1])
_ast.Is = type(m.body[10].value.left.ops[0])
_ast.Eq = type(m.body[10].value.left.ops[1])
_ast.Gt = type(m.body[11].value.ops[0])
_ast.Lt = type(m.body[11].value.ops[1])
_ast.GtE = type(m.body[11].value.ops[2])
_ast.LtE = type(m.body[11].value.ops[3])
_ast.In = type(m.body[12].value.ops[0])
_ast.NotIn = type(m.body[12].value.ops[1])
def read_file(path, mode='rb'):
fp = open(path, mode)
try:
data = fp.read()
return data
finally:
fp.close()
def read_python_file(path):
fp = open(path, "rb")
try:
encoding = parse_encoding(fp)
data = fp.read()
if encoding:
data = data.decode(encoding)
return data
finally:
fp.close()

View File

@ -0,0 +1,141 @@
<%
max_len = 0
for knob in knobs:
if len(knob[0]) > max_len: max_len = len(knob[0])
max_len += len('KNOB_ ')
if max_len % 4: max_len += 4 - (max_len % 4)
def space_knob(knob):
knob_len = len('KNOB_' + knob)
return ' '*(max_len - knob_len)
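    # max_len is the longest knob name plus the "KNOB_" prefix, rounded
    # up to a multiple of 4 so the generated #define table stays
    # column-aligned; space_knob() returns the pad string for one name.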
%>/******************************************************************************
*
* Copyright 2015
* Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
% if gen_header:
* @file ${filename}.h
% else:
* @file ${filename}.cpp
% endif
*
* @brief Dynamic Knobs for Core.
*
* ======================= AUTO GENERATED: DO NOT EDIT !!! ====================
*
******************************************************************************/
%if gen_header:
#pragma once
#include <string>
template <typename T>
struct Knob
{
const T& Value() const { return m_Value; }
const T& Value(const T& newValue) { m_Value = newValue; return Value(); }
protected:
Knob(const T& defaultValue) : m_Value(defaultValue) {}
private:
T m_Value;
};
#define DEFINE_KNOB(_name, _type, _default) \\
struct Knob_##_name : Knob<_type> \\
{ \\
Knob_##_name() : Knob<_type>(_default) { } \\
static const char* Name() { return "KNOB_" #_name; } \\
} _name;
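// Example expansion (illustrative; ENABLE_ASSERTS is a hypothetical knob):
// DEFINE_KNOB(ENABLE_ASSERTS, bool, true) declares
//
//     struct Knob_ENABLE_ASSERTS : Knob<bool>
//     {
//         Knob_ENABLE_ASSERTS() : Knob<bool>(true) { }
//         static const char* Name() { return "KNOB_ENABLE_ASSERTS"; }
//     } ENABLE_ASSERTS;
//
// whose value is then read through GET_KNOB / the KNOB_* defines below.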
#define GET_KNOB(_name) g_GlobalKnobs._name.Value()
#define SET_KNOB(_name, _newValue) g_GlobalKnobs._name.Value(_newValue)
struct GlobalKnobs
{
% for knob in knobs:
//-----------------------------------------------------------
// KNOB_${knob[0]}
//
% for line in knob[1]['desc']:
// ${line}
% endfor
DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}, ${knob[1]['default']});
% endfor
GlobalKnobs();
std::string ToString(const char* optPerLinePrefix="");
};
extern GlobalKnobs g_GlobalKnobs;
% for knob in knobs:
#define KNOB_${knob[0]}${space_knob(knob[0])}GET_KNOB(${knob[0]})
% endfor
% else:
% for inc in includes:
#include <${inc}>
% endfor
//========================================================
// Static Data Members
//========================================================
GlobalKnobs g_GlobalKnobs;
//========================================================
// Knob Initialization
//========================================================
GlobalKnobs::GlobalKnobs()
{
% for knob in knobs:
InitKnob(${knob[0]});
% endfor
}
//========================================================
// Knob Display (Convert to String)
//========================================================
std::string GlobalKnobs::ToString(const char* optPerLinePrefix)
{
std::basic_stringstream<char> str;
str << std::showbase << std::setprecision(1) << std::fixed;
if (optPerLinePrefix == nullptr) { optPerLinePrefix = ""; }
% for knob in knobs:
str << optPerLinePrefix << "KNOB_${knob[0]}:${space_knob(knob[0])}";
% if knob[1]['type'] == 'bool':
str << (KNOB_${knob[0]} ? "+\n" : "-\n");
% elif knob[1]['type'] != 'float':
str << std::hex << std::setw(11) << std::left << KNOB_${knob[0]};
str << std::dec << KNOB_${knob[0]} << "\n";
% else:
str << KNOB_${knob[0]} << "\n";
% endif
% endfor
str << std::ends;
return str.str();
}
% endif