gallivm: enhance special sse2 4x4f and 2x8f -> 1x16ub conversion
There's no good reason why it can't handle the 2x4f -> 1x8ub, 1x4f -> 1x4ub and 1x8f -> 1x8ub cases. There might be legitimate reasons why we don't have enough input vectors for a full destination vector, and using pack intrinsics should still be much better than using the generic conversion (it looks like convert_alpha from the blend code might hit this, though I suspect it could be avoided). v2: add another test vector format to lp_test_conv so this gets tested. Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
This commit is contained in:
parent
ce82523db9
commit
44993c1808
|
@ -530,24 +530,22 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
|
|||
dst_type->width == 8)
|
||||
{
|
||||
/* Special case 4x4f --> 1x16ub */
|
||||
if (src_type.length == 4 && util_cpu_caps.has_sse2)
|
||||
if (src_type.length == 4 &&
|
||||
util_cpu_caps.has_sse2)
|
||||
{
|
||||
assert((num_srcs % 4) == 0);
|
||||
|
||||
num_dsts = num_srcs / 4;
|
||||
dst_type->length = 16;
|
||||
num_dsts = (num_srcs + 3) / 4;
|
||||
dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;
|
||||
|
||||
lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
|
||||
return num_dsts;
|
||||
}
|
||||
|
||||
/* Special case 2x8f --> 1x16ub */
|
||||
if (src_type.length == 8 && util_cpu_caps.has_avx)
|
||||
if (src_type.length == 8 &&
|
||||
util_cpu_caps.has_avx)
|
||||
{
|
||||
assert((num_srcs % 2) == 0);
|
||||
|
||||
num_dsts = num_srcs / 2;
|
||||
dst_type->length = 16;
|
||||
num_dsts = (num_srcs + 1) / 2;
|
||||
dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8;
|
||||
|
||||
lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
|
||||
return num_dsts;
|
||||
|
@ -602,7 +600,7 @@ lp_build_conv(struct gallivm_state *gallivm,
|
|||
num_tmps = num_srcs;
|
||||
|
||||
|
||||
/* Special case 4x4f --> 1x16ub
|
||||
/* Special case 4x4f --> 1x16ub, 2x4f -> 1x8ub, 1x4f -> 1x4ub
|
||||
*/
|
||||
if (src_type.floating == 1 &&
|
||||
src_type.fixed == 0 &&
|
||||
|
@ -616,20 +614,23 @@ lp_build_conv(struct gallivm_state *gallivm,
|
|||
dst_type.sign == 0 &&
|
||||
dst_type.norm == 1 &&
|
||||
dst_type.width == 8 &&
|
||||
dst_type.length == 16 &&
|
||||
|
||||
4 * num_dsts == num_srcs &&
|
||||
((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
|
||||
(num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&
|
||||
|
||||
util_cpu_caps.has_sse2)
|
||||
{
|
||||
struct lp_build_context bld;
|
||||
struct lp_type int16_type = dst_type;
|
||||
struct lp_type int32_type = dst_type;
|
||||
struct lp_type int16_type, int32_type;
|
||||
struct lp_type dst_type_ext = dst_type;
|
||||
LLVMValueRef const_255f;
|
||||
unsigned i, j;
|
||||
|
||||
lp_build_context_init(&bld, gallivm, src_type);
|
||||
|
||||
dst_type_ext.length = 16;
|
||||
int16_type = int32_type = dst_type_ext;
|
||||
|
||||
int16_type.width *= 2;
|
||||
int16_type.length /= 2;
|
||||
int16_type.sign = 1;
|
||||
|
@ -643,21 +644,34 @@ lp_build_conv(struct gallivm_state *gallivm,
|
|||
for (i = 0; i < num_dsts; ++i, src += 4) {
|
||||
LLVMValueRef lo, hi;
|
||||
|
||||
for (j = 0; j < 4; ++j) {
|
||||
for (j = 0; j < dst_type.length / 4; ++j) {
|
||||
tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
|
||||
tmp[j] = lp_build_iround(&bld, tmp[j]);
|
||||
}
|
||||
|
||||
if (num_srcs == 1) {
|
||||
tmp[1] = tmp[0];
|
||||
}
|
||||
|
||||
/* relying on clamping behavior of sse2 intrinsics here */
|
||||
lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
|
||||
|
||||
if (num_srcs < 4) {
|
||||
hi = lo;
|
||||
}
|
||||
else {
|
||||
hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
|
||||
dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
|
||||
}
|
||||
dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
|
||||
}
|
||||
if (num_srcs < 4) {
|
||||
dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/* Special case 2x8f --> 1x16ub
|
||||
/* Special case 2x8f --> 1x16ub, 1x8f ->1x8ub
|
||||
*/
|
||||
else if (src_type.floating == 1 &&
|
||||
src_type.fixed == 0 &&
|
||||
|
@ -671,20 +685,23 @@ lp_build_conv(struct gallivm_state *gallivm,
|
|||
dst_type.sign == 0 &&
|
||||
dst_type.norm == 1 &&
|
||||
dst_type.width == 8 &&
|
||||
dst_type.length == 16 &&
|
||||
|
||||
2 * num_dsts == num_srcs &&
|
||||
((dst_type.length == 16 && 2 * num_dsts == num_srcs) ||
|
||||
(num_dsts == 1 && dst_type.length * num_srcs == 8)) &&
|
||||
|
||||
util_cpu_caps.has_avx) {
|
||||
|
||||
struct lp_build_context bld;
|
||||
struct lp_type int16_type = dst_type;
|
||||
struct lp_type int32_type = dst_type;
|
||||
struct lp_type int16_type, int32_type;
|
||||
struct lp_type dst_type_ext = dst_type;
|
||||
LLVMValueRef const_255f;
|
||||
unsigned i;
|
||||
|
||||
lp_build_context_init(&bld, gallivm, src_type);
|
||||
|
||||
dst_type_ext.length = 16;
|
||||
int16_type = int32_type = dst_type_ext;
|
||||
|
||||
int16_type.width *= 2;
|
||||
int16_type.length /= 2;
|
||||
int16_type.sign = 1;
|
||||
|
@ -699,21 +716,30 @@ lp_build_conv(struct gallivm_state *gallivm,
|
|||
LLVMValueRef lo, hi, a, b;
|
||||
|
||||
a = LLVMBuildFMul(builder, src[0], const_255f, "");
|
||||
b = LLVMBuildFMul(builder, src[1], const_255f, "");
|
||||
|
||||
a = lp_build_iround(&bld, a);
|
||||
b = lp_build_iround(&bld, b);
|
||||
|
||||
tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
|
||||
tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
|
||||
tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
|
||||
tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);
|
||||
|
||||
/* relying on clamping behavior of sse2 intrinsics here */
|
||||
lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
|
||||
hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
|
||||
dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
|
||||
|
||||
if (num_srcs == 1) {
|
||||
hi = lo;
|
||||
}
|
||||
else {
|
||||
b = LLVMBuildFMul(builder, src[1], const_255f, "");
|
||||
b = lp_build_iround(&bld, b);
|
||||
tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
|
||||
tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);
|
||||
hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
|
||||
|
||||
}
|
||||
dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
|
||||
}
|
||||
|
||||
if (num_srcs == 1) {
|
||||
dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
@ -377,6 +377,8 @@ const struct lp_type conv_types[] = {
|
|||
{ FALSE, FALSE, TRUE, FALSE, 8, 4 },
|
||||
{ FALSE, FALSE, FALSE, TRUE, 8, 4 },
|
||||
{ FALSE, FALSE, FALSE, FALSE, 8, 4 },
|
||||
|
||||
{ FALSE, FALSE, FALSE, TRUE, 8, 8 },
|
||||
};
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue